Blame - bitbake/lib/bs4/builder/__init__.py - stefanberger/openbmc

blob: 740f5f29cd72f82820b67eb19e9a2178129733e1 [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	from collections import defaultdict
				2	import itertools
				3	import sys
				4	from bs4.element import (
				5	CharsetMetaAttributeValue,
				6	ContentMetaAttributeValue,
				7	whitespace_re
				8	)
				9
# Names exported by ``from <this module> import *``.
# register_treebuilders_from() (defined further down in this file) appends
# to this list as it copies parser-specific builders into this module.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
# NOTE(review): these look like the strings a builder lists in its
# ``features`` attribute and that callers pass to
# TreeBuilderRegistry.lookup(); nothing in this module reads the constants
# themselves, so confirm against the concrete builders.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
				24
				25
				26	class TreeBuilderRegistry(object):
				27
				28	def __init__(self):
				29	self.builders_for_feature = defaultdict(list)
				30	self.builders = []
				31
				32	def register(self, treebuilder_class):
				33	"""Register a treebuilder based on its advertised features."""
				34	for feature in treebuilder_class.features:
				35	self.builders_for_feature[feature].insert(0, treebuilder_class)
				36	self.builders.insert(0, treebuilder_class)
				37
				38	def lookup(self, *features):
				39	if len(self.builders) == 0:
				40	# There are no builders at all.
				41	return None
				42
				43	if len(features) == 0:
				44	# They didn't ask for any features. Give them the most
				45	# recently registered builder.
				46	return self.builders[0]
				47
				48	# Go down the list of features in order, and eliminate any builders
				49	# that don't match every feature.
				50	features = list(features)
				51	features.reverse()
				52	candidates = None
				53	candidate_set = None
				54	while len(features) > 0:
				55	feature = features.pop()
				56	we_have_the_feature = self.builders_for_feature.get(feature, [])
				57	if len(we_have_the_feature) > 0:
				58	if candidates is None:
				59	candidates = we_have_the_feature
				60	candidate_set = set(candidates)
				61	else:
				62	# Eliminate any candidates that don't have this feature.
				63	candidate_set = candidate_set.intersection(
				64	set(we_have_the_feature))
				65
				66	# The only valid candidates are the ones in candidate_set.
				67	# Go through the original list of candidates and pick the first one
				68	# that's in candidate_set.
				69	if candidate_set is None:
				70	return None
				71	for candidate in candidates:
				72	if candidate in candidate_set:
				73	return candidate
				74	return None
				75
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
# This is the single module-level registry instance; it is the one
# register_treebuilders_from() adds builder classes to.
builder_registry = TreeBuilderRegistry()
				79
				80	class TreeBuilder(object):
				81	"""Turn a document into a Beautiful Soup object tree."""
				82
				83	features = []
				84
				85	is_xml = False
				86	preserve_whitespace_tags = set()
				87	empty_element_tags = None # A tag will be considered an empty-element
				88	# tag when and only when it has no contents.
				89
				90	# A value for these tag/attribute combinations is a space- or
				91	# comma-separated list of CDATA, rather than a single CDATA.
				92	cdata_list_attributes = {}
				93
				94
				95	def __init__(self):
				96	self.soup = None
				97
				98	def reset(self):
				99	pass
				100
				101	def can_be_empty_element(self, tag_name):
				102	"""Might a tag with this name be an empty-element tag?
				103
				104	The final markup may or may not actually present this tag as
				105	self-closing.
				106
				107	For instance: an HTMLBuilder does not consider a <p> tag to be
				108	an empty-element tag (it's not in
				109	HTMLBuilder.empty_element_tags). This means an empty <p> tag
				110	will be presented as "<p></p>", not "<p />".
				111
				112	The default implementation has no opinion about which tags are
				113	empty-element tags, so a tag will be presented as an
				114	empty-element tag if and only if it has no contents.
				115	"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
				116	be left alone.
				117	"""
				118	if self.empty_element_tags is None:
				119	return True
				120	return tag_name in self.empty_element_tags
				121
				122	def feed(self, markup):
				123	raise NotImplementedError()
				124
				125	def prepare_markup(self, markup, user_specified_encoding=None,
				126	document_declared_encoding=None):
				127	return markup, None, None, False
				128
				129	def test_fragment_to_document(self, fragment):
				130	"""Wrap an HTML fragment to make it look like a document.
				131
				132	Different parsers do this differently. For instance, lxml
				133	introduces an empty <head> tag, and html5lib
				134	doesn't. Abstracting this away lets us write simple tests
				135	which run HTML fragments through the parser and compare the
				136	results against other HTML fragments.
				137
				138	This method should not be used outside of tests.
				139	"""
				140	return fragment
				141
				142	def set_up_substitutions(self, tag):
				143	return False
				144
				145	def _replace_cdata_list_attribute_values(self, tag_name, attrs):
				146	"""Replaces class="foo bar" with class=["foo", "bar"]
				147
				148	Modifies its input in place.
				149	"""
				150	if not attrs:
				151	return attrs
				152	if self.cdata_list_attributes:
				153	universal = self.cdata_list_attributes.get('*', [])
				154	tag_specific = self.cdata_list_attributes.get(
				155	tag_name.lower(), None)
				156	for attr in attrs.keys():
				157	if attr in universal or (tag_specific and attr in tag_specific):
				158	# We have a "class"-type attribute whose string
				159	# value is a whitespace-separated list of
				160	# values. Split it into a list.
				161	value = attrs[attr]
				162	if isinstance(value, basestring):
				163	values = whitespace_re.split(value)
				164	else:
				165	# html5lib sometimes calls setAttributes twice
				166	# for the same tag when rearranging the parse
				167	# tree. On the second call the attribute value
				168	# here is already a list. If this happens,
				169	# leave the value alone rather than trying to
				170	# split it again.
				171	values = value
				172	attrs[attr] = values
				173	return attrs
				174
				175	class SAXTreeBuilder(TreeBuilder):
				176	"""A Beautiful Soup treebuilder that listens for SAX events."""
				177
				178	def feed(self, markup):
				179	raise NotImplementedError()
				180
				181	def close(self):
				182	pass
				183
				184	def startElement(self, name, attrs):
				185	attrs = dict((key[1], value) for key, value in list(attrs.items()))
				186	#print "Start %s, %r" % (name, attrs)
				187	self.soup.handle_starttag(name, attrs)
				188
				189	def endElement(self, name):
				190	#print "End %s" % name
				191	self.soup.handle_endtag(name)
				192
				193	def startElementNS(self, nsTuple, nodeName, attrs):
				194	# Throw away (ns, nodeName) for now.
				195	self.startElement(nodeName, attrs)
				196
				197	def endElementNS(self, nsTuple, nodeName):
				198	# Throw away (ns, nodeName) for now.
				199	self.endElement(nodeName)
				200	#handler.endElementNS((ns, node.nodeName), node.nodeName)
				201
				202	def startPrefixMapping(self, prefix, nodeValue):
				203	# Ignore the prefix for now.
				204	pass
				205
				206	def endPrefixMapping(self, prefix):
				207	# Ignore the prefix for now.
				208	# handler.endPrefixMapping(prefix)
				209	pass
				210
				211	def characters(self, content):
				212	self.soup.handle_data(content)
				213
				214	def startDocument(self):
				215	pass
				216
				217	def endDocument(self):
				218	pass
				219
				220
				221	class HTMLTreeBuilder(TreeBuilder):
				222	"""This TreeBuilder knows facts about HTML.
				223
				224	Such as which tags are empty-element tags.
				225	"""
				226
				227	preserve_whitespace_tags = set(['pre', 'textarea'])
				228	empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
				229	'spacer', 'link', 'frame', 'base'])
				230
				231	# The HTML standard defines these attributes as containing a
				232	# space-separated list of values, not a single value. That is,
				233	# class="foo bar" means that the 'class' attribute has two values,
				234	# 'foo' and 'bar', not the single value 'foo bar'. When we
				235	# encounter one of these attributes, we will parse its value into
				236	# a list of values if possible. Upon output, the list will be
				237	# converted back into a string.
				238	cdata_list_attributes = {
				239	"*" : ['class', 'accesskey', 'dropzone'],
				240	"a" : ['rel', 'rev'],
				241	"link" : ['rel', 'rev'],
				242	"td" : ["headers"],
				243	"th" : ["headers"],
				244	"td" : ["headers"],
				245	"form" : ["accept-charset"],
				246	"object" : ["archive"],
				247
				248	# These are HTML5 specific, as are .accesskey and .dropzone above.
				249	"area" : ["rel"],
				250	"icon" : ["sizes"],
				251	"iframe" : ["sandbox"],
				252	"output" : ["for"],
				253	}
				254
				255	def set_up_substitutions(self, tag):
				256	# We are only interested in <meta> tags
				257	if tag.name != 'meta':
				258	return False
				259
				260	http_equiv = tag.get('http-equiv')
				261	content = tag.get('content')
				262	charset = tag.get('charset')
				263
				264	# We are interested in <meta> tags that say what encoding the
				265	# document was originally in. This means HTML 5-style <meta>
				266	# tags that provide the "charset" attribute. It also means
				267	# HTML 4-style <meta> tags that provide the "content"
				268	# attribute and have "http-equiv" set to "content-type".
				269	#
				270	# In both cases we will replace the value of the appropriate
				271	# attribute with a standin object that can take on any
				272	# encoding.
				273	meta_encoding = None
				274	if charset is not None:
				275	# HTML 5 style:
				276	# <meta charset="utf8">
				277	meta_encoding = charset
				278	tag['charset'] = CharsetMetaAttributeValue(charset)
				279
				280	elif (content is not None and http_equiv is not None
				281	and http_equiv.lower() == 'content-type'):
				282	# HTML 4 style:
				283	# <meta http-equiv="content-type" content="text/html; charset=utf8">
				284	tag['content'] = ContentMetaAttributeValue(content)
				285
				286	return (meta_encoding is not None)
				287
				288	def register_treebuilders_from(module):
				289	"""Copy TreeBuilders from the given module into this module."""
				290	# I'm fairly sure this is not the best way to do this.
				291	this_module = sys.modules['bs4.builder']
				292	for name in module.__all__:
				293	obj = getattr(module, name)
				294
				295	if issubclass(obj, TreeBuilder):
				296	setattr(this_module, name, obj)
				297	this_module.__all__.append(name)
				298	# Register the builder while we're at it.
				299	this_module.builder_registry.register(obj)
				300
				301	class ParserRejectedMarkup(Exception):
				302	pass
				303
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence.  In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# The registry puts the newest registration first, so the module
# registered last below (_lxml, when importable) is the one preferred by
# builder_registry.lookup().
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass