Blame - import-layers/yocto-poky/bitbake/lib/bs4/element.py - stefanberger/openbmc

blob: da9afdf48ec0b05cf8e970cd906425ce80b343cb [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	import collections
				2	import re
				3	import sys
				4	import warnings
				5	from bs4.dammit import EntitySubstitution
				6
				7	DEFAULT_OUTPUT_ENCODING = "utf-8"
				8	PY3K = (sys.version_info[0] > 2)
				9
				10	whitespace_re = re.compile("\s+")
				11
				12	def _alias(attr):
				13	"""Alias one attribute name to another for backward compatibility"""
				14	@property
				15	def alias(self):
				16	return getattr(self, attr)
				17
				18	@alias.setter
				19	def alias(self):
				20	return setattr(self, attr)
				21	return alias
				22
				23
				24	class NamespacedAttribute(unicode):
				25
				26	def __new__(cls, prefix, name, namespace=None):
				27	if name is None:
				28	obj = unicode.__new__(cls, prefix)
				29	elif prefix is None:
				30	# Not really namespaced.
				31	obj = unicode.__new__(cls, name)
				32	else:
				33	obj = unicode.__new__(cls, prefix + ":" + name)
				34	obj.prefix = prefix
				35	obj.name = name
				36	obj.namespace = namespace
				37	return obj
				38
				39	class AttributeValueWithCharsetSubstitution(unicode):
				40	"""A stand-in object for a character encoding specified in HTML."""
				41
				42	class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
				43	"""A generic stand-in for the value of a meta tag's 'charset' attribute.
				44
				45	When Beautiful Soup parses the markup '<meta charset="utf8">', the
				46	value of the 'charset' attribute will be one of these objects.
				47	"""
				48
				49	def __new__(cls, original_value):
				50	obj = unicode.__new__(cls, original_value)
				51	obj.original_value = original_value
				52	return obj
				53
				54	def encode(self, encoding):
				55	return encoding
				56
				57
				58	class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
				59	"""A generic stand-in for the value of a meta tag's 'content' attribute.
				60
				61	When Beautiful Soup parses the markup:
				62	<meta http-equiv="content-type" content="text/html; charset=utf8">
				63
				64	The value of the 'content' attribute will be one of these objects.
				65	"""
				66
				67	CHARSET_RE = re.compile("((^\|;)\scharset=)([^;])", re.M)
				68
				69	def __new__(cls, original_value):
				70	match = cls.CHARSET_RE.search(original_value)
				71	if match is None:
				72	# No substitution necessary.
				73	return unicode.__new__(unicode, original_value)
				74
				75	obj = unicode.__new__(cls, original_value)
				76	obj.original_value = original_value
				77	return obj
				78
				79	def encode(self, encoding):
				80	def rewrite(match):
				81	return match.group(1) + encoding
				82	return self.CHARSET_RE.sub(rewrite, self.original_value)
				83
				84	class HTMLAwareEntitySubstitution(EntitySubstitution):
				85
				86	"""Entity substitution rules that are aware of some HTML quirks.
				87
				88	Specifically, the contents of <script> and <style> tags should not
				89	undergo entity substitution.
				90
				91	Incoming NavigableString objects are checked to see if they're the
				92	direct children of a <script> or <style> tag.
				93	"""
				94
				95	cdata_containing_tags = set(["script", "style"])
				96
				97	preformatted_tags = set(["pre"])
				98
				99	@classmethod
				100	def _substitute_if_appropriate(cls, ns, f):
				101	if (isinstance(ns, NavigableString)
				102	and ns.parent is not None
				103	and ns.parent.name in cls.cdata_containing_tags):
				104	# Do nothing.
				105	return ns
				106	# Substitute.
				107	return f(ns)
				108
				109	@classmethod
				110	def substitute_html(cls, ns):
				111	return cls._substitute_if_appropriate(
				112	ns, EntitySubstitution.substitute_html)
				113
				114	@classmethod
				115	def substitute_xml(cls, ns):
				116	return cls._substitute_if_appropriate(
				117	ns, EntitySubstitution.substitute_xml)
				118
				119	class PageElement(object):
				120	"""Contains the navigational information for some part of the page
				121	(either a tag or a piece of text)"""
				122
				123	# There are five possible values for the "formatter" argument passed in
				124	# to methods like encode() and prettify():
				125	#
				126	# "html" - All Unicode characters with corresponding HTML entities
				127	# are converted to those entities on output.
				128	# "minimal" - Bare ampersands and angle brackets are converted to
				129	# XML entities: & < >
				130	# None - The null formatter. Unicode characters are never
				131	# converted to entities. This is not recommended, but it's
				132	# faster than "minimal".
				133	# A function - This function will be called on every string that
				134	# needs to undergo entity substitution.
				135	#
				136
				137	# In an HTML document, the default "html" and "minimal" functions
				138	# will leave the contents of <script> and <style> tags alone. For
				139	# an XML document, all tags will be given the same treatment.
				140
				141	HTML_FORMATTERS = {
				142	"html" : HTMLAwareEntitySubstitution.substitute_html,
				143	"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
				144	None : None
				145	}
				146
				147	XML_FORMATTERS = {
				148	"html" : EntitySubstitution.substitute_html,
				149	"minimal" : EntitySubstitution.substitute_xml,
				150	None : None
				151	}
				152
				153	def format_string(self, s, formatter='minimal'):
				154	"""Format the given string using the given formatter."""
				155	if not callable(formatter):
				156	formatter = self._formatter_for_name(formatter)
				157	if formatter is None:
				158	output = s
				159	else:
				160	output = formatter(s)
				161	return output
				162
				163	@property
				164	def _is_xml(self):
				165	"""Is this element part of an XML tree or an HTML tree?
				166
				167	This is used when mapping a formatter name ("minimal") to an
				168	appropriate function (one that performs entity-substitution on
				169	the contents of <script> and <style> tags, or not). It's
				170	inefficient, but it should be called very rarely.
				171	"""
				172	if self.parent is None:
				173	# This is the top-level object. It should have .is_xml set
				174	# from tree creation. If not, take a guess--BS is usually
				175	# used on HTML markup.
				176	return getattr(self, 'is_xml', False)
				177	return self.parent._is_xml
				178
				179	def _formatter_for_name(self, name):
				180	"Look up a formatter function based on its name and the tree."
				181	if self._is_xml:
				182	return self.XML_FORMATTERS.get(
				183	name, EntitySubstitution.substitute_xml)
				184	else:
				185	return self.HTML_FORMATTERS.get(
				186	name, HTMLAwareEntitySubstitution.substitute_xml)
				187
				188	def setup(self, parent=None, previous_element=None):
				189	"""Sets up the initial relations between this element and
				190	other elements."""
				191	self.parent = parent
				192	self.previous_element = previous_element
				193	if previous_element is not None:
				194	self.previous_element.next_element = self
				195	self.next_element = None
				196	self.previous_sibling = None
				197	self.next_sibling = None
				198	if self.parent is not None and self.parent.contents:
				199	self.previous_sibling = self.parent.contents[-1]
				200	self.previous_sibling.next_sibling = self
				201
				202	nextSibling = _alias("next_sibling") # BS3
				203	previousSibling = _alias("previous_sibling") # BS3
				204
				205	def replace_with(self, replace_with):
				206	if replace_with is self:
				207	return
				208	if replace_with is self.parent:
				209	raise ValueError("Cannot replace a Tag with its parent.")
				210	old_parent = self.parent
				211	my_index = self.parent.index(self)
				212	self.extract()
				213	old_parent.insert(my_index, replace_with)
				214	return self
				215	replaceWith = replace_with # BS3
				216
				217	def unwrap(self):
				218	my_parent = self.parent
				219	my_index = self.parent.index(self)
				220	self.extract()
				221	for child in reversed(self.contents[:]):
				222	my_parent.insert(my_index, child)
				223	return self
				224	replace_with_children = unwrap
				225	replaceWithChildren = unwrap # BS3
				226
				227	def wrap(self, wrap_inside):
				228	me = self.replace_with(wrap_inside)
				229	wrap_inside.append(me)
				230	return wrap_inside
				231
				232	def extract(self):
				233	"""Destructively rips this element out of the tree."""
				234	if self.parent is not None:
				235	del self.parent.contents[self.parent.index(self)]
				236
				237	#Find the two elements that would be next to each other if
				238	#this element (and any children) hadn't been parsed. Connect
				239	#the two.
				240	last_child = self._last_descendant()
				241	next_element = last_child.next_element
				242
				243	if self.previous_element is not None:
				244	self.previous_element.next_element = next_element
				245	if next_element is not None:
				246	next_element.previous_element = self.previous_element
				247	self.previous_element = None
				248	last_child.next_element = None
				249
				250	self.parent = None
				251	if self.previous_sibling is not None:
				252	self.previous_sibling.next_sibling = self.next_sibling
				253	if self.next_sibling is not None:
				254	self.next_sibling.previous_sibling = self.previous_sibling
				255	self.previous_sibling = self.next_sibling = None
				256	return self
				257
				258	def _last_descendant(self, is_initialized=True, accept_self=True):
				259	"Finds the last element beneath this object to be parsed."
				260	if is_initialized and self.next_sibling:
				261	last_child = self.next_sibling.previous_element
				262	else:
				263	last_child = self
				264	while isinstance(last_child, Tag) and last_child.contents:
				265	last_child = last_child.contents[-1]
				266	if not accept_self and last_child == self:
				267	last_child = None
				268	return last_child
				269	# BS3: Not part of the API!
				270	_lastRecursiveChild = _last_descendant
				271
				272	def insert(self, position, new_child):
				273	if new_child is self:
				274	raise ValueError("Cannot insert a tag into itself.")
				275	if (isinstance(new_child, basestring)
				276	and not isinstance(new_child, NavigableString)):
				277	new_child = NavigableString(new_child)
				278
				279	position = min(position, len(self.contents))
				280	if hasattr(new_child, 'parent') and new_child.parent is not None:
				281	# We're 'inserting' an element that's already one
				282	# of this object's children.
				283	if new_child.parent is self:
				284	current_index = self.index(new_child)
				285	if current_index < position:
				286	# We're moving this element further down the list
				287	# of this object's children. That means that when
				288	# we extract this element, our target index will
				289	# jump down one.
				290	position -= 1
				291	new_child.extract()
				292
				293	new_child.parent = self
				294	previous_child = None
				295	if position == 0:
				296	new_child.previous_sibling = None
				297	new_child.previous_element = self
				298	else:
				299	previous_child = self.contents[position - 1]
				300	new_child.previous_sibling = previous_child
				301	new_child.previous_sibling.next_sibling = new_child
				302	new_child.previous_element = previous_child._last_descendant(False)
				303	if new_child.previous_element is not None:
				304	new_child.previous_element.next_element = new_child
				305
				306	new_childs_last_element = new_child._last_descendant(False)
				307
				308	if position >= len(self.contents):
				309	new_child.next_sibling = None
				310
				311	parent = self
				312	parents_next_sibling = None
				313	while parents_next_sibling is None and parent is not None:
				314	parents_next_sibling = parent.next_sibling
				315	parent = parent.parent
				316	if parents_next_sibling is not None:
				317	# We found the element that comes next in the document.
				318	break
				319	if parents_next_sibling is not None:
				320	new_childs_last_element.next_element = parents_next_sibling
				321	else:
				322	# The last element of this tag is the last element in
				323	# the document.
				324	new_childs_last_element.next_element = None
				325	else:
				326	next_child = self.contents[position]
				327	new_child.next_sibling = next_child
				328	if new_child.next_sibling is not None:
				329	new_child.next_sibling.previous_sibling = new_child
				330	new_childs_last_element.next_element = next_child
				331
				332	if new_childs_last_element.next_element is not None:
				333	new_childs_last_element.next_element.previous_element = new_childs_last_element
				334	self.contents.insert(position, new_child)
				335
				336	def append(self, tag):
				337	"""Appends the given tag to the contents of this tag."""
				338	self.insert(len(self.contents), tag)
				339
				340	def insert_before(self, predecessor):
				341	"""Makes the given element the immediate predecessor of this one.
				342
				343	The two elements will have the same parent, and the given element
				344	will be immediately before this one.
				345	"""
				346	if self is predecessor:
				347	raise ValueError("Can't insert an element before itself.")
				348	parent = self.parent
				349	if parent is None:
				350	raise ValueError(
				351	"Element has no parent, so 'before' has no meaning.")
				352	# Extract first so that the index won't be screwed up if they
				353	# are siblings.
				354	if isinstance(predecessor, PageElement):
				355	predecessor.extract()
				356	index = parent.index(self)
				357	parent.insert(index, predecessor)
				358
				359	def insert_after(self, successor):
				360	"""Makes the given element the immediate successor of this one.
				361
				362	The two elements will have the same parent, and the given element
				363	will be immediately after this one.
				364	"""
				365	if self is successor:
				366	raise ValueError("Can't insert an element after itself.")
				367	parent = self.parent
				368	if parent is None:
				369	raise ValueError(
				370	"Element has no parent, so 'after' has no meaning.")
				371	# Extract first so that the index won't be screwed up if they
				372	# are siblings.
				373	if isinstance(successor, PageElement):
				374	successor.extract()
				375	index = parent.index(self)
				376	parent.insert(index+1, successor)
				377
				378	def find_next(self, name=None, attrs={}, text=None, **kwargs):
				379	"""Returns the first item that matches the given criteria and
				380	appears after this Tag in the document."""
				381	return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
				382	findNext = find_next # BS3
				383
				384	def find_all_next(self, name=None, attrs={}, text=None, limit=None,
				385	**kwargs):
				386	"""Returns all items that match the given criteria and appear
				387	after this Tag in the document."""
				388	return self._find_all(name, attrs, text, limit, self.next_elements,
				389	**kwargs)
				390	findAllNext = find_all_next # BS3
				391
				392	def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
				393	"""Returns the closest sibling to this Tag that matches the
				394	given criteria and appears after this Tag in the document."""
				395	return self._find_one(self.find_next_siblings, name, attrs, text,
				396	**kwargs)
				397	findNextSibling = find_next_sibling # BS3
				398
				399	def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
				400	**kwargs):
				401	"""Returns the siblings of this Tag that match the given
				402	criteria and appear after this Tag in the document."""
				403	return self._find_all(name, attrs, text, limit,
				404	self.next_siblings, **kwargs)
				405	findNextSiblings = find_next_siblings # BS3
				406	fetchNextSiblings = find_next_siblings # BS2
				407
				408	def find_previous(self, name=None, attrs={}, text=None, **kwargs):
				409	"""Returns the first item that matches the given criteria and
				410	appears before this Tag in the document."""
				411	return self._find_one(
				412	self.find_all_previous, name, attrs, text, **kwargs)
				413	findPrevious = find_previous # BS3
				414
				415	def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
				416	**kwargs):
				417	"""Returns all items that match the given criteria and appear
				418	before this Tag in the document."""
				419	return self._find_all(name, attrs, text, limit, self.previous_elements,
				420	**kwargs)
				421	findAllPrevious = find_all_previous # BS3
				422	fetchPrevious = find_all_previous # BS2
				423
				424	def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
				425	"""Returns the closest sibling to this Tag that matches the
				426	given criteria and appears before this Tag in the document."""
				427	return self._find_one(self.find_previous_siblings, name, attrs, text,
				428	**kwargs)
				429	findPreviousSibling = find_previous_sibling # BS3
				430
				431	def find_previous_siblings(self, name=None, attrs={}, text=None,
				432	limit=None, **kwargs):
				433	"""Returns the siblings of this Tag that match the given
				434	criteria and appear before this Tag in the document."""
				435	return self._find_all(name, attrs, text, limit,
				436	self.previous_siblings, **kwargs)
				437	findPreviousSiblings = find_previous_siblings # BS3
				438	fetchPreviousSiblings = find_previous_siblings # BS2
				439
				440	def find_parent(self, name=None, attrs={}, **kwargs):
				441	"""Returns the closest parent of this Tag that matches the given
				442	criteria."""
				443	# NOTE: We can't use _find_one because findParents takes a different
				444	# set of arguments.
				445	r = None
				446	l = self.find_parents(name, attrs, 1, **kwargs)
				447	if l:
				448	r = l[0]
				449	return r
				450	findParent = find_parent # BS3
				451
				452	def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
				453	"""Returns the parents of this Tag that match the given
				454	criteria."""
				455
				456	return self._find_all(name, attrs, None, limit, self.parents,
				457	**kwargs)
				458	findParents = find_parents # BS3
				459	fetchParents = find_parents # BS2
				460
				461	@property
				462	def next(self):
				463	return self.next_element
				464
				465	@property
				466	def previous(self):
				467	return self.previous_element
				468
				469	#These methods do the real heavy lifting.
				470
				471	def _find_one(self, method, name, attrs, text, **kwargs):
				472	r = None
				473	l = method(name, attrs, text, 1, **kwargs)
				474	if l:
				475	r = l[0]
				476	return r
				477
				478	def _find_all(self, name, attrs, text, limit, generator, **kwargs):
				479	"Iterates over a generator looking for things that match."
				480
				481	if isinstance(name, SoupStrainer):
				482	strainer = name
				483	else:
				484	strainer = SoupStrainer(name, attrs, text, **kwargs)
				485
				486	if text is None and not limit and not attrs and not kwargs:
				487	if name is True or name is None:
				488	# Optimization to find all tags.
				489	result = (element for element in generator
				490	if isinstance(element, Tag))
				491	return ResultSet(strainer, result)
				492	elif isinstance(name, basestring):
				493	# Optimization to find all tags with a given name.
				494	result = (element for element in generator
				495	if isinstance(element, Tag)
				496	and element.name == name)
				497	return ResultSet(strainer, result)
				498	results = ResultSet(strainer)
				499	while True:
				500	try:
				501	i = next(generator)
				502	except StopIteration:
				503	break
				504	if i:
				505	found = strainer.search(i)
				506	if found:
				507	results.append(found)
				508	if limit and len(results) >= limit:
				509	break
				510	return results
				511
				512	#These generators can be used to navigate starting from both
				513	#NavigableStrings and Tags.
				514	@property
				515	def next_elements(self):
				516	i = self.next_element
				517	while i is not None:
				518	yield i
				519	i = i.next_element
				520
				521	@property
				522	def next_siblings(self):
				523	i = self.next_sibling
				524	while i is not None:
				525	yield i
				526	i = i.next_sibling
				527
				528	@property
				529	def previous_elements(self):
				530	i = self.previous_element
				531	while i is not None:
				532	yield i
				533	i = i.previous_element
				534
				535	@property
				536	def previous_siblings(self):
				537	i = self.previous_sibling
				538	while i is not None:
				539	yield i
				540	i = i.previous_sibling
				541
				542	@property
				543	def parents(self):
				544	i = self.parent
				545	while i is not None:
				546	yield i
				547	i = i.parent
				548
				549	# Methods for supporting CSS selectors.
				550
				551	tag_name_re = re.compile('^[a-z0-9]+$')
				552
				553	# /^(\w+)\[(\w+)([=~\\|\^\$\]?)=?"?([^\]"])"?\]$/
				554	# \---/ \---/\-------------/ \-------/
				555	# \| \| \| \|
				556	# \| \| \| The value
				557	# \| \| ~,\|,^,$,* or =
				558	# \| Attribute
				559	# Tag
				560	attribselect_re = re.compile(
				561	r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\\|\^\$\*]?)' +
				562	r'=?"?(?P<value>[^\]"]*)"?\]$'
				563	)
				564
				565	def _attr_value_as_string(self, value, default=None):
				566	"""Force an attribute value into a string representation.
				567
				568	A multi-valued attribute will be converted into a
				569	space-separated stirng.
				570	"""
				571	value = self.get(value, default)
				572	if isinstance(value, list) or isinstance(value, tuple):
				573	value =" ".join(value)
				574	return value
				575
				576	def _tag_name_matches_and(self, function, tag_name):
				577	if not tag_name:
				578	return function
				579	else:
				580	def _match(tag):
				581	return tag.name == tag_name and function(tag)
				582	return _match
				583
				584	def _attribute_checker(self, operator, attribute, value=''):
				585	"""Create a function that performs a CSS selector operation.
				586
				587	Takes an operator, attribute and optional value. Returns a
				588	function that will return True for elements that match that
				589	combination.
				590	"""
				591	if operator == '=':
				592	# string representation of `attribute` is equal to `value`
				593	return lambda el: el._attr_value_as_string(attribute) == value
				594	elif operator == '~':
				595	# space-separated list representation of `attribute`
				596	# contains `value`
				597	def _includes_value(element):
				598	attribute_value = element.get(attribute, [])
				599	if not isinstance(attribute_value, list):
				600	attribute_value = attribute_value.split()
				601	return value in attribute_value
				602	return _includes_value
				603	elif operator == '^':
				604	# string representation of `attribute` starts with `value`
				605	return lambda el: el._attr_value_as_string(
				606	attribute, '').startswith(value)
				607	elif operator == '$':
				608	# string represenation of `attribute` ends with `value`
				609	return lambda el: el._attr_value_as_string(
				610	attribute, '').endswith(value)
				611	elif operator == '*':
				612	# string representation of `attribute` contains `value`
				613	return lambda el: value in el._attr_value_as_string(attribute, '')
				614	elif operator == '\|':
				615	# string representation of `attribute` is either exactly
				616	# `value` or starts with `value` and then a dash.
				617	def _is_or_starts_with_dash(element):
				618	attribute_value = element._attr_value_as_string(attribute, '')
				619	return (attribute_value == value or attribute_value.startswith(
				620	value + '-'))
				621	return _is_or_starts_with_dash
				622	else:
				623	return lambda el: el.has_attr(attribute)
				624
				625	# Old non-property versions of the generators, for backwards
				626	# compatibility with BS3.
				627	def nextGenerator(self):
				628	return self.next_elements
				629
				630	def nextSiblingGenerator(self):
				631	return self.next_siblings
				632
				633	def previousGenerator(self):
				634	return self.previous_elements
				635
				636	def previousSiblingGenerator(self):
				637	return self.previous_siblings
				638
				639	def parentGenerator(self):
				640	return self.parents
				641
				642
				643	class NavigableString(unicode, PageElement):
				644
				645	PREFIX = ''
				646	SUFFIX = ''
				647
				648	def __new__(cls, value):
				649	"""Create a new NavigableString.
				650
				651	When unpickling a NavigableString, this method is called with
				652	the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
				653	passed in to the superclass's __new__ or the superclass won't know
				654	how to handle non-ASCII characters.
				655	"""
				656	if isinstance(value, unicode):
				657	return unicode.__new__(cls, value)
				658	return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
				659
				660	def __copy__(self):
				661	return self
				662
				663	def __getnewargs__(self):
				664	return (unicode(self),)
				665
				666	def __getattr__(self, attr):
				667	"""text.string gives you text. This is for backwards
				668	compatibility for NavigableString, but for CData it lets you
				669	get the string without the CData wrapper."""
				670	if attr == 'string':
				671	return self
				672	else:
				673	raise AttributeError(
				674	"'%s' object has no attribute '%s'" % (
				675	self.__class__.__name__, attr))
				676
				677	def output_ready(self, formatter="minimal"):
				678	output = self.format_string(self, formatter)
				679	return self.PREFIX + output + self.SUFFIX
				680
				681	@property
				682	def name(self):
				683	return None
				684
				685	@name.setter
				686	def name(self, name):
				687	raise AttributeError("A NavigableString cannot be given a name.")
				688
				689	class PreformattedString(NavigableString):
				690	"""A NavigableString not subject to the normal formatting rules.
				691
				692	The string will be passed into the formatter (to trigger side effects),
				693	but the return value will be ignored.
				694	"""
				695
				696	def output_ready(self, formatter="minimal"):
				697	"""CData strings are passed into the formatter.
				698	But the return value is ignored."""
				699	self.format_string(self, formatter)
				700	return self.PREFIX + self + self.SUFFIX
				701
				702	class CData(PreformattedString):
				703
				704	PREFIX = u'<![CDATA['
				705	SUFFIX = u']]>'
				706
				707	class ProcessingInstruction(PreformattedString):
				708
				709	PREFIX = u'<?'
				710	SUFFIX = u'?>'
				711
				712	class Comment(PreformattedString):
				713
				714	PREFIX = u'<!--'
				715	SUFFIX = u'-->'
				716
				717
				718	class Declaration(PreformattedString):
				719	PREFIX = u'<!'
				720	SUFFIX = u'!>'
				721
				722
				723	class Doctype(PreformattedString):
				724
				725	@classmethod
				726	def for_name_and_ids(cls, name, pub_id, system_id):
				727	value = name or ''
				728	if pub_id is not None:
				729	value += ' PUBLIC "%s"' % pub_id
				730	if system_id is not None:
				731	value += ' "%s"' % system_id
				732	elif system_id is not None:
				733	value += ' SYSTEM "%s"' % system_id
				734
				735	return Doctype(value)
				736
				737	PREFIX = u'<!DOCTYPE '
				738	SUFFIX = u'>\n'
				739
				740
				741	class Tag(PageElement):
				742
				743	"""Represents a found HTML tag with its attributes and contents."""
				744
				745	def __init__(self, parser=None, builder=None, name=None, namespace=None,
				746	prefix=None, attrs=None, parent=None, previous=None):
				747	"Basic constructor."
				748
				749	if parser is None:
				750	self.parser_class = None
				751	else:
				752	# We don't actually store the parser object: that lets extracted
				753	# chunks be garbage-collected.
				754	self.parser_class = parser.__class__
				755	if name is None:
				756	raise ValueError("No value provided for new tag's name.")
				757	self.name = name
				758	self.namespace = namespace
				759	self.prefix = prefix
				760	if attrs is None:
				761	attrs = {}
				762	elif attrs and builder.cdata_list_attributes:
				763	attrs = builder._replace_cdata_list_attribute_values(
				764	self.name, attrs)
				765	else:
				766	attrs = dict(attrs)
				767	self.attrs = attrs
				768	self.contents = []
				769	self.setup(parent, previous)
				770	self.hidden = False
				771
				772	# Set up any substitutions, such as the charset in a META tag.
				773	if builder is not None:
				774	builder.set_up_substitutions(self)
				775	self.can_be_empty_element = builder.can_be_empty_element(name)
				776	else:
				777	self.can_be_empty_element = False
				778
				779	parserClass = _alias("parser_class") # BS3
				780
				781	@property
				782	def is_empty_element(self):
				783	"""Is this tag an empty-element tag? (aka a self-closing tag)
				784
				785	A tag that has contents is never an empty-element tag.
				786
				787	A tag that has no contents may or may not be an empty-element
				788	tag. It depends on the builder used to create the tag. If the
				789	builder has a designated list of empty-element tags, then only
				790	a tag whose name shows up in that list is considered an
				791	empty-element tag.
				792
				793	If the builder has no designated list of empty-element tags,
				794	then any tag with no contents is an empty-element tag.
				795	"""
				796	return len(self.contents) == 0 and self.can_be_empty_element
				797	isSelfClosing = is_empty_element # BS3
				798
    @property
    def string(self):
        """Convenience property to get the single string within this tag.

        :Return: The string itself when this tag's only child is a
         string. None when the tag has no children or more than one.
         When the only child is a tag, that child's own 'string'
         attribute, recursively.
        """
        if len(self.contents) != 1:
            return None
        only_child = self.contents[0]
        if not isinstance(only_child, NavigableString):
            return only_child.string
        return only_child

    @string.setter
    def string(self, string):
        # Drop the current children, then append a fresh object of the
        # same class as the given string.
        self.clear()
        replacement = string.__class__(string)
        self.append(replacement)
				820
    def _all_strings(self, strip=False, types=(NavigableString, CData)):
        """Yield all strings of certain classes, possibly stripping them.

        By default only objects whose exact type is NavigableString or
        CData are yielded, so no comments, processing instructions,
        etc. With ``types=None`` every NavigableString instance is
        yielded. With ``strip`` true each string is stripped and
        strings that end up empty are skipped.
        """
        for descendant in self.descendants:
            if types is None:
                if not isinstance(descendant, NavigableString):
                    continue
            elif type(descendant) not in types:
                continue
            if strip:
                descendant = descendant.strip()
                if len(descendant) == 0:
                    continue
            yield descendant

    strings = property(_all_strings)
				840
    @property
    def stripped_strings(self):
        """Yield what _all_strings() produces with stripping turned on."""
        for stripped in self._all_strings(True):
            yield stripped
				845
    def get_text(self, separator=u"", strip=False,
                 types=(NavigableString, CData)):
        """
        Get all child strings, concatenated using the given separator.

        ``strip`` and ``types`` are handed on to _all_strings().
        """
        pieces = list(self._all_strings(strip, types=types))
        return separator.join(pieces)
    getText = get_text
    text = property(get_text)
				855
    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        current = self
        while current is not None:
            # Remember the successor before wiping the element's state.
            following = current.next_element
            current.__dict__.clear()
            current.contents = []
            current = following
				865
    def clear(self, decompose=False):
        """
        Remove every child of this tag.

        :param decompose: When true, children that are Tag objects are
            destroyed with decompose(); all other children (and all
            children when this is false) are detached with extract().
        """
        # Iterate over a snapshot because removing children mutates
        # self.contents.
        for child in self.contents[:]:
            if decompose and isinstance(child, Tag):
                child.decompose()
            else:
                child.extract()
				879
    def index(self, element):
        """
        Return the position of `element` among this tag's children.

        Children are compared with `is`, so an equal-but-distinct
        child is never mistaken for the one asked about (which
        tag.contents.index(element) could do).

        :raises ValueError: if `element` is not one of the children.
        """
        position = 0
        for child in self.contents:
            if child is element:
                return position
            position += 1
        raise ValueError("Tag.index: element not in tag")
				889
    def get(self, key, default=None):
        """Look up attribute `key` on this tag.

        Returns the attribute's value, or `default` when the tag has
        no such attribute.
        """
        attributes = self.attrs
        return attributes.get(key, default)
				895
    def has_attr(self, key):
        """Tell whether this tag carries an attribute named `key`."""
        attributes = self.attrs
        return key in attributes
				898
    def __hash__(self):
        """Hash a tag by the hash of its str() rendering."""
        return hash(str(self))
				901
    def __getitem__(self, key):
        """Return the value of attribute `key` (tag[key]).

        The lookup is a plain subscript on self.attrs, so a missing
        attribute raises whatever that mapping raises.
        """
        attributes = self.attrs
        return attributes[key]
				906
    def __iter__(self):
        """Iterate over this tag's direct children (self.contents)."""
        children = self.contents
        return iter(children)
				910
    def __len__(self):
        """Return the number of direct children (len of self.contents)."""
        children = self.contents
        return len(children)
				914
    def __contains__(self, x):
        """Support `x in tag` by testing membership in self.contents."""
        children = self.contents
        return x in children
				917
    def __nonzero__(self):
        """A tag is always truthy, even if it has no contents.

        Without this hook truth testing would fall back to __len__,
        making a tag with no children evaluate as false.
        """
        return True

    # Python 3 looks for __bool__ instead of __nonzero__; expose the same
    # hook under both names so truthiness is identical on either version.
    __bool__ = __nonzero__
				921
    def __setitem__(self, key, value):
        """Store `value` as attribute `key` of this tag
        (tag[key] = value)."""
        attributes = self.attrs
        attributes[key] = value
				926
    def __delitem__(self, key):
        """Remove attribute `key` from this tag (del tag[key]).

        A missing attribute is ignored rather than raising.
        """
        attributes = self.attrs
        attributes.pop(key, None)
				930
    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        find_all() method. Eg. tag('a') returns a list of all the A tags
        found within this tag.

        All positional and keyword arguments are forwarded to
        find_all() unchanged, so tag() with no arguments and keyword
        filters such as tag('a', limit=2) work exactly as they do on
        find_all().
        """
        return self.find_all(*args, **kwargs)
				936
    def __getattr__(self, tag):
        """Resolve an unknown attribute as a search for a child tag.

        * A name longer than three characters that ends in 'Tag' is
          the BS3 spelling (soup.aTag): a deprecation warning is
          issued and find() is called with the suffix removed.
        * Any other name that does not start with a double underscore
          and is not 'contents' is passed to find() as-is.
        * Everything else raises AttributeError.
        """
        if len(tag) > 3 and tag.endswith('Tag'):
            # BS3: soup.aTag -> "soup.find("a")
            tag_name = tag[:-3]
            warnings.warn(
                '.%sTag is deprecated, use .find("%s") instead.' % (
                    tag_name, tag_name))
            return self.find(tag_name)
        # We special case contents to avoid recursion.
        if tag.startswith("__") or tag == "contents":
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (self.__class__, tag))
        return self.find(tag)
				951
    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        An object lacking any of `name`, `attrs` or `contents` never
        compares equal.
        """
        if self is other:
            return True
        looks_like_tag = (hasattr(other, 'name')
                          and hasattr(other, 'attrs')
                          and hasattr(other, 'contents'))
        if not looks_like_tag:
            return False
        if self.name != other.name:
            return False
        if self.attrs != other.attrs:
            return False
        if len(self) != len(other):
            return False
        # Same number of children: compare them pairwise, in order.
        for position, mine in enumerate(self.contents):
            if mine != other.contents[position]:
                return False
        return True
				968
    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        if self == other:
            return False
        return True
				973
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Render this tag through encode() using `encoding`."""
        rendered = self.encode(encoding)
        return rendered

    def __unicode__(self):
        """Render this tag through decode() with default arguments."""
        rendered = self.decode()
        return rendered

    def __str__(self):
        """Render this tag through encode() with default arguments."""
        rendered = self.encode()
        return rendered

    if PY3K:
        # When PY3K is set, both conversions use the decode()-based
        # rendering.
        __repr__ = __unicode__
        __str__ = __unicode__
				986
    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               indent_level=None, formatter="minimal",
               errors="xmlcharrefreplace"):
        """Render this tag as an encoded string.

        The tag is first rendered by decode(); the result is then
        encoded with `encoding`, using `errors` as the error handler.

        :param encoding: Target encoding, also passed to decode() as
            the eventual encoding.
        :param indent_level: Forwarded to decode().
        :param formatter: Forwarded to decode().
        :param errors: Error handler for the final encode step.
        """
        as_unicode = self.decode(indent_level, encoding, formatter)
        return as_unicode.encode(encoding, errors)
				994
    def _should_pretty_print(self, indent_level):
        """Should this tag be pretty-printed?

        Never when `indent_level` is None. Otherwise yes unless the
        tag's name is listed in
        HTMLAwareEntitySubstitution.preformatted_tags -- and even then
        yes if self._is_xml is set.
        """
        if indent_level is None:
            return False
        return (
            self.name not in HTMLAwareEntitySubstitution.preformatted_tags
            or self._is_xml)
				1001
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a Unicode representation of this tag and its contents.

        :param indent_level: None for compact output; otherwise the
            nesting level used to compute indentation when
            pretty-printing.
        :param eventual_encoding: The tag is destined to be
            encoded into this encoding. This method is _not_
            responsible for performing that encoding. This information
            is passed in so that it can be substituted in if the
            document contains a <META> tag that mentions the document's
            encoding.
        :param formatter: A callable, or a name that is resolved to
            one through self._formatter_for_name().
        """
        # Resolve a formatter name once so the lookup is not repeated
        # for every string further down the tree.
        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)

        # Render each attribute as `key` or `key="value"`.
        rendered_attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    rendered_attrs.append(key)
                    continue
                if isinstance(val, (list, tuple)):
                    val = ' '.join(val)
                elif not isinstance(val, basestring):
                    val = unicode(val)
                elif (isinstance(val, AttributeValueWithCharsetSubstitution)
                      and eventual_encoding is not None):
                    val = val.encode(eventual_encoding)

                text = self.format_string(val, formatter)
                rendered_attrs.append(
                    unicode(key) + '='
                    + EntitySubstitution.quoted_attribute_value(text))

        if self.prefix:
            prefix = self.prefix + ":"
        else:
            prefix = ''

        # An empty-element tag closes itself and has no closing tag.
        if self.is_empty_element:
            close = '/'
            closeTag = ''
        else:
            close = ''
            closeTag = '</%s%s>' % (prefix, self.name)

        pretty_print = self._should_pretty_print(indent_level)
        if indent_level is None:
            indent_space = ''
        else:
            indent_space = (' ' * (indent_level - 1))
        if pretty_print:
            space = indent_space
            indent_contents = indent_level + 1
        else:
            space = ''
            indent_contents = None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, formatter)

        if self.hidden:
            # This is the 'document root' object.
            return contents

        pieces = []
        if rendered_attrs:
            attribute_string = ' ' + ' '.join(rendered_attrs)
        else:
            attribute_string = ''
        if indent_level is not None:
            # Even if this particular tag is not pretty-printed,
            # we should indent up to the start of the tag.
            pieces.append(indent_space)
        pieces.append('<%s%s%s%s>' % (
            prefix, self.name, attribute_string, close))
        if pretty_print:
            pieces.append("\n")
        pieces.append(contents)
        if pretty_print and contents and contents[-1] != "\n":
            pieces.append("\n")
        if pretty_print and closeTag:
            pieces.append(space)
        pieces.append(closeTag)
        if indent_level is not None and closeTag and self.next_sibling:
            # Even if this particular tag is not pretty-printed,
            # we're now done with the tag, and we should add a
            # newline if appropriate.
            pieces.append("\n")
        return ''.join(pieces)
				1094
    def prettify(self, encoding=None, formatter="minimal"):
        """Render this tag with indentation switched on.

        With no `encoding` the result comes from decode(); otherwise
        it comes from encode() using that encoding. Either way True is
        passed as the indent level.
        """
        if encoding is not None:
            return self.encode(encoding, True, formatter=formatter)
        return self.decode(True, formatter=formatter)
				1100
    def decode_contents(self, indent_level=None,
                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                        formatter="minimal"):
        """Renders the contents of this tag as a Unicode string.

        :param indent_level: None for compact output; otherwise
            strings are placed on their own indented lines (except
            inside a tag named 'pre').
        :param eventual_encoding: The tag is destined to be
            encoded into this encoding. This method is _not_
            responsible for performing that encoding. This information
            is passed in so that it can be substituted in if the
            document contains a <META> tag that mentions the document's
            encoding.
        :param formatter: A callable, or a name that is resolved to
            one through self._formatter_for_name().
        """
        # Resolve a formatter name once so the lookup is not repeated
        # for every child.
        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)

        inside_pre = (self.name == 'pre')
        # Strings get their own indented line only when pretty-printing
        # outside of <pre>.
        own_line = (indent_level is not None) and not inside_pre
        pieces = []
        for child in self:
            if not isinstance(child, NavigableString):
                if isinstance(child, Tag):
                    pieces.append(child.decode(indent_level,
                                               eventual_encoding,
                                               formatter))
                continue
            text = child.output_ready(formatter)
            if text and indent_level and not inside_pre:
                text = text.strip()
            if not text:
                continue
            if own_line:
                pieces.append(" " * (indent_level - 1))
            pieces.append(text)
            if own_line:
                pieces.append("\n")
        return ''.join(pieces)
				1136
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
        """Renders the contents of this tag as a bytestring.

        The contents are rendered by decode_contents() and the result
        is encoded with `encoding`.
        """
        as_unicode = self.decode_contents(indent_level, encoding, formatter)
        return as_unicode.encode(encoding)
				1143
    # Old method for BS3 compatibility
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """BS3-style wrapper around encode_contents().

        `indentLevel` is only honoured when `prettyPrint` is true;
        otherwise None is passed as the indent level.
        """
        if prettyPrint:
            level = indentLevel
        else:
            level = None
        return self.encode_contents(indent_level=level, encoding=encoding)
				1151
				1152	#Soup methods
				1153
    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria.

        Delegates to find_all() with a limit of 1 and returns the
        first hit, or None when nothing matched.
        """
        matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
        if matches:
            return matches[0]
        return None
    findChild = find
				1164
    def find_all(self, name=None, attrs={}, recursive=True, text=None,
                 limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria. You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With `recursive` true the search covers self.descendants;
        otherwise only self.children. The actual matching is done by
        self._find_all().
        """
        if recursive:
            candidates = self.descendants
        else:
            candidates = self.children
        return self._find_all(name, attrs, text, limit, candidates, **kwargs)
    findAll = find_all       # BS3
    findChildren = find_all  # BS2
				1183
				1184	#Generator methods
    @property
    def children(self):
        """An iterator over this tag's direct children."""
        # Wrapped in iter() so callers get an iterator, not the list
        # itself.
        direct = self.contents
        return iter(direct)  # XXX This seems to be untested.
				1189
    @property
    def descendants(self):
        """Generate every node beneath this tag.

        Starts at the first child and follows next_element links,
        stopping at the node that comes after this tag's last
        descendant. Yields nothing for a tag without children.
        """
        if len(self.contents) == 0:
            return
        boundary = self._last_descendant().next_element
        node = self.contents[0]
        while node is not boundary:
            yield node
            node = node.next_element
				1199
				1200	# CSS selector code
				1201
    _selector_combinators = ['>', '+', '~']
    _select_debug = False

    def select(self, selector, _candidate_generator=None):
        """Perform a CSS selection operation on the current element.

        The selector is split on whitespace and handled one token at a
        time. Each token narrows `current_context` (initially just this
        element) to the candidates that match it. Supported tokens are
        attribute selectors, `tag#id`, `tag.class[.class...]`,
        `tag:nth-of-type(N)`, `*`, bare tag names, and the combinators
        `>`, `~` and `+`, which apply the following token to children,
        following siblings, or the next sibling tag respectively.

        :param selector: The CSS selector string.
        :param _candidate_generator: Internal. A callable taking a tag
            and returning the candidates to test; used when select()
            calls itself on behalf of a combinator. By default the
            candidates are the tag's descendants.
        :return: A list of the matching tags, without duplicates.
        :raises ValueError: if the selector ends with a combinator, a
            pseudo-class has no tag name, an nth-of-type value is less
            than 1, or a token is not recognised.
        :raises NotImplementedError: for pseudo-classes other than
            nth-of-type, or a non-numeric nth-of-type value.
        """
        tokens = selector.split()
        current_context = [self]

        if tokens[-1] in self._selector_combinators:
            raise ValueError(
                'Final combinator "%s" is missing an argument.' % tokens[-1])
        if self._select_debug:
            print('Running CSS selector "%s"' % selector)
        for index, token in enumerate(tokens):
            if self._select_debug:
                print(' Considering token "%s"' % token)
            recursive_candidate_generator = None
            tag_name = None
            # For index 0 this looks at tokens[-1], which was verified
            # above not to be a combinator.
            if tokens[index-1] in self._selector_combinators:
                # This token was consumed by the previous combinator. Skip it.
                if self._select_debug:
                    print(' Token was consumed by the previous combinator.')
                continue
            # Each operation corresponds to a checker function, a rule
            # for determining whether a candidate matches the
            # selector. Candidates are generated by the active
            # iterator.
            checker = None

            m = self.attribselect_re.match(token)
            if m is not None:
                # Attribute selector
                tag_name, attribute, operator, value = m.groups()
                checker = self._attribute_checker(operator, attribute, value)

            elif '#' in token:
                # ID selector
                tag_name, tag_id = token.split('#', 1)
                def id_matches(tag):
                    return tag.get('id', None) == tag_id
                checker = id_matches

            elif '.' in token:
                # Class selector
                tag_name, klass = token.split('.', 1)
                classes = set(klass.split('.'))
                def classes_match(candidate):
                    return classes.issubset(candidate.get('class', []))
                checker = classes_match

            elif ':' in token:
                # Pseudo-class
                tag_name, pseudo = token.split(':', 1)
                if tag_name == '':
                    raise ValueError(
                        "A pseudo-class must be prefixed with a tag name.")
                # Matches "name(value)", e.g. "nth-of-type(3)". A
                # pseudo-class that is not of this form installs no
                # checker, so only the tag name is matched.
                pseudo_attributes = re.match(
                    r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                if pseudo_attributes is not None:
                    pseudo_type, pseudo_value = pseudo_attributes.groups()
                    if pseudo_type == 'nth-of-type':
                        try:
                            pseudo_value = int(pseudo_value)
                        except ValueError:
                            raise NotImplementedError(
                                'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                        if pseudo_value < 1:
                            raise ValueError(
                                'nth-of-type pseudo-class value must be at least 1.')
                        class Counter(object):
                            def __init__(self, destination):
                                self.count = 0
                                self.destination = destination

                            def nth_child_of_type(self, tag):
                                self.count += 1
                                if self.count == self.destination:
                                    return True
                                if self.count > self.destination:
                                    # Stop the generator that's sending us
                                    # these things.
                                    raise StopIteration()
                                return False
                        checker = Counter(pseudo_value).nth_child_of_type
                    else:
                        raise NotImplementedError(
                            'Only the following pseudo-classes are implemented: nth-of-type.')

            elif token == '*':
                # Star selector -- matches everything
                pass
            elif token == '>':
                # Run the next token as a CSS selector against the
                # direct children of each tag in the current context.
                recursive_candidate_generator = lambda tag: tag.children
            elif token == '~':
                # Run the next token as a CSS selector against the
                # siblings of each tag in the current context.
                recursive_candidate_generator = lambda tag: tag.next_siblings
            elif token == '+':
                # For each tag in the current context, run the next
                # token as a CSS selector against the tag's next
                # sibling that's a tag.
                def next_tag_sibling(tag):
                    yield tag.find_next_sibling(True)
                recursive_candidate_generator = next_tag_sibling

            elif self.tag_name_re.match(token):
                # Just a tag name.
                tag_name = token
            else:
                raise ValueError(
                    'Unsupported or invalid CSS selector: "%s"' % token)

            if recursive_candidate_generator:
                # This happens when the selector looks like "> foo".
                #
                # The generator calls select() recursively on every
                # member of the current context, passing in a different
                # candidate generator and a different selector.
                #
                # In the case of "> foo", the candidate generator is
                # one that yields a tag's direct children (">"), and
                # the selector is "foo".
                next_token = tokens[index+1]
                def recursive_select(tag):
                    if self._select_debug:
                        print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
                        print('-' * 40)
                    for i in tag.select(next_token, recursive_candidate_generator):
                        if self._select_debug:
                            print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                        yield i
                    if self._select_debug:
                        print('-' * 40)
                _use_candidate_generator = recursive_select
            elif _candidate_generator is None:
                # By default, a tag's candidates are all of its
                # children. If tag_name is defined, only yield tags
                # with that name.
                if self._select_debug:
                    if tag_name:
                        check = tag_name
                    else:
                        check = "[any]"
                    print(' Default candidate generator, tag name="%s"' % check)
                if self._select_debug:
                    # This is redundant with later code, but it stops
                    # a bunch of bogus tags from cluttering up the
                    # debug log.
                    def default_candidate_generator(tag):
                        for child in tag.descendants:
                            if not isinstance(child, Tag):
                                continue
                            if tag_name and not child.name == tag_name:
                                continue
                            yield child
                    _use_candidate_generator = default_candidate_generator
                else:
                    _use_candidate_generator = lambda tag: tag.descendants
            else:
                _use_candidate_generator = _candidate_generator

            new_context = []
            new_context_ids = set()
            for tag in current_context:
                if self._select_debug:
                    print(" Running candidate generator on %s %s" % (
                        tag.name, repr(tag.attrs)))
                for candidate in _use_candidate_generator(tag):
                    if not isinstance(candidate, Tag):
                        continue
                    if tag_name and candidate.name != tag_name:
                        continue
                    if checker is not None:
                        try:
                            result = checker(candidate)
                        except StopIteration:
                            # The checker has decided we should no longer
                            # run the generator.
                            break
                    if checker is None or result:
                        if self._select_debug:
                            print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                        if id(candidate) not in new_context_ids:
                            # If a tag matches a selector more than once,
                            # don't include it in the context more than once.
                            new_context.append(candidate)
                            new_context_ids.add(id(candidate))
                    elif self._select_debug:
                        print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

            current_context = new_context

        if self._select_debug:
            print("Final verdict:")
            for i in current_context:
                print(" %s %s" % (i.name, i.attrs))
        return current_context
				1400
				1401	# Old names for backwards compatibility
    def childGenerator(self):
        """Old name kept for backwards compatibility; returns
        self.children."""
        direct_children = self.children
        return direct_children
				1404
    def recursiveChildGenerator(self):
        """Old name kept for backwards compatibility; returns
        self.descendants."""
        everything_below = self.descendants
        return everything_below
				1407
    def has_key(self, key):
        """Deprecated spelling of has_attr().

        This was kind of misleading because has_key() (attributes)
        was different from __in__ (contents). has_key() is gone in
        Python 3, anyway. Issues a warning, then answers via
        has_attr().
        """
        message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
            key)
        warnings.warn(message)
        return self.has_attr(key)
				1415
				1416	# Next, a couple classes to represent queries and their results.
				1417	class SoupStrainer(object):
				1418	"""Encapsulates a number of ways of matching a markup element (tag or
				1419	text)."""
				1420
				1421	def __init__(self, name=None, attrs={}, text=None, **kwargs):
				1422	self.name = self._normalize_search_value(name)
				1423	if not isinstance(attrs, dict):
				1424	# Treat a non-dict value for attrs as a search for the 'class'
				1425	# attribute.
				1426	kwargs['class'] = attrs
				1427	attrs = None
				1428
				1429	if 'class_' in kwargs:
				1430	# Treat class_="foo" as a search for the 'class'
				1431	# attribute, overriding any non-dict value for attrs.
				1432	kwargs['class'] = kwargs['class_']
				1433	del kwargs['class_']
				1434
				1435	if kwargs:
				1436	if attrs:
				1437	attrs = attrs.copy()
				1438	attrs.update(kwargs)
				1439	else:
				1440	attrs = kwargs
				1441	normalized_attrs = {}
				1442	for key, value in attrs.items():
				1443	normalized_attrs[key] = self._normalize_search_value(value)
				1444
				1445	self.attrs = normalized_attrs
				1446	self.text = self._normalize_search_value(text)
				1447
				1448	def _normalize_search_value(self, value):
				1449	# Leave it alone if it's a Unicode string, a callable, a
				1450	# regular expression, a boolean, or None.
				1451	if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
				1452	or isinstance(value, bool) or value is None):
				1453	return value
				1454
				1455	# If it's a bytestring, convert it to Unicode, treating it as UTF-8.
				1456	if isinstance(value, bytes):
				1457	return value.decode("utf8")
				1458
				1459	# If it's listlike, convert it into a list of strings.
				1460	if hasattr(value, '__iter__'):
				1461	new_value = []
				1462	for v in value:
				1463	if (hasattr(v, '__iter__') and not isinstance(v, bytes)
				1464	and not isinstance(v, unicode)):
				1465	# This is almost certainly the user's mistake. In the
				1466	# interests of avoiding infinite loops, we'll let
				1467	# it through as-is rather than doing a recursive call.
				1468	new_value.append(v)
				1469	else:
				1470	new_value.append(self._normalize_search_value(v))
				1471	return new_value
				1472
				1473	# Otherwise, convert it into a Unicode string.
				1474	# The unicode(str()) thing is so this will do the same thing on Python 2
				1475	# and Python 3.
				1476	return unicode(str(value))
				1477
				1478	def __str__(self):
				1479	if self.text:
				1480	return self.text
				1481	else:
				1482	return "%s\|%s" % (self.name, self.attrs)
				1483
				1484	def search_tag(self, markup_name=None, markup_attrs={}):
				1485	found = None
				1486	markup = None
				1487	if isinstance(markup_name, Tag):
				1488	markup = markup_name
				1489	markup_attrs = markup
				1490	call_function_with_tag_data = (
				1491	isinstance(self.name, collections.Callable)
				1492	and not isinstance(markup_name, Tag))
				1493
				1494	if ((not self.name)
				1495	or call_function_with_tag_data
				1496	or (markup and self._matches(markup, self.name))
				1497	or (not markup and self._matches(markup_name, self.name))):
				1498	if call_function_with_tag_data:
				1499	match = self.name(markup_name, markup_attrs)
				1500	else:
				1501	match = True
				1502	markup_attr_map = None
				1503	for attr, match_against in list(self.attrs.items()):
				1504	if not markup_attr_map:
				1505	if hasattr(markup_attrs, 'get'):
				1506	markup_attr_map = markup_attrs
				1507	else:
				1508	markup_attr_map = {}
				1509	for k, v in markup_attrs:
				1510	markup_attr_map[k] = v
				1511	attr_value = markup_attr_map.get(attr)
				1512	if not self._matches(attr_value, match_against):
				1513	match = False
				1514	break
				1515	if match:
				1516	if markup:
				1517	found = markup
				1518	else:
				1519	found = markup_name
				1520	if found and self.text and not self._matches(found.string, self.text):
				1521	found = None
				1522	return found
				1523	searchTag = search_tag
				1524
				1525	def search(self, markup):
				1526	# print 'looking for %s in %s' % (self, markup)
				1527	found = None
				1528	# If given a list of items, scan it for a text element that
				1529	# matches.
				1530	if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
				1531	for element in markup:
				1532	if isinstance(element, NavigableString) \
				1533	and self.search(element):
				1534	found = element
				1535	break
				1536	# If it's a Tag, make sure its name or attributes match.
				1537	# Don't bother with Tags if we're searching for text.
				1538	elif isinstance(markup, Tag):
				1539	if not self.text or self.name or self.attrs:
				1540	found = self.search_tag(markup)
				1541	# If it's text, make sure the text matches.
				1542	elif isinstance(markup, NavigableString) or \
				1543	isinstance(markup, basestring):
				1544	if not self.name and not self.attrs and self._matches(markup, self.text):
				1545	found = markup
				1546	else:
				1547	raise Exception(
				1548	"I don't know how to match against a %s" % markup.__class__)
				1549	return found
				1550
				1551	def _matches(self, markup, match_against):
				1552	# print u"Matching %s against %s" % (markup, match_against)
				1553	result = False
				1554	if isinstance(markup, list) or isinstance(markup, tuple):
				1555	# This should only happen when searching a multi-valued attribute
				1556	# like 'class'.
				1557	if (isinstance(match_against, unicode)
				1558	and ' ' in match_against):
				1559	# A bit of a special case. If they try to match "foo
				1560	# bar" on a multivalue attribute's value, only accept
				1561	# the literal value "foo bar"
				1562	#
				1563	# XXX This is going to be pretty slow because we keep
				1564	# splitting match_against. But it shouldn't come up
				1565	# too often.
				1566	return (whitespace_re.split(match_against) == markup)
				1567	else:
				1568	for item in markup:
				1569	if self._matches(item, match_against):
				1570	return True
				1571	return False
				1572
				1573	if match_against is True:
				1574	# True matches any non-None value.
				1575	return markup is not None
				1576
				1577	if isinstance(match_against, collections.Callable):
				1578	return match_against(markup)
				1579
				1580	# Custom callables take the tag as an argument, but all
				1581	# other ways of matching match the tag name as a string.
				1582	if isinstance(markup, Tag):
				1583	markup = markup.name
				1584
				1585	# Ensure that `markup` is either a Unicode string, or None.
				1586	markup = self._normalize_search_value(markup)
				1587
				1588	if markup is None:
				1589	# None matches None, False, an empty string, an empty list, and so on.
				1590	return not match_against
				1591
				1592	if isinstance(match_against, unicode):
				1593	# Exact string match
				1594	return markup == match_against
				1595
				1596	if hasattr(match_against, 'match'):
				1597	# Regexp match
				1598	return match_against.search(markup)
				1599
				1600	if hasattr(match_against, '__iter__'):
				1601	# The markup must be an exact match against something
				1602	# in the iterable.
				1603	return markup in match_against
				1604
				1605
				1606	class ResultSet(list):
				1607	"""A ResultSet is just a list that keeps track of the SoupStrainer
				1608	that created it."""
				1609	def __init__(self, source, result=()):
				1610	super(ResultSet, self).__init__(result)
				1611	self.source = source