Blame - poky/bitbake/lib/bs4/element.py - mdmillerii/openbmc

blob: 0e62c2e100974114b02b3068c5a0efd51fa94a37 [file] [log] [blame]

Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1	__license__ = "MIT"
				2
				3	from pdb import set_trace
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	4	import collections
				5	import re
				6	import sys
				7	import warnings
				8	from bs4.dammit import EntitySubstitution
				9
				10	DEFAULT_OUTPUT_ENCODING = "utf-8"
				11	PY3K = (sys.version_info[0] > 2)
				12
				13	whitespace_re = re.compile("\s+")
				14
				15	def _alias(attr):
				16	"""Alias one attribute name to another for backward compatibility"""
				17	@property
				18	def alias(self):
				19	return getattr(self, attr)
				20
				21	@alias.setter
				22	def alias(self):
				23	return setattr(self, attr)
				24	return alias
				25
				26
class NamespacedAttribute(str):
    """A string that also records the parts of a namespaced attribute name.

    The string value is ``prefix`` when ``name`` is None, ``name`` when
    ``prefix`` is None, and ``prefix:name`` otherwise.  The ``prefix``,
    ``name`` and ``namespace`` arguments are stored on the instance.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
				41
class AttributeValueWithCharsetSubstitution(str):
    """Base string type standing in for a character encoding named in HTML."""
				44
				45	class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
				46	"""A generic stand-in for the value of a meta tag's 'charset' attribute.
				47
				48	When Beautiful Soup parses the markup '<meta charset="utf8">', the
				49	value of the 'charset' attribute will be one of these objects.
				50	"""
				51
				52	def __new__(cls, original_value):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	53	obj = str.__new__(cls, original_value)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	54	obj.original_value = original_value
				55	return obj
				56
				57	def encode(self, encoding):
				58	return encoding
				59
				60
				61	class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
				62	"""A generic stand-in for the value of a meta tag's 'content' attribute.
				63
				64	When Beautiful Soup parses the markup:
				65	<meta http-equiv="content-type" content="text/html; charset=utf8">
				66
				67	The value of the 'content' attribute will be one of these objects.
				68	"""
				69
				70	CHARSET_RE = re.compile("((^\|;)\scharset=)([^;])", re.M)
				71
				72	def __new__(cls, original_value):
				73	match = cls.CHARSET_RE.search(original_value)
				74	if match is None:
				75	# No substitution necessary.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	76	return str.__new__(str, original_value)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	77
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	78	obj = str.__new__(cls, original_value)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	79	obj.original_value = original_value
				80	return obj
				81
				82	def encode(self, encoding):
				83	def rewrite(match):
				84	return match.group(1) + encoding
				85	return self.CHARSET_RE.sub(rewrite, self.original_value)
				86
				87	class HTMLAwareEntitySubstitution(EntitySubstitution):
				88
				89	"""Entity substitution rules that are aware of some HTML quirks.
				90
				91	Specifically, the contents of <script> and <style> tags should not
				92	undergo entity substitution.
				93
				94	Incoming NavigableString objects are checked to see if they're the
				95	direct children of a <script> or <style> tag.
				96	"""
				97
				98	cdata_containing_tags = set(["script", "style"])
				99
				100	preformatted_tags = set(["pre"])
				101
				102	@classmethod
				103	def _substitute_if_appropriate(cls, ns, f):
				104	if (isinstance(ns, NavigableString)
				105	and ns.parent is not None
				106	and ns.parent.name in cls.cdata_containing_tags):
				107	# Do nothing.
				108	return ns
				109	# Substitute.
				110	return f(ns)
				111
				112	@classmethod
				113	def substitute_html(cls, ns):
				114	return cls._substitute_if_appropriate(
				115	ns, EntitySubstitution.substitute_html)
				116
				117	@classmethod
				118	def substitute_xml(cls, ns):
				119	return cls._substitute_if_appropriate(
				120	ns, EntitySubstitution.substitute_xml)
				121
				122	class PageElement(object):
				123	"""Contains the navigational information for some part of the page
				124	(either a tag or a piece of text)"""
				125
# There are four possible values for the "formatter" argument passed in
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
# are converted to those entities on output.
# "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
				133	# None - The null formatter. Unicode characters are never
				134	# converted to entities. This is not recommended, but it's
				135	# faster than "minimal".
				136	# A function - This function will be called on every string that
				137	# needs to undergo entity substitution.
				138	#
				139
				140	# In an HTML document, the default "html" and "minimal" functions
				141	# will leave the contents of <script> and <style> tags alone. For
				142	# an XML document, all tags will be given the same treatment.
				143
# Formatter-name lookup consulted by _formatter_for_name() when the
# element is not part of an XML tree.  The None key maps to None, which
# format_string() treats as "return the string unchanged".
HTML_FORMATTERS = {
    "html" : HTMLAwareEntitySubstitution.substitute_html,
    "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
    None : None
    }

# Formatter-name lookup consulted by _formatter_for_name() when the
# element is part of an XML tree; uses the plain EntitySubstitution
# functions rather than the HTML-aware ones.
XML_FORMATTERS = {
    "html" : EntitySubstitution.substitute_html,
    "minimal" : EntitySubstitution.substitute_xml,
    None : None
    }
				155
				156	def format_string(self, s, formatter='minimal'):
				157	"""Format the given string using the given formatter."""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	158	if not isinstance(formatter, collections.Callable):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	159	formatter = self._formatter_for_name(formatter)
				160	if formatter is None:
				161	output = s
				162	else:
				163	output = formatter(s)
				164	return output
				165
				166	@property
				167	def _is_xml(self):
				168	"""Is this element part of an XML tree or an HTML tree?
				169
				170	This is used when mapping a formatter name ("minimal") to an
				171	appropriate function (one that performs entity-substitution on
				172	the contents of <script> and <style> tags, or not). It's
				173	inefficient, but it should be called very rarely.
				174	"""
				175	if self.parent is None:
				176	# This is the top-level object. It should have .is_xml set
				177	# from tree creation. If not, take a guess--BS is usually
				178	# used on HTML markup.
				179	return getattr(self, 'is_xml', False)
				180	return self.parent._is_xml
				181
				182	def _formatter_for_name(self, name):
				183	"Look up a formatter function based on its name and the tree."
				184	if self._is_xml:
				185	return self.XML_FORMATTERS.get(
				186	name, EntitySubstitution.substitute_xml)
				187	else:
				188	return self.HTML_FORMATTERS.get(
				189	name, HTMLAwareEntitySubstitution.substitute_xml)
				190
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	191	def setup(self, parent=None, previous_element=None, next_element=None,
				192	previous_sibling=None, next_sibling=None):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	193	"""Sets up the initial relations between this element and
				194	other elements."""
				195	self.parent = parent
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	196
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	197	self.previous_element = previous_element
				198	if previous_element is not None:
				199	self.previous_element.next_element = self
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	200
				201	self.next_element = next_element
				202	if self.next_element:
				203	self.next_element.previous_element = self
				204
				205	self.next_sibling = next_sibling
				206	if self.next_sibling:
				207	self.next_sibling.previous_sibling = self
				208
				209	if (not previous_sibling
				210	and self.parent is not None and self.parent.contents):
				211	previous_sibling = self.parent.contents[-1]
				212
				213	self.previous_sibling = previous_sibling
				214	if previous_sibling:
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	215	self.previous_sibling.next_sibling = self
				216
# BS3-era camelCase names; each forwards to the snake_case attribute
# through the property built by _alias().
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
				219
				220	def replace_with(self, replace_with):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	221	if not self.parent:
				222	raise ValueError(
				223	"Cannot replace one element with another when the"
				224	"element to be replaced is not part of a tree.")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	225	if replace_with is self:
				226	return
				227	if replace_with is self.parent:
				228	raise ValueError("Cannot replace a Tag with its parent.")
				229	old_parent = self.parent
				230	my_index = self.parent.index(self)
				231	self.extract()
				232	old_parent.insert(my_index, replace_with)
				233	return self
				234	replaceWith = replace_with # BS3
				235
				236	def unwrap(self):
				237	my_parent = self.parent
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	238	if not self.parent:
				239	raise ValueError(
				240	"Cannot replace an element with its contents when that"
				241	"element is not part of a tree.")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	242	my_index = self.parent.index(self)
				243	self.extract()
				244	for child in reversed(self.contents[:]):
				245	my_parent.insert(my_index, child)
				246	return self
				247	replace_with_children = unwrap
				248	replaceWithChildren = unwrap # BS3
				249
				250	def wrap(self, wrap_inside):
				251	me = self.replace_with(wrap_inside)
				252	wrap_inside.append(me)
				253	return wrap_inside
				254
def extract(self):
    """Destructively rips this element out of the tree.

    Removes this element from its parent's ``contents``, reconnects the
    element and sibling links on either side of it, clears this
    element's own parent/sibling/previous-element links, and returns
    this element.
    """
    if self.parent is not None:
        del self.parent.contents[self.parent.index(self)]

    #Find the two elements that would be next to each other if
    #this element (and any children) hadn't been parsed. Connect
    #the two.
    last_child = self._last_descendant()
    next_element = last_child.next_element

    # The identity checks keep an element from being linked to itself
    # when the same object is on both sides.
    if (self.previous_element is not None and
        self.previous_element is not next_element):
        self.previous_element.next_element = next_element
    if next_element is not None and next_element is not self.previous_element:
        next_element.previous_element = self.previous_element
    # Detach both ends of the extracted run from the document order.
    self.previous_element = None
    last_child.next_element = None

    self.parent = None
    # Same reconnection, this time for the sibling chain.
    if (self.previous_sibling is not None
        and self.previous_sibling is not self.next_sibling):
        self.previous_sibling.next_sibling = self.next_sibling
    if (self.next_sibling is not None
        and self.next_sibling is not self.previous_sibling):
        self.next_sibling.previous_sibling = self.previous_sibling
    self.previous_sibling = self.next_sibling = None
    return self
				283
				284	def _last_descendant(self, is_initialized=True, accept_self=True):
				285	"Finds the last element beneath this object to be parsed."
				286	if is_initialized and self.next_sibling:
				287	last_child = self.next_sibling.previous_element
				288	else:
				289	last_child = self
				290	while isinstance(last_child, Tag) and last_child.contents:
				291	last_child = last_child.contents[-1]
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	292	if not accept_self and last_child is self:
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	293	last_child = None
				294	return last_child
				295	# BS3: Not part of the API!
				296	_lastRecursiveChild = _last_descendant
				297
def insert(self, position, new_child):
    """Insert ``new_child`` into this element's contents at ``position``.

    A plain ``str`` is wrapped in a NavigableString first.  A child that
    already has a parent is extracted from it before being linked in.
    The sibling links and the previous/next element links around the
    insertion point are all rewired.

    :param position: Target index; clamped to ``len(self.contents)``.
    :param new_child: The element or string to insert.
    :raises ValueError: If ``new_child`` is None or is this element.
    """
    if new_child is None:
        raise ValueError("Cannot insert None into a tag.")
    if new_child is self:
        raise ValueError("Cannot insert a tag into itself.")
    if (isinstance(new_child, str)
        and not isinstance(new_child, NavigableString)):
        new_child = NavigableString(new_child)

    position = min(position, len(self.contents))
    if hasattr(new_child, 'parent') and new_child.parent is not None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if new_child.parent is self:
            current_index = self.index(new_child)
            if current_index < position:
                # We're moving this element further down the list
                # of this object's children. That means that when
                # we extract this element, our target index will
                # jump down one.
                position -= 1
        new_child.extract()

    new_child.parent = self
    previous_child = None
    if position == 0:
        # First child: nothing before it among the siblings, and this
        # element itself is what precedes it in element order.
        new_child.previous_sibling = None
        new_child.previous_element = self
    else:
        previous_child = self.contents[position - 1]
        new_child.previous_sibling = previous_child
        new_child.previous_sibling.next_sibling = new_child
        new_child.previous_element = previous_child._last_descendant(False)
    if new_child.previous_element is not None:
        new_child.previous_element.next_element = new_child

    new_childs_last_element = new_child._last_descendant(False)

    if position >= len(self.contents):
        new_child.next_sibling = None

        # Walk up the ancestors until one of them has a next sibling;
        # that sibling is what follows the new child's subtree.
        parent = self
        parents_next_sibling = None
        while parents_next_sibling is None and parent is not None:
            parents_next_sibling = parent.next_sibling
            parent = parent.parent
            if parents_next_sibling is not None:
                # We found the element that comes next in the document.
                break
        if parents_next_sibling is not None:
            new_childs_last_element.next_element = parents_next_sibling
        else:
            # The last element of this tag is the last element in
            # the document.
            new_childs_last_element.next_element = None
    else:
        next_child = self.contents[position]
        new_child.next_sibling = next_child
        if new_child.next_sibling is not None:
            new_child.next_sibling.previous_sibling = new_child
        new_childs_last_element.next_element = next_child

    if new_childs_last_element.next_element is not None:
        new_childs_last_element.next_element.previous_element = new_childs_last_element
    self.contents.insert(position, new_child)
				363
				364	def append(self, tag):
				365	"""Appends the given tag to the contents of this tag."""
				366	self.insert(len(self.contents), tag)
				367
				368	def insert_before(self, predecessor):
				369	"""Makes the given element the immediate predecessor of this one.
				370
				371	The two elements will have the same parent, and the given element
				372	will be immediately before this one.
				373	"""
				374	if self is predecessor:
				375	raise ValueError("Can't insert an element before itself.")
				376	parent = self.parent
				377	if parent is None:
				378	raise ValueError(
				379	"Element has no parent, so 'before' has no meaning.")
				380	# Extract first so that the index won't be screwed up if they
				381	# are siblings.
				382	if isinstance(predecessor, PageElement):
				383	predecessor.extract()
				384	index = parent.index(self)
				385	parent.insert(index, predecessor)
				386
				387	def insert_after(self, successor):
				388	"""Makes the given element the immediate successor of this one.
				389
				390	The two elements will have the same parent, and the given element
				391	will be immediately after this one.
				392	"""
				393	if self is successor:
				394	raise ValueError("Can't insert an element after itself.")
				395	parent = self.parent
				396	if parent is None:
				397	raise ValueError(
				398	"Element has no parent, so 'after' has no meaning.")
				399	# Extract first so that the index won't be screwed up if they
				400	# are siblings.
				401	if isinstance(successor, PageElement):
				402	successor.extract()
				403	index = parent.index(self)
				404	parent.insert(index+1, successor)
				405
				406	def find_next(self, name=None, attrs={}, text=None, **kwargs):
				407	"""Returns the first item that matches the given criteria and
				408	appears after this Tag in the document."""
				409	return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
				410	findNext = find_next # BS3
				411
				412	def find_all_next(self, name=None, attrs={}, text=None, limit=None,
				413	**kwargs):
				414	"""Returns all items that match the given criteria and appear
				415	after this Tag in the document."""
				416	return self._find_all(name, attrs, text, limit, self.next_elements,
				417	**kwargs)
				418	findAllNext = find_all_next # BS3
				419
				420	def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
				421	"""Returns the closest sibling to this Tag that matches the
				422	given criteria and appears after this Tag in the document."""
				423	return self._find_one(self.find_next_siblings, name, attrs, text,
				424	**kwargs)
				425	findNextSibling = find_next_sibling # BS3
				426
				427	def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
				428	**kwargs):
				429	"""Returns the siblings of this Tag that match the given
				430	criteria and appear after this Tag in the document."""
				431	return self._find_all(name, attrs, text, limit,
				432	self.next_siblings, **kwargs)
				433	findNextSiblings = find_next_siblings # BS3
				434	fetchNextSiblings = find_next_siblings # BS2
				435
				436	def find_previous(self, name=None, attrs={}, text=None, **kwargs):
				437	"""Returns the first item that matches the given criteria and
				438	appears before this Tag in the document."""
				439	return self._find_one(
				440	self.find_all_previous, name, attrs, text, **kwargs)
				441	findPrevious = find_previous # BS3
				442
				443	def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
				444	**kwargs):
				445	"""Returns all items that match the given criteria and appear
				446	before this Tag in the document."""
				447	return self._find_all(name, attrs, text, limit, self.previous_elements,
				448	**kwargs)
				449	findAllPrevious = find_all_previous # BS3
				450	fetchPrevious = find_all_previous # BS2
				451
				452	def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
				453	"""Returns the closest sibling to this Tag that matches the
				454	given criteria and appears before this Tag in the document."""
				455	return self._find_one(self.find_previous_siblings, name, attrs, text,
				456	**kwargs)
				457	findPreviousSibling = find_previous_sibling # BS3
				458
				459	def find_previous_siblings(self, name=None, attrs={}, text=None,
				460	limit=None, **kwargs):
				461	"""Returns the siblings of this Tag that match the given
				462	criteria and appear before this Tag in the document."""
				463	return self._find_all(name, attrs, text, limit,
				464	self.previous_siblings, **kwargs)
				465	findPreviousSiblings = find_previous_siblings # BS3
				466	fetchPreviousSiblings = find_previous_siblings # BS2
				467
				468	def find_parent(self, name=None, attrs={}, **kwargs):
				469	"""Returns the closest parent of this Tag that matches the given
				470	criteria."""
				471	# NOTE: We can't use _find_one because findParents takes a different
				472	# set of arguments.
				473	r = None
				474	l = self.find_parents(name, attrs, 1, **kwargs)
				475	if l:
				476	r = l[0]
				477	return r
				478	findParent = find_parent # BS3
				479
				480	def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
				481	"""Returns the parents of this Tag that match the given
				482	criteria."""
				483
				484	return self._find_all(name, attrs, None, limit, self.parents,
				485	**kwargs)
				486	findParents = find_parents # BS3
				487	fetchParents = find_parents # BS2
				488
				489	@property
				490	def next(self):
				491	return self.next_element
				492
				493	@property
				494	def previous(self):
				495	return self.previous_element
				496
				497	#These methods do the real heavy lifting.
				498
				499	def _find_one(self, method, name, attrs, text, **kwargs):
				500	r = None
				501	l = method(name, attrs, text, 1, **kwargs)
				502	if l:
				503	r = l[0]
				504	return r
				505
				506	def _find_all(self, name, attrs, text, limit, generator, **kwargs):
				507	"Iterates over a generator looking for things that match."
				508
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	509	if text is None and 'string' in kwargs:
				510	text = kwargs['string']
				511	del kwargs['string']
				512
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	513	if isinstance(name, SoupStrainer):
				514	strainer = name
				515	else:
				516	strainer = SoupStrainer(name, attrs, text, **kwargs)
				517
				518	if text is None and not limit and not attrs and not kwargs:
				519	if name is True or name is None:
				520	# Optimization to find all tags.
				521	result = (element for element in generator
				522	if isinstance(element, Tag))
				523	return ResultSet(strainer, result)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	524	elif isinstance(name, str):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	525	# Optimization to find all tags with a given name.
				526	result = (element for element in generator
				527	if isinstance(element, Tag)
				528	and element.name == name)
				529	return ResultSet(strainer, result)
				530	results = ResultSet(strainer)
				531	while True:
				532	try:
				533	i = next(generator)
				534	except StopIteration:
				535	break
				536	if i:
				537	found = strainer.search(i)
				538	if found:
				539	results.append(found)
				540	if limit and len(results) >= limit:
				541	break
				542	return results
				543
				544	#These generators can be used to navigate starting from both
				545	#NavigableStrings and Tags.
				546	@property
				547	def next_elements(self):
				548	i = self.next_element
				549	while i is not None:
				550	yield i
				551	i = i.next_element
				552
				553	@property
				554	def next_siblings(self):
				555	i = self.next_sibling
				556	while i is not None:
				557	yield i
				558	i = i.next_sibling
				559
				560	@property
				561	def previous_elements(self):
				562	i = self.previous_element
				563	while i is not None:
				564	yield i
				565	i = i.previous_element
				566
				567	@property
				568	def previous_siblings(self):
				569	i = self.previous_sibling
				570	while i is not None:
				571	yield i
				572	i = i.previous_sibling
				573
				574	@property
				575	def parents(self):
				576	i = self.parent
				577	while i is not None:
				578	yield i
				579	i = i.parent
				580
				581	# Methods for supporting CSS selectors.
				582
# A bare tag name: one alphanumeric character followed by any number of
# alphanumerics, '-', '.', ':' or '_'.
tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')

# /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
#   \---------------------------/  \---/\-------------/    \-------/
#     |                              |         |               |
#     |                              |         |           The value
#     |                              |    ~,|,^,$,* or =
#     |                           Attribute
#    Tag
#
# An attribute selector such as a[href^="http"].  The tag name is
# optional, the operator is a single optional character, and the value
# may be wrapped in double quotes.
attribselect_re = re.compile(
    r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
    r'=?"?(?P<value>[^\]"]*)"?\]$'
    )
				596
				597	def _attr_value_as_string(self, value, default=None):
				598	"""Force an attribute value into a string representation.
				599
				600	A multi-valued attribute will be converted into a
				601	space-separated stirng.
				602	"""
				603	value = self.get(value, default)
				604	if isinstance(value, list) or isinstance(value, tuple):
				605	value =" ".join(value)
				606	return value
				607
				608	def _tag_name_matches_and(self, function, tag_name):
				609	if not tag_name:
				610	return function
				611	else:
				612	def _match(tag):
				613	return tag.name == tag_name and function(tag)
				614	return _match
				615
				616	def _attribute_checker(self, operator, attribute, value=''):
				617	"""Create a function that performs a CSS selector operation.
				618
				619	Takes an operator, attribute and optional value. Returns a
				620	function that will return True for elements that match that
				621	combination.
				622	"""
				623	if operator == '=':
				624	# string representation of `attribute` is equal to `value`
				625	return lambda el: el._attr_value_as_string(attribute) == value
				626	elif operator == '~':
				627	# space-separated list representation of `attribute`
				628	# contains `value`
				629	def _includes_value(element):
				630	attribute_value = element.get(attribute, [])
				631	if not isinstance(attribute_value, list):
				632	attribute_value = attribute_value.split()
				633	return value in attribute_value
				634	return _includes_value
				635	elif operator == '^':
				636	# string representation of `attribute` starts with `value`
				637	return lambda el: el._attr_value_as_string(
				638	attribute, '').startswith(value)
				639	elif operator == '$':
				640	# string represenation of `attribute` ends with `value`
				641	return lambda el: el._attr_value_as_string(
				642	attribute, '').endswith(value)
				643	elif operator == '*':
				644	# string representation of `attribute` contains `value`
				645	return lambda el: value in el._attr_value_as_string(attribute, '')
				646	elif operator == '\|':
				647	# string representation of `attribute` is either exactly
				648	# `value` or starts with `value` and then a dash.
				649	def _is_or_starts_with_dash(element):
				650	attribute_value = element._attr_value_as_string(attribute, '')
				651	return (attribute_value == value or attribute_value.startswith(
				652	value + '-'))
				653	return _is_or_starts_with_dash
				654	else:
				655	return lambda el: el.has_attr(attribute)
				656
				657	# Old non-property versions of the generators, for backwards
				658	# compatibility with BS3.
				659	def nextGenerator(self):
				660	return self.next_elements
				661
				662	def nextSiblingGenerator(self):
				663	return self.next_siblings
				664
				665	def previousGenerator(self):
				666	return self.previous_elements
				667
				668	def previousSiblingGenerator(self):
				669	return self.previous_siblings
				670
				671	def parentGenerator(self):
				672	return self.parents
				673
				674
class NavigableString(str, PageElement):
    """A string that is also a PageElement, so it can be linked into a
    tree alongside tags."""

    # Text placed around the string by output_ready(); subclasses
    # override these.
    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        A ``str`` value is used directly.  Any other value is handed to
        ``str`` together with DEFAULT_OUTPUT_ENCODING so that non-ASCII
        data can be decoded.  The new object's tree links are then
        initialised with setup().
        """
        if isinstance(value, str):
            instance = str.__new__(cls, value)
        else:
            instance = str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a new object of the same class with the same text.

        The copy is built through the constructor, so it is not
        connected to the parse tree.
        """
        return type(self)(self)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """Make ``text.string`` return the string itself.

        Every other missing attribute raises AttributeError.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted string wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None for a string."""
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
				726
				727	class PreformattedString(NavigableString):
				728	"""A NavigableString not subject to the normal formatting rules.
				729
				730	The string will be passed into the formatter (to trigger side effects),
				731	but the return value will be ignored.
				732	"""
				733
				734	def output_ready(self, formatter="minimal"):
				735	"""CData strings are passed into the formatter.
				736	But the return value is ignored."""
				737	self.format_string(self, formatter)
				738	return self.PREFIX + self + self.SUFFIX
				739
				740	class CData(PreformattedString):
				741
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	742	PREFIX = '<![CDATA['
				743	SUFFIX = ']]>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	744
				745	class ProcessingInstruction(PreformattedString):
				746
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	747	PREFIX = '<?'
				748	SUFFIX = '>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	749
				750	class Comment(PreformattedString):
				751
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	752	PREFIX = '<!--'
				753	SUFFIX = '-->'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	754
				755
				756	class Declaration(PreformattedString):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	757	PREFIX = '<?'
				758	SUFFIX = '?>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	759
				760
				761	class Doctype(PreformattedString):
				762
				763	@classmethod
				764	def for_name_and_ids(cls, name, pub_id, system_id):
				765	value = name or ''
				766	if pub_id is not None:
				767	value += ' PUBLIC "%s"' % pub_id
				768	if system_id is not None:
				769	value += ' "%s"' % system_id
				770	elif system_id is not None:
				771	value += ' SYSTEM "%s"' % system_id
				772
				773	return Doctype(value)
				774
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	775	PREFIX = '<!DOCTYPE '
				776	SUFFIX = '>\n'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	777
				778
				779	class Tag(PageElement):
				780
				781	"""Represents a found HTML tag with its attributes and contents."""
				782
				783	def __init__(self, parser=None, builder=None, name=None, namespace=None,
				784	prefix=None, attrs=None, parent=None, previous=None):
				785	"Basic constructor."
				786
				787	if parser is None:
				788	self.parser_class = None
				789	else:
				790	# We don't actually store the parser object: that lets extracted
				791	# chunks be garbage-collected.
				792	self.parser_class = parser.__class__
				793	if name is None:
				794	raise ValueError("No value provided for new tag's name.")
				795	self.name = name
				796	self.namespace = namespace
				797	self.prefix = prefix
				798	if attrs is None:
				799	attrs = {}
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	800	elif attrs:
				801	if builder is not None and builder.cdata_list_attributes:
				802	attrs = builder._replace_cdata_list_attribute_values(
				803	self.name, attrs)
				804	else:
				805	attrs = dict(attrs)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	806	else:
				807	attrs = dict(attrs)
				808	self.attrs = attrs
				809	self.contents = []
				810	self.setup(parent, previous)
				811	self.hidden = False
				812
				813	# Set up any substitutions, such as the charset in a META tag.
				814	if builder is not None:
				815	builder.set_up_substitutions(self)
				816	self.can_be_empty_element = builder.can_be_empty_element(name)
				817	else:
				818	self.can_be_empty_element = False
				819
    # Backwards-compatible alias: reads of ``parserClass`` return
    # ``parser_class``.
    parserClass = _alias("parser_class") # BS3
				821
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	822	def __copy__(self):
				823	"""A copy of a Tag is a new Tag, unconnected to the parse tree.
				824	Its contents are a copy of the old Tag's contents.
				825	"""
				826	clone = type(self)(None, self.builder, self.name, self.namespace,
				827	self.nsprefix, self.attrs)
				828	for attr in ('can_be_empty_element', 'hidden'):
				829	setattr(clone, attr, getattr(self, attr))
				830	for child in self.contents:
				831	clone.append(child.__copy__())
				832	return clone
				833
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	834	@property
				835	def is_empty_element(self):
				836	"""Is this tag an empty-element tag? (aka a self-closing tag)
				837
				838	A tag that has contents is never an empty-element tag.
				839
				840	A tag that has no contents may or may not be an empty-element
				841	tag. It depends on the builder used to create the tag. If the
				842	builder has a designated list of empty-element tags, then only
				843	a tag whose name shows up in that list is considered an
				844	empty-element tag.
				845
				846	If the builder has no designated list of empty-element tags,
				847	then any tag with no contents is an empty-element tag.
				848	"""
				849	return len(self.contents) == 0 and self.can_be_empty_element
				850	isSelfClosing = is_empty_element # BS3
				851
    @property
    def string(self):
        """Convenience property to get the single string within this tag.

        :Return: If this tag has a single string child, return value
         is that string. If this tag has no children, or more than one
         child, return value is None. If this tag has one child tag,
         return value is the 'string' attribute of the child tag,
         recursively.
        """
        if len(self.contents) != 1:
            return None
        only_child = self.contents[0]
        return (only_child if isinstance(only_child, NavigableString)
                else only_child.string)

    @string.setter
    def string(self, string):
        """Replace everything inside this tag with a single string."""
        self.clear()
        replacement = string.__class__(string)
        self.append(replacement)
				873
    def _all_strings(self, strip=False, types=(NavigableString, CData)):
        """Yield all strings of certain classes, possibly stripping them.

        By default, yields only NavigableString and CData objects. So
        no comments, processing instructions, etc.

        :param strip: If true, strip each string and skip those that
         end up empty.
        :param types: The exact classes to accept, or None to accept any
         NavigableString instance.
        """
        for node in self.descendants:
            if types is None:
                wanted = isinstance(node, NavigableString)
            else:
                # Exact type match: subclasses of the listed types
                # are not accepted.
                wanted = type(node) in types
            if not wanted:
                continue
            if strip:
                node = node.strip()
                if len(node) == 0:
                    continue
            yield node

    strings = property(_all_strings)
				893
    @property
    def stripped_strings(self):
        """Yield each string from ``_all_strings`` with stripping enabled."""
        for stripped in self._all_strings(True):
            yield stripped
				898
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	899	def get_text(self, separator="", strip=False,
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	900	types=(NavigableString, CData)):
				901	"""
				902	Get all child strings, concatenated using the given separator.
				903	"""
				904	return separator.join([s for s in self._all_strings(
				905	strip, types=types)])
				906	getText = get_text
				907	text = property(get_text)
				908
    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        node = self
        while node is not None:
            # Read the successor first: clearing __dict__ erases the link.
            following = node.next_element
            node.__dict__.clear()
            node.contents = []
            node = following
				918
    def clear(self, decompose=False):
        """
        Extract all children. If decompose is True, decompose instead.

        Only Tag children are decomposed; other children are always
        extracted.
        """
        # Iterate over a copy: extracting a child mutates self.contents.
        for element in self.contents[:]:
            if decompose and isinstance(element, Tag):
                element.decompose()
            else:
                element.extract()
				932
    def index(self, element):
        """
        Find the index of a child by identity, not value. Avoids issues with
        tag.contents.index(element) getting the index of equal elements.

        :raise ValueError: If the element is not a direct child.
        """
        position = next(
            (i for i, child in enumerate(self.contents) if child is element),
            None)
        if position is None:
            raise ValueError("Tag.index: element not in tag")
        return position
				942
    def get(self, key, default=None):
        """Look up one attribute of this tag without raising.

        Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute.
        """
        attributes = self.attrs
        return attributes.get(key, default)
				948
    def has_attr(self, key):
        """Return whether this tag has an attribute named ``key``."""
        attributes = self.attrs
        return key in attributes
				951
				952	def __hash__(self):
				953	return str(self).__hash__()
				954
				955	def __getitem__(self, key):
				956	"""tag[key] returns the value of the 'key' attribute for the tag,
				957	and throws an exception if it's not there."""
				958	return self.attrs[key]
				959
				960	def __iter__(self):
				961	"Iterating over a tag iterates over its contents."
				962	return iter(self.contents)
				963
				964	def __len__(self):
				965	"The length of a tag is the length of its list of contents."
				966	return len(self.contents)
				967
				968	def __contains__(self, x):
				969	return x in self.contents
				970
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	971	def __bool__(self):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	972	"A tag is non-None even if it has no contents."
				973	return True
				974
				975	def __setitem__(self, key, value):
				976	"""Setting tag[key] sets the value of the 'key' attribute for the
				977	tag."""
				978	self.attrs[key] = value
				979
				980	def __delitem__(self, key):
				981	"Deleting tag[key] deletes all 'key' attributes for the tag."
				982	self.attrs.pop(key, None)
				983
				984	def __call__(self, args, *kwargs):
				985	"""Calling a tag like a function is the same as calling its
				986	find_all() method. Eg. tag('a') returns a list of all the A tags
				987	found within this tag."""
				988	return self.find_all(args, *kwargs)
				989
				990	def __getattr__(self, tag):
				991	#print "Getattr %s.%s" % (self.__class__, tag)
				992	if len(tag) > 3 and tag.endswith('Tag'):
				993	# BS3: soup.aTag -> "soup.find("a")
				994	tag_name = tag[:-3]
				995	warnings.warn(
				996	'.%sTag is deprecated, use .find("%s") instead.' % (
				997	tag_name, tag_name))
				998	return self.find(tag_name)
				999	# We special case contents to avoid recursion.
				1000	elif not tag.startswith("__") and not tag=="contents":
				1001	return self.find(tag)
				1002	raise AttributeError(
				1003	"'%s' object has no attribute '%s'" % (self.__class__, tag))
				1004
				1005	def __eq__(self, other):
				1006	"""Returns true iff this tag has the same name, the same attributes,
				1007	and the same contents (recursively) as the given tag."""
				1008	if self is other:
				1009	return True
				1010	if (not hasattr(other, 'name') or
				1011	not hasattr(other, 'attrs') or
				1012	not hasattr(other, 'contents') or
				1013	self.name != other.name or
				1014	self.attrs != other.attrs or
				1015	len(self) != len(other)):
				1016	return False
				1017	for i, my_child in enumerate(self.contents):
				1018	if my_child != other.contents[i]:
				1019	return False
				1020	return True
				1021
				1022	def __ne__(self, other):
				1023	"""Returns true iff this tag is not identical to the other tag,
				1024	as defined in __eq__."""
				1025	return not self == other
				1026
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1027	def __repr__(self, encoding="unicode-escape"):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1028	"""Renders this tag as a string."""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1029	if PY3K:
				1030	# "The return value must be a string object", i.e. Unicode
				1031	return self.decode()
				1032	else:
				1033	# "The return value must be a string object", i.e. a bytestring.
				1034	# By convention, the return value of __repr__ should also be
				1035	# an ASCII string.
				1036	return self.encode(encoding)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1037
				1038	def __unicode__(self):
				1039	return self.decode()
				1040
    def __str__(self):
        """Render this tag: Unicode when PY3K is true, encoded otherwise."""
        if not PY3K:
            return self.encode()
        return self.decode()

    # When PY3K is true, all three conversions share the Unicode rendering.
    if PY3K:
        __str__ = __repr__ = __unicode__
				1049
    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               indent_level=None, formatter="minimal",
               errors="xmlcharrefreplace"):
        """Render this tag and its contents as an encoded string.

        :param encoding: Target encoding; also passed to ``decode`` as
         the eventual encoding.
        :param indent_level: Passed through to ``decode``.
        :param formatter: Passed through to ``decode``.
        :param errors: Error-handling scheme for the final encode step.
        """
        # Turn the data structure into Unicode, then encode the
        # Unicode.
        unicode_markup = self.decode(indent_level, encoding, formatter)
        return unicode_markup.encode(encoding, errors)
				1057
    def _should_pretty_print(self, indent_level):
        """Should this tag be pretty-printed?

        Never when no indent level was requested. Otherwise yes, unless
        the tag's name is one of the preformatted tags and the tag is
        not XML.
        """
        if indent_level is None:
            return False
        return (
            self.name not in HTMLAwareEntitySubstitution.preformatted_tags
            or self._is_xml)
				1064
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a Unicode representation of this tag and its contents.

        :param indent_level: None for compact output; otherwise the
         indentation level used when pretty-printing.

        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.

        :param formatter: Either a callable, or a name that is resolved
         with ``_formatter_for_name``.
        """

        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
        # (The builtin callable() is used because the collections.Callable
        # alias no longer exists on Python 3.10 and later.)
        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)

        attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    # A valueless attribute is rendered as the bare key.
                    decoded = key
                else:
                    if isinstance(val, (list, tuple)):
                        val = ' '.join(val)
                    elif not isinstance(val, str):
                        val = str(val)
                    elif (
                        isinstance(val, AttributeValueWithCharsetSubstitution)
                        and eventual_encoding is not None):
                        val = val.encode(eventual_encoding)

                    text = self.format_string(val, formatter)
                    decoded = (
                        str(key) + '='
                        + EntitySubstitution.quoted_attribute_value(text))
                attrs.append(decoded)
        close = ''
        closeTag = ''

        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

        if self.is_empty_element:
            close = '/'
        else:
            closeTag = '</%s%s>' % (prefix, self.name)

        pretty_print = self._should_pretty_print(indent_level)
        space = ''
        indent_space = ''
        if indent_level is not None:
            indent_space = (' ' * (indent_level - 1))
        if pretty_print:
            space = indent_space
            indent_contents = indent_level + 1
        else:
            indent_contents = None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, formatter)

        if self.hidden:
            # This is the 'document root' object.
            s = contents
        else:
            s = []
            attribute_string = ''
            if attrs:
                attribute_string = ' ' + ' '.join(attrs)
            if indent_level is not None:
                # Even if this particular tag is not pretty-printed,
                # we should indent up to the start of the tag.
                s.append(indent_space)
            s.append('<%s%s%s%s>' % (
                prefix, self.name, attribute_string, close))
            if pretty_print:
                s.append("\n")
            s.append(contents)
            if pretty_print and contents and contents[-1] != "\n":
                s.append("\n")
            if pretty_print and closeTag:
                s.append(space)
            s.append(closeTag)
            if indent_level is not None and closeTag and self.next_sibling:
                # Even if this particular tag is not pretty-printed,
                # we're now done with the tag, and we should add a
                # newline if appropriate.
                s.append("\n")
            s = ''.join(s)
        return s
				1157
    def prettify(self, encoding=None, formatter="minimal"):
        """Render this tag with pretty-printing switched on.

        :param encoding: If None, the result comes from ``decode``;
         otherwise from ``encode`` with this encoding.
        :param formatter: Passed through to ``decode`` / ``encode``.
        """
        if encoding is not None:
            return self.encode(encoding, True, formatter=formatter)
        return self.decode(True, formatter=formatter)
				1163
    def decode_contents(self, indent_level=None,
                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                        formatter="minimal"):
        """Renders the contents of this tag as a Unicode string.

        :param indent_level: Each line of the rendering will be
           indented this many spaces.

        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.

        :param formatter: The output formatter responsible for converting
           entities to Unicode characters.
        """
        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
        # (The builtin callable() is used because the collections.Callable
        # alias no longer exists on Python 3.10 and later.)
        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)

        pretty_print = (indent_level is not None)
        # Whitespace inside a <pre> tag is left exactly as it is.
        is_pre = (self.name == 'pre')
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.output_ready(formatter)
            elif isinstance(c, Tag):
                s.append(c.decode(indent_level, eventual_encoding,
                                  formatter))
            if text and indent_level and not is_pre:
                text = text.strip()
            if text:
                if pretty_print and not is_pre:
                    s.append(" " * (indent_level - 1))
                s.append(text)
                if pretty_print and not is_pre:
                    s.append("\n")
        return ''.join(s)
				1205
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
        """Renders the contents of this tag as a bytestring.

        :param indent_level: Each line of the rendering will be
           indented this many spaces.

        :param encoding: The bytestring will be in this encoding.

        :param formatter: The output formatter responsible for converting
           entities to Unicode characters.
        """
        unicode_contents = self.decode_contents(
            indent_level, encoding, formatter)
        return unicode_contents.encode(encoding)
				1222
				1223	# Old method for BS3 compatibility
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Old BS3 spelling of ``encode_contents``.

        :param encoding: Passed through as ``encoding``.
        :param prettyPrint: If false, ``indentLevel`` is ignored and no
         indentation is requested.
        :param indentLevel: Indent level used when ``prettyPrint`` is true.
        """
        effective_indent = indentLevel if prettyPrint else None
        return self.encode_contents(
            indent_level=effective_indent, encoding=encoding)
				1230
				1231	#Soup methods
				1232
    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria.

        The arguments are forwarded to ``find_all`` with a limit of 1.
        Returns None when nothing matches.
        """
        matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
        return matches[0] if matches else None
    findChild = find
				1243
    def find_all(self, name=None, attrs={}, recursive=True, text=None,
                 limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        :param recursive: If true, search all descendants; otherwise
         only the direct children.
        """
        candidates = self.descendants if recursive else self.children
        return self._find_all(name, attrs, text, limit, candidates, **kwargs)
    findAll = find_all       # BS3
    findChildren = find_all  # BS2
				1262
				1263	#Generator methods
    @property
    def children(self):
        """An iterator over this tag's direct children."""
        # return iter() to make the purpose of the method clear
        direct_children = self.contents
        return iter(direct_children)  # XXX This seems to be untested.
				1268
    @property
    def descendants(self):
        """Yield every element below this tag, following next_element.

        The walk starts at the first child and ends just before the
        element that follows this tag's last descendant.
        """
        if not self.contents:
            return
        stop_node = self._last_descendant().next_element
        node = self.contents[0]
        while node is not stop_node:
            yield node
            node = node.next_element
				1278
				1279	# CSS selector code
				1280
    # Tokens that select() treats as combinators between two selectors.
    _selector_combinators = ['>', '+', '~']
    # When true, select() prints a trace of its progress.
    _select_debug = False
    def select_one(self, selector):
        """Perform a CSS selection operation on the current element.

        Returns the first match from ``select``, or None if there is none.
        """
        matches = self.select(selector, limit=1)
        return matches[0] if matches else None
				1289
    def select(self, selector, _candidate_generator=None, limit=None):
        """Perform a CSS selection operation on the current element.

        :param selector: A CSS selector string. Comma-separated groups
         are evaluated one at a time and merged without duplicates.
        :param _candidate_generator: Internal. A callable taking a tag
         and yielding candidates, used in place of the default walk over
         the tag's descendants. The recursive calls made for the '>',
         '~' and '+' combinators pass one in.
        :param limit: Maximum number of matches returned for a single
         (non-grouped) selector. A false value means no limit.
        :return: A list of the matching Tag objects.
        :raise ValueError: For an empty group, a trailing combinator, a
         pseudo-class without a tag name, an nth-of-type value below 1,
         or an unrecognized token.
        :raise NotImplementedError: For a pseudo-class other than
         nth-of-type, or a non-numeric nth-of-type value.
        """

        # Handle grouping selectors if ',' exists, ie: p,a
        if ',' in selector:
            context = []
            for partial_selector in selector.split(','):
                partial_selector = partial_selector.strip()
                if partial_selector == '':
                    raise ValueError('Invalid group selection syntax: %s' % selector)
                candidates = self.select(partial_selector, limit=limit)
                for candidate in candidates:
                    if candidate not in context:
                        context.append(candidate)

                if limit and len(context) >= limit:
                    break
            return context

        tokens = selector.split()
        current_context = [self]

        if tokens[-1] in self._selector_combinators:
            raise ValueError(
                'Final combinator "%s" is missing an argument.' % tokens[-1])

        if self._select_debug:
            print('Running CSS selector "%s"' % selector)

        for index, token in enumerate(tokens):
            new_context = []
            new_context_ids = set()

            # For index 0 this looks at the last token, which was just
            # checked not to be a combinator.
            if tokens[index-1] in self._selector_combinators:
                # This token was consumed by the previous combinator. Skip it.
                if self._select_debug:
                    print(' Token was consumed by the previous combinator.')
                continue

            if self._select_debug:
                print(' Considering token "%s"' % token)
            recursive_candidate_generator = None
            tag_name = None

            # Each operation corresponds to a checker function, a rule
            # for determining whether a candidate matches the
            # selector. Candidates are generated by the active
            # iterator.
            checker = None

            m = self.attribselect_re.match(token)
            if m is not None:
                # Attribute selector
                tag_name, attribute, operator, value = m.groups()
                checker = self._attribute_checker(operator, attribute, value)

            elif '#' in token:
                # ID selector
                tag_name, tag_id = token.split('#', 1)
                def id_matches(tag):
                    return tag.get('id', None) == tag_id
                checker = id_matches

            elif '.' in token:
                # Class selector
                tag_name, klass = token.split('.', 1)
                classes = set(klass.split('.'))
                def classes_match(candidate):
                    return classes.issubset(candidate.get('class', []))
                checker = classes_match

            elif ':' in token:
                # Pseudo-class
                tag_name, pseudo = token.split(':', 1)
                if tag_name == '':
                    raise ValueError(
                        "A pseudo-class must be prefixed with a tag name.")
                # Split "name(value)" into its two parts. The parentheses
                # are literal and must be escaped.
                pseudo_attributes = re.match(
                    r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                if pseudo_attributes is None:
                    pseudo_type = pseudo
                    pseudo_value = None
                else:
                    pseudo_type, pseudo_value = pseudo_attributes.groups()
                if pseudo_type == 'nth-of-type':
                    try:
                        pseudo_value = int(pseudo_value)
                    except (TypeError, ValueError):
                        # TypeError: no "(value)" part was given.
                        # ValueError: the value is not a number.
                        raise NotImplementedError(
                            'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                    if pseudo_value < 1:
                        raise ValueError(
                            'nth-of-type pseudo-class value must be at least 1.')
                    class Counter(object):
                        def __init__(self, destination):
                            self.count = 0
                            self.destination = destination

                        def nth_child_of_type(self, tag):
                            self.count += 1
                            if self.count == self.destination:
                                return True
                            if self.count > self.destination:
                                # Stop the generator that's sending us
                                # these things.
                                raise StopIteration()
                            return False
                    checker = Counter(pseudo_value).nth_child_of_type
                else:
                    raise NotImplementedError(
                        'Only the following pseudo-classes are implemented: nth-of-type.')

            elif token == '*':
                # Star selector -- matches everything
                pass
            elif token == '>':
                # Run the next token as a CSS selector against the
                # direct children of each tag in the current context.
                recursive_candidate_generator = lambda tag: tag.children
            elif token == '~':
                # Run the next token as a CSS selector against the
                # siblings of each tag in the current context.
                recursive_candidate_generator = lambda tag: tag.next_siblings
            elif token == '+':
                # For each tag in the current context, run the next
                # token as a CSS selector against the tag's next
                # sibling that's a tag.
                def next_tag_sibling(tag):
                    yield tag.find_next_sibling(True)
                recursive_candidate_generator = next_tag_sibling

            elif self.tag_name_re.match(token):
                # Just a tag name.
                tag_name = token
            else:
                raise ValueError(
                    'Unsupported or invalid CSS selector: "%s"' % token)
            if recursive_candidate_generator:
                # This happens when the selector looks like  "> foo".
                #
                # The generator calls select() recursively on every
                # member of the current context, passing in a different
                # candidate generator and a different selector.
                #
                # In the case of "> foo", the candidate generator is
                # one that yields a tag's direct children (">"), and
                # the selector is "foo".
                next_token = tokens[index+1]
                def recursive_select(tag):
                    if self._select_debug:
                        print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
                        print('-' * 40)
                    for i in tag.select(next_token, recursive_candidate_generator):
                        if self._select_debug:
                            print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                        yield i
                    if self._select_debug:
                        print('-' * 40)
                _use_candidate_generator = recursive_select
            elif _candidate_generator is None:
                # By default, a tag's candidates are all of its
                # children. If tag_name is defined, only yield tags
                # with that name.
                if self._select_debug:
                    if tag_name:
                        check = tag_name
                    else:
                        check = "[any]"
                    print(' Default candidate generator, tag name="%s"' % check)
                if self._select_debug:
                    # This is redundant with later code, but it stops
                    # a bunch of bogus tags from cluttering up the
                    # debug log.
                    def default_candidate_generator(tag):
                        for child in tag.descendants:
                            if not isinstance(child, Tag):
                                continue
                            if tag_name and not child.name == tag_name:
                                continue
                            yield child
                    _use_candidate_generator = default_candidate_generator
                else:
                    _use_candidate_generator = lambda tag: tag.descendants
            else:
                _use_candidate_generator = _candidate_generator

            for tag in current_context:
                if self._select_debug:
                    print(" Running candidate generator on %s %s" % (
                        tag.name, repr(tag.attrs)))
                for candidate in _use_candidate_generator(tag):
                    if not isinstance(candidate, Tag):
                        continue
                    if tag_name and candidate.name != tag_name:
                        continue
                    if checker is not None:
                        try:
                            result = checker(candidate)
                        except StopIteration:
                            # The checker has decided we should no longer
                            # run the generator.
                            break
                    if checker is None or result:
                        if self._select_debug:
                            print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                        if id(candidate) not in new_context_ids:
                            # If a tag matches a selector more than once,
                            # don't include it in the context more than once.
                            new_context.append(candidate)
                            new_context_ids.add(id(candidate))
                    elif self._select_debug:
                        print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

            current_context = new_context

        # Apply the limit only to the finished result. Truncating an
        # intermediate context would throw away ancestors whose
        # descendants match later tokens, losing valid results.
        if limit and len(current_context) >= limit:
            current_context = current_context[:limit]

        if self._select_debug:
            print("Final verdict:")
            for i in current_context:
                print(" %s %s" % (i.name, i.attrs))
        return current_context
				1514
				1515	# Old names for backwards compatibility
    def childGenerator(self):
        """Legacy camelCase spelling kept for backwards compatibility.

        Returns exactly what the ``children`` attribute of this object
        provides.
        """
        children = self.children
        return children
				1518
    def recursiveChildGenerator(self):
        """Legacy camelCase spelling kept for backwards compatibility.

        Returns exactly what the ``descendants`` attribute of this object
        provides.
        """
        descendants = self.descendants
        return descendants
				1521
    def has_key(self, key):
        """Deprecated spelling of ``has_attr``.

        The old name was confusing because it tested attributes while the
        ``in`` operator tests contents. A warning pointing the caller at
        ``has_attr`` is issued, then the result of ``self.has_attr(key)``
        is returned unchanged.
        """
        message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
            key)
        warnings.warn(message)
        return self.has_attr(key)
				1529
				1530	# Next, a couple classes to represent queries and their results.
				1531	class SoupStrainer(object):
				1532	"""Encapsulates a number of ways of matching a markup element (tag or
				1533	text)."""
				1534
				1535	def __init__(self, name=None, attrs={}, text=None, **kwargs):
				1536	self.name = self._normalize_search_value(name)
				1537	if not isinstance(attrs, dict):
				1538	# Treat a non-dict value for attrs as a search for the 'class'
				1539	# attribute.
				1540	kwargs['class'] = attrs
				1541	attrs = None
				1542
				1543	if 'class_' in kwargs:
				1544	# Treat class_="foo" as a search for the 'class'
				1545	# attribute, overriding any non-dict value for attrs.
				1546	kwargs['class'] = kwargs['class_']
				1547	del kwargs['class_']
				1548
				1549	if kwargs:
				1550	if attrs:
				1551	attrs = attrs.copy()
				1552	attrs.update(kwargs)
				1553	else:
				1554	attrs = kwargs
				1555	normalized_attrs = {}
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1556	for key, value in list(attrs.items()):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1557	normalized_attrs[key] = self._normalize_search_value(value)
				1558
				1559	self.attrs = normalized_attrs
				1560	self.text = self._normalize_search_value(text)
				1561
				1562	def _normalize_search_value(self, value):
				1563	# Leave it alone if it's a Unicode string, a callable, a
				1564	# regular expression, a boolean, or None.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1565	if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match')
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1566	or isinstance(value, bool) or value is None):
				1567	return value
				1568
				1569	# If it's a bytestring, convert it to Unicode, treating it as UTF-8.
				1570	if isinstance(value, bytes):
				1571	return value.decode("utf8")
				1572
				1573	# If it's listlike, convert it into a list of strings.
				1574	if hasattr(value, '__iter__'):
				1575	new_value = []
				1576	for v in value:
				1577	if (hasattr(v, '__iter__') and not isinstance(v, bytes)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1578	and not isinstance(v, str)):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1579	# This is almost certainly the user's mistake. In the
				1580	# interests of avoiding infinite loops, we'll let
				1581	# it through as-is rather than doing a recursive call.
				1582	new_value.append(v)
				1583	else:
				1584	new_value.append(self._normalize_search_value(v))
				1585	return new_value
				1586
				1587	# Otherwise, convert it into a Unicode string.
				1588	# The unicode(str()) thing is so this will do the same thing on Python 2
				1589	# and Python 3.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1590	return str(str(value))
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1591
				1592	def __str__(self):
				1593	if self.text:
				1594	return self.text
				1595	else:
				1596	return "%s\|%s" % (self.name, self.attrs)
				1597
				1598	def search_tag(self, markup_name=None, markup_attrs={}):
				1599	found = None
				1600	markup = None
				1601	if isinstance(markup_name, Tag):
				1602	markup = markup_name
				1603	markup_attrs = markup
				1604	call_function_with_tag_data = (
				1605	isinstance(self.name, collections.Callable)
				1606	and not isinstance(markup_name, Tag))
				1607
				1608	if ((not self.name)
				1609	or call_function_with_tag_data
				1610	or (markup and self._matches(markup, self.name))
				1611	or (not markup and self._matches(markup_name, self.name))):
				1612	if call_function_with_tag_data:
				1613	match = self.name(markup_name, markup_attrs)
				1614	else:
				1615	match = True
				1616	markup_attr_map = None
				1617	for attr, match_against in list(self.attrs.items()):
				1618	if not markup_attr_map:
				1619	if hasattr(markup_attrs, 'get'):
				1620	markup_attr_map = markup_attrs
				1621	else:
				1622	markup_attr_map = {}
				1623	for k, v in markup_attrs:
				1624	markup_attr_map[k] = v
				1625	attr_value = markup_attr_map.get(attr)
				1626	if not self._matches(attr_value, match_against):
				1627	match = False
				1628	break
				1629	if match:
				1630	if markup:
				1631	found = markup
				1632	else:
				1633	found = markup_name
				1634	if found and self.text and not self._matches(found.string, self.text):
				1635	found = None
				1636	return found
				1637	searchTag = search_tag
				1638
				1639	def search(self, markup):
				1640	# print 'looking for %s in %s' % (self, markup)
				1641	found = None
				1642	# If given a list of items, scan it for a text element that
				1643	# matches.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1644	if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1645	for element in markup:
				1646	if isinstance(element, NavigableString) \
				1647	and self.search(element):
				1648	found = element
				1649	break
				1650	# If it's a Tag, make sure its name or attributes match.
				1651	# Don't bother with Tags if we're searching for text.
				1652	elif isinstance(markup, Tag):
				1653	if not self.text or self.name or self.attrs:
				1654	found = self.search_tag(markup)
				1655	# If it's text, make sure the text matches.
				1656	elif isinstance(markup, NavigableString) or \
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1657	isinstance(markup, str):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1658	if not self.name and not self.attrs and self._matches(markup, self.text):
				1659	found = markup
				1660	else:
				1661	raise Exception(
				1662	"I don't know how to match against a %s" % markup.__class__)
				1663	return found
				1664
				1665	def _matches(self, markup, match_against):
				1666	# print u"Matching %s against %s" % (markup, match_against)
				1667	result = False
				1668	if isinstance(markup, list) or isinstance(markup, tuple):
				1669	# This should only happen when searching a multi-valued attribute
				1670	# like 'class'.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1671	if (isinstance(match_against, str)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1672	and ' ' in match_against):
				1673	# A bit of a special case. If they try to match "foo
				1674	# bar" on a multivalue attribute's value, only accept
				1675	# the literal value "foo bar"
				1676	#
				1677	# XXX This is going to be pretty slow because we keep
				1678	# splitting match_against. But it shouldn't come up
				1679	# too often.
				1680	return (whitespace_re.split(match_against) == markup)
				1681	else:
				1682	for item in markup:
				1683	if self._matches(item, match_against):
				1684	return True
				1685	return False
				1686
				1687	if match_against is True:
				1688	# True matches any non-None value.
				1689	return markup is not None
				1690
				1691	if isinstance(match_against, collections.Callable):
				1692	return match_against(markup)
				1693
				1694	# Custom callables take the tag as an argument, but all
				1695	# other ways of matching match the tag name as a string.
				1696	if isinstance(markup, Tag):
				1697	markup = markup.name
				1698
				1699	# Ensure that `markup` is either a Unicode string, or None.
				1700	markup = self._normalize_search_value(markup)
				1701
				1702	if markup is None:
				1703	# None matches None, False, an empty string, an empty list, and so on.
				1704	return not match_against
				1705
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1706	if isinstance(match_against, str):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1707	# Exact string match
				1708	return markup == match_against
				1709
				1710	if hasattr(match_against, 'match'):
				1711	# Regexp match
				1712	return match_against.search(markup)
				1713
				1714	if hasattr(match_against, '__iter__'):
				1715	# The markup must be an exact match against something
				1716	# in the iterable.
				1717	return markup in match_against
				1718
				1719
				1720	class ResultSet(list):
				1721	"""A ResultSet is just a list that keeps track of the SoupStrainer
				1722	that created it."""
				1723	def __init__(self, source, result=()):
				1724	super(ResultSet, self).__init__(result)
				1725	self.source = source