Blame - poky/bitbake/lib/bs4/element.py - mdmillerii/openbmc

blob: 68be42d138b68fadc65f23bcab8c90edd465c44d [file] [log] [blame]

Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1	__license__ = "MIT"
				2
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	3	import collections.abc
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	4	import re
				5	import sys
				6	import warnings
				7	from bs4.dammit import EntitySubstitution
				8
				9	DEFAULT_OUTPUT_ENCODING = "utf-8"
				10	PY3K = (sys.version_info[0] > 2)
				11
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	12	whitespace_re = re.compile(r"\s+")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	13
				14	def _alias(attr):
				15	"""Alias one attribute name to another for backward compatibility"""
				16	@property
				17	def alias(self):
				18	return getattr(self, attr)
				19
				20	@alias.setter
				21	def alias(self):
				22	return setattr(self, attr)
				23	return alias
				24
				25
class NamespacedAttribute(str):
    """A string that also remembers the namespace parts it was built from.

    The string value is "prefix:name" when both parts are given, or
    whichever single part was supplied when the other is None. The
    `prefix`, `name` and `namespace` arguments are stored unchanged as
    attributes on the resulting object.
    """

    def __new__(cls, prefix, name, namespace=None):
        # Decide on the text first, then build the string object once.
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name

        attribute = str.__new__(cls, text)
        attribute.prefix = prefix
        attribute.name = name
        attribute.namespace = namespace
        return attribute
				40
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML."""


class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the name of the target encoding.

        Note that this deliberately shadows `str.encode`: the whole
        attribute value *is* the charset name, so "encoding" it means
        replacing it with `encoding`.
        """
        return encoding


class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # group(1): the "charset=" introducer, including a leading ';' and
    #           any whitespace, or anchored at the start of a line.
    # group(3): the charset name itself (everything up to the next ';').
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary: hand back a plain str, not an
            # instance of this class.
            return str.__new__(str, original_value)

        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset replaced.

        :param encoding: Name of the encoding to write into the value.
        :return: A plain string such as "text/html; charset=<encoding>".
        """
        def rewrite(match):
            # Keep the introducer, drop the old charset name.
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
				85
				86	class HTMLAwareEntitySubstitution(EntitySubstitution):
				87
				88	"""Entity substitution rules that are aware of some HTML quirks.
				89
				90	Specifically, the contents of <script> and <style> tags should not
				91	undergo entity substitution.
				92
				93	Incoming NavigableString objects are checked to see if they're the
				94	direct children of a <script> or <style> tag.
				95	"""
				96
				97	cdata_containing_tags = set(["script", "style"])
				98
				99	preformatted_tags = set(["pre"])
				100
				101	@classmethod
				102	def _substitute_if_appropriate(cls, ns, f):
				103	if (isinstance(ns, NavigableString)
				104	and ns.parent is not None
				105	and ns.parent.name in cls.cdata_containing_tags):
				106	# Do nothing.
				107	return ns
				108	# Substitute.
				109	return f(ns)
				110
				111	@classmethod
				112	def substitute_html(cls, ns):
				113	return cls._substitute_if_appropriate(
				114	ns, EntitySubstitution.substitute_html)
				115
				116	@classmethod
				117	def substitute_xml(cls, ns):
				118	return cls._substitute_if_appropriate(
				119	ns, EntitySubstitution.substitute_xml)
				120
				121	class PageElement(object):
				122	"""Contains the navigational information for some part of the page
				123	(either a tag or a piece of text)"""
				124
				125	# There are five possible values for the "formatter" argument passed in
				126	# to methods like encode() and prettify():
				127	#
				128	# "html" - All Unicode characters with corresponding HTML entities
				129	# are converted to those entities on output.
				130	# "minimal" - Bare ampersands and angle brackets are converted to
				131	# XML entities: & < >
				132	# None - The null formatter. Unicode characters are never
				133	# converted to entities. This is not recommended, but it's
				134	# faster than "minimal".
				135	# A function - This function will be called on every string that
				136	# needs to undergo entity substitution.
				137	#
				138
				139	# In an HTML document, the default "html" and "minimal" functions
				140	# will leave the contents of <script> and <style> tags alone. For
				141	# an XML document, all tags will be given the same treatment.
				142
				143	HTML_FORMATTERS = {
				144	"html" : HTMLAwareEntitySubstitution.substitute_html,
				145	"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
				146	None : None
				147	}
				148
				149	XML_FORMATTERS = {
				150	"html" : EntitySubstitution.substitute_html,
				151	"minimal" : EntitySubstitution.substitute_xml,
				152	None : None
				153	}
				154
				155	def format_string(self, s, formatter='minimal'):
				156	"""Format the given string using the given formatter."""
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	157	if not isinstance(formatter, collections.abc.Callable):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	158	formatter = self._formatter_for_name(formatter)
				159	if formatter is None:
				160	output = s
				161	else:
				162	output = formatter(s)
				163	return output
				164
				165	@property
				166	def _is_xml(self):
				167	"""Is this element part of an XML tree or an HTML tree?
				168
				169	This is used when mapping a formatter name ("minimal") to an
				170	appropriate function (one that performs entity-substitution on
				171	the contents of <script> and <style> tags, or not). It's
				172	inefficient, but it should be called very rarely.
				173	"""
				174	if self.parent is None:
				175	# This is the top-level object. It should have .is_xml set
				176	# from tree creation. If not, take a guess--BS is usually
				177	# used on HTML markup.
				178	return getattr(self, 'is_xml', False)
				179	return self.parent._is_xml
				180
				181	def _formatter_for_name(self, name):
				182	"Look up a formatter function based on its name and the tree."
				183	if self._is_xml:
				184	return self.XML_FORMATTERS.get(
				185	name, EntitySubstitution.substitute_xml)
				186	else:
				187	return self.HTML_FORMATTERS.get(
				188	name, HTMLAwareEntitySubstitution.substitute_xml)
				189
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	190	def setup(self, parent=None, previous_element=None, next_element=None,
				191	previous_sibling=None, next_sibling=None):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	192	"""Sets up the initial relations between this element and
				193	other elements."""
				194	self.parent = parent
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	195
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	196	self.previous_element = previous_element
				197	if previous_element is not None:
				198	self.previous_element.next_element = self
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	199
				200	self.next_element = next_element
				201	if self.next_element:
				202	self.next_element.previous_element = self
				203
				204	self.next_sibling = next_sibling
				205	if self.next_sibling:
				206	self.next_sibling.previous_sibling = self
				207
				208	if (not previous_sibling
				209	and self.parent is not None and self.parent.contents):
				210	previous_sibling = self.parent.contents[-1]
				211
				212	self.previous_sibling = previous_sibling
				213	if previous_sibling:
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	214	self.previous_sibling.next_sibling = self
				215
				216	nextSibling = _alias("next_sibling") # BS3
				217	previousSibling = _alias("previous_sibling") # BS3
				218
				219	def replace_with(self, replace_with):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	220	if not self.parent:
				221	raise ValueError(
				222	"Cannot replace one element with another when the"
				223	"element to be replaced is not part of a tree.")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	224	if replace_with is self:
				225	return
				226	if replace_with is self.parent:
				227	raise ValueError("Cannot replace a Tag with its parent.")
				228	old_parent = self.parent
				229	my_index = self.parent.index(self)
				230	self.extract()
				231	old_parent.insert(my_index, replace_with)
				232	return self
				233	replaceWith = replace_with # BS3
				234
				235	def unwrap(self):
				236	my_parent = self.parent
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	237	if not self.parent:
				238	raise ValueError(
				239	"Cannot replace an element with its contents when that"
				240	"element is not part of a tree.")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	241	my_index = self.parent.index(self)
				242	self.extract()
				243	for child in reversed(self.contents[:]):
				244	my_parent.insert(my_index, child)
				245	return self
				246	replace_with_children = unwrap
				247	replaceWithChildren = unwrap # BS3
				248
				249	def wrap(self, wrap_inside):
				250	me = self.replace_with(wrap_inside)
				251	wrap_inside.append(me)
				252	return wrap_inside
				253
				254	def extract(self):
				255	"""Destructively rips this element out of the tree."""
				256	if self.parent is not None:
				257	del self.parent.contents[self.parent.index(self)]
				258
				259	#Find the two elements that would be next to each other if
				260	#this element (and any children) hadn't been parsed. Connect
				261	#the two.
				262	last_child = self._last_descendant()
				263	next_element = last_child.next_element
				264
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	265	if (self.previous_element is not None and
				266	self.previous_element is not next_element):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	267	self.previous_element.next_element = next_element
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	268	if next_element is not None and next_element is not self.previous_element:
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	269	next_element.previous_element = self.previous_element
				270	self.previous_element = None
				271	last_child.next_element = None
				272
				273	self.parent = None
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	274	if (self.previous_sibling is not None
				275	and self.previous_sibling is not self.next_sibling):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	276	self.previous_sibling.next_sibling = self.next_sibling
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	277	if (self.next_sibling is not None
				278	and self.next_sibling is not self.previous_sibling):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	279	self.next_sibling.previous_sibling = self.previous_sibling
				280	self.previous_sibling = self.next_sibling = None
				281	return self
				282
				283	def _last_descendant(self, is_initialized=True, accept_self=True):
				284	"Finds the last element beneath this object to be parsed."
				285	if is_initialized and self.next_sibling:
				286	last_child = self.next_sibling.previous_element
				287	else:
				288	last_child = self
				289	while isinstance(last_child, Tag) and last_child.contents:
				290	last_child = last_child.contents[-1]
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	291	if not accept_self and last_child is self:
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	292	last_child = None
				293	return last_child
				294	# BS3: Not part of the API!
				295	_lastRecursiveChild = _last_descendant
				296
				297	def insert(self, position, new_child):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	298	if new_child is None:
				299	raise ValueError("Cannot insert None into a tag.")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	300	if new_child is self:
				301	raise ValueError("Cannot insert a tag into itself.")
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	302	if (isinstance(new_child, str)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	303	and not isinstance(new_child, NavigableString)):
				304	new_child = NavigableString(new_child)
				305
				306	position = min(position, len(self.contents))
				307	if hasattr(new_child, 'parent') and new_child.parent is not None:
				308	# We're 'inserting' an element that's already one
				309	# of this object's children.
				310	if new_child.parent is self:
				311	current_index = self.index(new_child)
				312	if current_index < position:
				313	# We're moving this element further down the list
				314	# of this object's children. That means that when
				315	# we extract this element, our target index will
				316	# jump down one.
				317	position -= 1
				318	new_child.extract()
				319
				320	new_child.parent = self
				321	previous_child = None
				322	if position == 0:
				323	new_child.previous_sibling = None
				324	new_child.previous_element = self
				325	else:
				326	previous_child = self.contents[position - 1]
				327	new_child.previous_sibling = previous_child
				328	new_child.previous_sibling.next_sibling = new_child
				329	new_child.previous_element = previous_child._last_descendant(False)
				330	if new_child.previous_element is not None:
				331	new_child.previous_element.next_element = new_child
				332
				333	new_childs_last_element = new_child._last_descendant(False)
				334
				335	if position >= len(self.contents):
				336	new_child.next_sibling = None
				337
				338	parent = self
				339	parents_next_sibling = None
				340	while parents_next_sibling is None and parent is not None:
				341	parents_next_sibling = parent.next_sibling
				342	parent = parent.parent
				343	if parents_next_sibling is not None:
				344	# We found the element that comes next in the document.
				345	break
				346	if parents_next_sibling is not None:
				347	new_childs_last_element.next_element = parents_next_sibling
				348	else:
				349	# The last element of this tag is the last element in
				350	# the document.
				351	new_childs_last_element.next_element = None
				352	else:
				353	next_child = self.contents[position]
				354	new_child.next_sibling = next_child
				355	if new_child.next_sibling is not None:
				356	new_child.next_sibling.previous_sibling = new_child
				357	new_childs_last_element.next_element = next_child
				358
				359	if new_childs_last_element.next_element is not None:
				360	new_childs_last_element.next_element.previous_element = new_childs_last_element
				361	self.contents.insert(position, new_child)
				362
				363	def append(self, tag):
				364	"""Appends the given tag to the contents of this tag."""
				365	self.insert(len(self.contents), tag)
				366
				367	def insert_before(self, predecessor):
				368	"""Makes the given element the immediate predecessor of this one.
				369
				370	The two elements will have the same parent, and the given element
				371	will be immediately before this one.
				372	"""
				373	if self is predecessor:
				374	raise ValueError("Can't insert an element before itself.")
				375	parent = self.parent
				376	if parent is None:
				377	raise ValueError(
				378	"Element has no parent, so 'before' has no meaning.")
				379	# Extract first so that the index won't be screwed up if they
				380	# are siblings.
				381	if isinstance(predecessor, PageElement):
				382	predecessor.extract()
				383	index = parent.index(self)
				384	parent.insert(index, predecessor)
				385
				386	def insert_after(self, successor):
				387	"""Makes the given element the immediate successor of this one.
				388
				389	The two elements will have the same parent, and the given element
				390	will be immediately after this one.
				391	"""
				392	if self is successor:
				393	raise ValueError("Can't insert an element after itself.")
				394	parent = self.parent
				395	if parent is None:
				396	raise ValueError(
				397	"Element has no parent, so 'after' has no meaning.")
				398	# Extract first so that the index won't be screwed up if they
				399	# are siblings.
				400	if isinstance(successor, PageElement):
				401	successor.extract()
				402	index = parent.index(self)
				403	parent.insert(index+1, successor)
				404
				405	def find_next(self, name=None, attrs={}, text=None, **kwargs):
				406	"""Returns the first item that matches the given criteria and
				407	appears after this Tag in the document."""
				408	return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
				409	findNext = find_next # BS3
				410
				411	def find_all_next(self, name=None, attrs={}, text=None, limit=None,
				412	**kwargs):
				413	"""Returns all items that match the given criteria and appear
				414	after this Tag in the document."""
				415	return self._find_all(name, attrs, text, limit, self.next_elements,
				416	**kwargs)
				417	findAllNext = find_all_next # BS3
				418
				419	def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
				420	"""Returns the closest sibling to this Tag that matches the
				421	given criteria and appears after this Tag in the document."""
				422	return self._find_one(self.find_next_siblings, name, attrs, text,
				423	**kwargs)
				424	findNextSibling = find_next_sibling # BS3
				425
				426	def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
				427	**kwargs):
				428	"""Returns the siblings of this Tag that match the given
				429	criteria and appear after this Tag in the document."""
				430	return self._find_all(name, attrs, text, limit,
				431	self.next_siblings, **kwargs)
				432	findNextSiblings = find_next_siblings # BS3
				433	fetchNextSiblings = find_next_siblings # BS2
				434
				435	def find_previous(self, name=None, attrs={}, text=None, **kwargs):
				436	"""Returns the first item that matches the given criteria and
				437	appears before this Tag in the document."""
				438	return self._find_one(
				439	self.find_all_previous, name, attrs, text, **kwargs)
				440	findPrevious = find_previous # BS3
				441
				442	def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
				443	**kwargs):
				444	"""Returns all items that match the given criteria and appear
				445	before this Tag in the document."""
				446	return self._find_all(name, attrs, text, limit, self.previous_elements,
				447	**kwargs)
				448	findAllPrevious = find_all_previous # BS3
				449	fetchPrevious = find_all_previous # BS2
				450
				451	def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
				452	"""Returns the closest sibling to this Tag that matches the
				453	given criteria and appears before this Tag in the document."""
				454	return self._find_one(self.find_previous_siblings, name, attrs, text,
				455	**kwargs)
				456	findPreviousSibling = find_previous_sibling # BS3
				457
				458	def find_previous_siblings(self, name=None, attrs={}, text=None,
				459	limit=None, **kwargs):
				460	"""Returns the siblings of this Tag that match the given
				461	criteria and appear before this Tag in the document."""
				462	return self._find_all(name, attrs, text, limit,
				463	self.previous_siblings, **kwargs)
				464	findPreviousSiblings = find_previous_siblings # BS3
				465	fetchPreviousSiblings = find_previous_siblings # BS2
				466
				467	def find_parent(self, name=None, attrs={}, **kwargs):
				468	"""Returns the closest parent of this Tag that matches the given
				469	criteria."""
				470	# NOTE: We can't use _find_one because findParents takes a different
				471	# set of arguments.
				472	r = None
				473	l = self.find_parents(name, attrs, 1, **kwargs)
				474	if l:
				475	r = l[0]
				476	return r
				477	findParent = find_parent # BS3
				478
				479	def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
				480	"""Returns the parents of this Tag that match the given
				481	criteria."""
				482
				483	return self._find_all(name, attrs, None, limit, self.parents,
				484	**kwargs)
				485	findParents = find_parents # BS3
				486	fetchParents = find_parents # BS2
				487
				488	@property
				489	def next(self):
				490	return self.next_element
				491
				492	@property
				493	def previous(self):
				494	return self.previous_element
				495
				496	#These methods do the real heavy lifting.
				497
				498	def _find_one(self, method, name, attrs, text, **kwargs):
				499	r = None
				500	l = method(name, attrs, text, 1, **kwargs)
				501	if l:
				502	r = l[0]
				503	return r
				504
				505	def _find_all(self, name, attrs, text, limit, generator, **kwargs):
				506	"Iterates over a generator looking for things that match."
				507
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	508	if text is None and 'string' in kwargs:
				509	text = kwargs['string']
				510	del kwargs['string']
				511
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	512	if isinstance(name, SoupStrainer):
				513	strainer = name
				514	else:
				515	strainer = SoupStrainer(name, attrs, text, **kwargs)
				516
				517	if text is None and not limit and not attrs and not kwargs:
				518	if name is True or name is None:
				519	# Optimization to find all tags.
				520	result = (element for element in generator
				521	if isinstance(element, Tag))
				522	return ResultSet(strainer, result)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	523	elif isinstance(name, str):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	524	# Optimization to find all tags with a given name.
				525	result = (element for element in generator
				526	if isinstance(element, Tag)
				527	and element.name == name)
				528	return ResultSet(strainer, result)
				529	results = ResultSet(strainer)
				530	while True:
				531	try:
				532	i = next(generator)
				533	except StopIteration:
				534	break
				535	if i:
				536	found = strainer.search(i)
				537	if found:
				538	results.append(found)
				539	if limit and len(results) >= limit:
				540	break
				541	return results
				542
				543	#These generators can be used to navigate starting from both
				544	#NavigableStrings and Tags.
				545	@property
				546	def next_elements(self):
				547	i = self.next_element
				548	while i is not None:
				549	yield i
				550	i = i.next_element
				551
				552	@property
				553	def next_siblings(self):
				554	i = self.next_sibling
				555	while i is not None:
				556	yield i
				557	i = i.next_sibling
				558
				559	@property
				560	def previous_elements(self):
				561	i = self.previous_element
				562	while i is not None:
				563	yield i
				564	i = i.previous_element
				565
				566	@property
				567	def previous_siblings(self):
				568	i = self.previous_sibling
				569	while i is not None:
				570	yield i
				571	i = i.previous_sibling
				572
				573	@property
				574	def parents(self):
				575	i = self.parent
				576	while i is not None:
				577	yield i
				578	i = i.parent
				579
				580	# Methods for supporting CSS selectors.
				581
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	582	tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	583
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	584	# /^([a-zA-Z0-9][-.a-zA-Z0-9:_])\[(\w+)([=~\\|\^\$\]?)=?"?([^\]"]*)"?\]$/
				585	# \---------------------------/ \---/\-------------/ \-------/
				586	# \| \| \| \|
				587	# \| \| \| The value
				588	# \| \| ~,\|,^,$,* or =
				589	# \| Attribute
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	590	# Tag
				591	attribselect_re = re.compile(
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	592	r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_])?\[(?P<attribute>[\w-]+)(?P<operator>[=~\\|\^\$\]?)' +
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	593	r'=?"?(?P<value>[^\]"]*)"?\]$'
				594	)
				595
				596	def _attr_value_as_string(self, value, default=None):
				597	"""Force an attribute value into a string representation.
				598
				599	A multi-valued attribute will be converted into a
				600	space-separated stirng.
				601	"""
				602	value = self.get(value, default)
				603	if isinstance(value, list) or isinstance(value, tuple):
				604	value =" ".join(value)
				605	return value
				606
				607	def _tag_name_matches_and(self, function, tag_name):
				608	if not tag_name:
				609	return function
				610	else:
				611	def _match(tag):
				612	return tag.name == tag_name and function(tag)
				613	return _match
				614
				615	def _attribute_checker(self, operator, attribute, value=''):
				616	"""Create a function that performs a CSS selector operation.
				617
				618	Takes an operator, attribute and optional value. Returns a
				619	function that will return True for elements that match that
				620	combination.
				621	"""
				622	if operator == '=':
				623	# string representation of `attribute` is equal to `value`
				624	return lambda el: el._attr_value_as_string(attribute) == value
				625	elif operator == '~':
				626	# space-separated list representation of `attribute`
				627	# contains `value`
				628	def _includes_value(element):
				629	attribute_value = element.get(attribute, [])
				630	if not isinstance(attribute_value, list):
				631	attribute_value = attribute_value.split()
				632	return value in attribute_value
				633	return _includes_value
				634	elif operator == '^':
				635	# string representation of `attribute` starts with `value`
				636	return lambda el: el._attr_value_as_string(
				637	attribute, '').startswith(value)
				638	elif operator == '$':
				639	# string represenation of `attribute` ends with `value`
				640	return lambda el: el._attr_value_as_string(
				641	attribute, '').endswith(value)
				642	elif operator == '*':
				643	# string representation of `attribute` contains `value`
				644	return lambda el: value in el._attr_value_as_string(attribute, '')
				645	elif operator == '\|':
				646	# string representation of `attribute` is either exactly
				647	# `value` or starts with `value` and then a dash.
				648	def _is_or_starts_with_dash(element):
				649	attribute_value = element._attr_value_as_string(attribute, '')
				650	return (attribute_value == value or attribute_value.startswith(
				651	value + '-'))
				652	return _is_or_starts_with_dash
				653	else:
				654	return lambda el: el.has_attr(attribute)
				655
				656	# Old non-property versions of the generators, for backwards
				657	# compatibility with BS3.
				658	def nextGenerator(self):
				659	return self.next_elements
				660
				661	def nextSiblingGenerator(self):
				662	return self.next_siblings
				663
				664	def previousGenerator(self):
				665	return self.previous_elements
				666
				667	def previousSiblingGenerator(self):
				668	return self.previous_siblings
				669
				670	def parentGenerator(self):
				671	return self.parents
				672
				673
class NavigableString(str, PageElement):
    """A piece of document text that also carries PageElement links."""

    # Text written before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        A str value is used as-is. Any other value is handed to str
        together with DEFAULT_OUTPUT_ENCODING, so that encoded input
        containing non-ASCII characters can be decoded.
        """
        if isinstance(value, str):
            str_args = (value,)
        else:
            str_args = (value, DEFAULT_OUTPUT_ENCODING)
        instance = str.__new__(cls, *str_args)
        # Start out detached: setup() with no arguments sets every
        # tree link to None.
        instance.setup()
        return instance

    def __copy__(self):
        """Return an unattached copy with the same text and class."""
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """Provide `.string`, which is the string itself.

        This keeps older code working for NavigableString, and for
        subclasses such as CData it yields the text without the
        wrapper. Any other unknown attribute raises AttributeError.
        """
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted text wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Strings have no tag name; this is always None."""
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
				725
				726	class PreformattedString(NavigableString):
				727	"""A NavigableString not subject to the normal formatting rules.
				728
				729	The string will be passed into the formatter (to trigger side effects),
				730	but the return value will be ignored.
				731	"""
				732
				733	def output_ready(self, formatter="minimal"):
				734	"""CData strings are passed into the formatter.
				735	But the return value is ignored."""
				736	self.format_string(self, formatter)
				737	return self.PREFIX + self + self.SUFFIX
				738
				739	class CData(PreformattedString):
				740
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	741	PREFIX = '<![CDATA['
				742	SUFFIX = ']]>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	743
class ProcessingInstruction(PreformattedString):
    """A preformatted string output between ``<?`` and ``>``."""

    PREFIX = '<?'
    SUFFIX = '>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	748
class Comment(PreformattedString):
    """A preformatted string output between ``<!--`` and ``-->``."""

    PREFIX = '<!--'
    SUFFIX = '-->'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	753
				754
class Declaration(PreformattedString):
    """A preformatted string output between ``<?`` and ``?>``."""

    PREFIX = '<?'
    SUFFIX = '?>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	758
				759
				760	class Doctype(PreformattedString):
				761
				762	@classmethod
				763	def for_name_and_ids(cls, name, pub_id, system_id):
				764	value = name or ''
				765	if pub_id is not None:
				766	value += ' PUBLIC "%s"' % pub_id
				767	if system_id is not None:
				768	value += ' "%s"' % system_id
				769	elif system_id is not None:
				770	value += ' SYSTEM "%s"' % system_id
				771
				772	return Doctype(value)
				773
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	774	PREFIX = '<!DOCTYPE '
				775	SUFFIX = '>\n'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	776
				777
				778	class Tag(PageElement):
				779
				780	"""Represents a found HTML tag with its attributes and contents."""
				781
				782	def __init__(self, parser=None, builder=None, name=None, namespace=None,
				783	prefix=None, attrs=None, parent=None, previous=None):
				784	"Basic constructor."
				785
				786	if parser is None:
				787	self.parser_class = None
				788	else:
				789	# We don't actually store the parser object: that lets extracted
				790	# chunks be garbage-collected.
				791	self.parser_class = parser.__class__
				792	if name is None:
				793	raise ValueError("No value provided for new tag's name.")
				794	self.name = name
				795	self.namespace = namespace
				796	self.prefix = prefix
				797	if attrs is None:
				798	attrs = {}
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	799	elif attrs:
				800	if builder is not None and builder.cdata_list_attributes:
				801	attrs = builder._replace_cdata_list_attribute_values(
				802	self.name, attrs)
				803	else:
				804	attrs = dict(attrs)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	805	else:
				806	attrs = dict(attrs)
				807	self.attrs = attrs
				808	self.contents = []
				809	self.setup(parent, previous)
				810	self.hidden = False
				811
				812	# Set up any substitutions, such as the charset in a META tag.
				813	if builder is not None:
				814	builder.set_up_substitutions(self)
				815	self.can_be_empty_element = builder.can_be_empty_element(name)
				816	else:
				817	self.can_be_empty_element = False
				818
    # BS3 compatibility: ``parserClass`` is an alias for ``parser_class``.
    parserClass = _alias("parser_class")  # BS3
				820
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	821	def __copy__(self):
				822	"""A copy of a Tag is a new Tag, unconnected to the parse tree.
				823	Its contents are a copy of the old Tag's contents.
				824	"""
				825	clone = type(self)(None, self.builder, self.name, self.namespace,
				826	self.nsprefix, self.attrs)
				827	for attr in ('can_be_empty_element', 'hidden'):
				828	setattr(clone, attr, getattr(self, attr))
				829	for child in self.contents:
				830	clone.append(child.__copy__())
				831	return clone
				832
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	833	@property
				834	def is_empty_element(self):
				835	"""Is this tag an empty-element tag? (aka a self-closing tag)
				836
				837	A tag that has contents is never an empty-element tag.
				838
				839	A tag that has no contents may or may not be an empty-element
				840	tag. It depends on the builder used to create the tag. If the
				841	builder has a designated list of empty-element tags, then only
				842	a tag whose name shows up in that list is considered an
				843	empty-element tag.
				844
				845	If the builder has no designated list of empty-element tags,
				846	then any tag with no contents is an empty-element tag.
				847	"""
				848	return len(self.contents) == 0 and self.can_be_empty_element
				849	isSelfClosing = is_empty_element # BS3
				850
    @property
    def string(self):
        """Convenience property to get the single string within this tag.

        :Return: None unless this tag has exactly one child. If that
         child is a NavigableString, it is returned. Otherwise the
         child's own ``string`` attribute is returned, recursively.
        """
        if len(self.contents) != 1:
            return None
        only_child = self.contents[0]
        if not isinstance(only_child, NavigableString):
            # A lone child that is not a string: defer to its ``string``.
            return only_child.string
        return only_child

    @string.setter
    def string(self, string):
        """Replace this tag's children with a single new string, built by
        calling the class of ``string`` on ``string``."""
        self.clear()
        replacement = string.__class__(string)
        self.append(replacement)
				872
    def _all_strings(self, strip=False, types=(NavigableString, CData)):
        """Yield all strings of certain classes, possibly stripping them.

        :param strip: If true, each string is stripped, and strings that
            are empty after stripping are skipped.
        :param types: The exact classes to yield (matched with
            ``type()``, so subclasses do not count). If None, every
            NavigableString instance is yielded. By default only
            NavigableString and CData objects are yielded. So no
            comments, processing instructions, etc.
        """
        for node in self.descendants:
            if types is None:
                if not isinstance(node, NavigableString):
                    continue
            elif type(node) not in types:
                continue
            if strip:
                node = node.strip()
                if len(node) == 0:
                    continue
            yield node

    strings = property(_all_strings)
				892
    @property
    def stripped_strings(self):
        """Yield what ``_all_strings`` produces with stripping enabled."""
        yield from self._all_strings(True)
				897
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	898	def get_text(self, separator="", strip=False,
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	899	types=(NavigableString, CData)):
				900	"""
				901	Get all child strings, concatenated using the given separator.
				902	"""
				903	return separator.join([s for s in self._all_strings(
				904	strip, types=types)])
				905	getText = get_text
				906	text = property(get_text)
				907
    def decompose(self):
        """Recursively destroys the contents of this tree.

        The tag is extracted first. Then, following the ``next_element``
        chain from this tag until it ends, every node has its
        ``__dict__`` emptied and its ``contents`` reset to an empty list.
        """
        self.extract()
        node = self
        while node is not None:
            # Read the successor first: clearing __dict__ removes it.
            successor = node.next_element
            node.__dict__.clear()
            node.contents = []
            node = successor
				917
    def clear(self, decompose=False):
        """Extract all children. If decompose is True, decompose instead.

        With ``decompose`` set, children that are Tags are decomposed;
        every other child is still just extracted.
        """
        # Walk a copy of the list so the loop is unaffected by changes
        # made to self.contents while children are being removed.
        for child in list(self.contents):
            if decompose and isinstance(child, Tag):
                child.decompose()
            else:
                child.extract()
				931
    def index(self, element):
        """Find the index of a child by identity, not value.

        Avoids issues with tag.contents.index(element) getting the index
        of equal elements.

        :raises ValueError: If ``element`` is not one of the children.
        """
        position = next(
            (i for i, child in enumerate(self.contents) if child is element),
            None)
        if position is None:
            raise ValueError("Tag.index: element not in tag")
        return position
				941
    def get(self, key, default=None):
        """Return the value of the 'key' attribute for the tag, or the
        value given for 'default' if the tag doesn't have that
        attribute.

        This is a direct ``dict.get``-style lookup on ``self.attrs``.
        """
        return self.attrs.get(key, default)
				947
    def has_attr(self, key):
        """Return whether ``key`` is present in this tag's ``attrs``."""
        return key in self.attrs
				950
				951	def __hash__(self):
				952	return str(self).__hash__()
				953
    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there.

        The lookup is ``self.attrs[key]``, so the exception is whatever
        that mapping raises for a missing key.
        """
        return self.attrs[key]
				958
    def __iter__(self):
        """Iterating over a tag iterates over its contents."""
        return iter(self.contents)
				962
    def __len__(self):
        """The length of a tag is the length of its list of contents."""
        return len(self.contents)
				966
    def __contains__(self, x):
        """``x in tag`` tests membership in the tag's ``contents`` list,
        not in its attributes."""
        return x in self.contents
				969
    def __bool__(self):
        """A tag is non-None even if it has no contents.

        Always returns True, so truth testing does not fall back to
        ``__len__``.
        """
        return True
				973
    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag (stored in ``self.attrs``)."""
        self.attrs[key] = value
				978
    def __delitem__(self, key):
        """Deleting tag[key] removes the 'key' attribute from the tag.

        A key that is not present is ignored rather than raising.
        """
        self.attrs.pop(key, None)
				982
				983	def __call__(self, args, *kwargs):
				984	"""Calling a tag like a function is the same as calling its
				985	find_all() method. Eg. tag('a') returns a list of all the A tags
				986	found within this tag."""
				987	return self.find_all(args, *kwargs)
				988
				989	def __getattr__(self, tag):
				990	#print "Getattr %s.%s" % (self.__class__, tag)
				991	if len(tag) > 3 and tag.endswith('Tag'):
				992	# BS3: soup.aTag -> "soup.find("a")
				993	tag_name = tag[:-3]
				994	warnings.warn(
				995	'.%sTag is deprecated, use .find("%s") instead.' % (
				996	tag_name, tag_name))
				997	return self.find(tag_name)
				998	# We special case contents to avoid recursion.
				999	elif not tag.startswith("__") and not tag=="contents":
				1000	return self.find(tag)
				1001	raise AttributeError(
				1002	"'%s' object has no attribute '%s'" % (self.__class__, tag))
				1003
				1004	def __eq__(self, other):
				1005	"""Returns true iff this tag has the same name, the same attributes,
				1006	and the same contents (recursively) as the given tag."""
				1007	if self is other:
				1008	return True
				1009	if (not hasattr(other, 'name') or
				1010	not hasattr(other, 'attrs') or
				1011	not hasattr(other, 'contents') or
				1012	self.name != other.name or
				1013	self.attrs != other.attrs or
				1014	len(self) != len(other)):
				1015	return False
				1016	for i, my_child in enumerate(self.contents):
				1017	if my_child != other.contents[i]:
				1018	return False
				1019	return True
				1020
    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__ (the result is the negation of ``==``)."""
        return not self == other
				1025
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1026	def __repr__(self, encoding="unicode-escape"):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1027	"""Renders this tag as a string."""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1028	if PY3K:
				1029	# "The return value must be a string object", i.e. Unicode
				1030	return self.decode()
				1031	else:
				1032	# "The return value must be a string object", i.e. a bytestring.
				1033	# By convention, the return value of __repr__ should also be
				1034	# an ASCII string.
				1035	return self.encode(encoding)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1036
    def __unicode__(self):
        """Return the Unicode rendering of this tag, i.e. ``decode()``
        called with its default arguments."""
        return self.decode()
				1039
    def __str__(self):
        """Render the tag with ``decode()`` when ``PY3K`` is true and
        with ``encode()`` otherwise."""
        return self.decode() if PY3K else self.encode()

    # When PY3K is true, both __str__ and __repr__ are rebound to
    # __unicode__, replacing the definitions above.
    if PY3K:
        __str__ = __repr__ = __unicode__
				1048
    def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
               indent_level=None, formatter="minimal",
               errors="xmlcharrefreplace"):
        """Render this tag and its contents as an encoded bytestring.

        :param encoding: Target encoding. It is also passed to
            ``decode()`` as the eventual encoding.
        :param indent_level: Forwarded to ``decode()``.
        :param formatter: Forwarded to ``decode()``.
        :param errors: Error handler for the final ``str.encode`` call.
        """
        # Turn the data structure into Unicode, then encode the
        # Unicode.
        return self.decode(indent_level, encoding, formatter).encode(
            encoding, errors)
				1056
				1057	def _should_pretty_print(self, indent_level):
				1058	"""Should this tag be pretty-printed?"""
				1059	return (
				1060	indent_level is not None and
				1061	(self.name not in HTMLAwareEntitySubstitution.preformatted_tags
				1062	or self._is_xml))
				1063
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a Unicode representation of this tag and its contents.

        :param indent_level: None for no indentation. Otherwise the
            opening tag is preceded by ``indent_level - 1`` spaces, and
            if ``_should_pretty_print`` agrees the contents are rendered
            one level deeper with newlines around them.

        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.

        :param formatter: A callable, or a name that is resolved to one
            with ``_formatter_for_name``. It is applied to attribute
            values and passed on to ``decode_contents``.
        """

        # First off, turn a string formatter into a function. This
        # will stop the lookup from happening over and over again.
        if not isinstance(formatter, collections.abc.Callable):
            formatter = self._formatter_for_name(formatter)

        # Render every attribute, in sorted key order, as key="value"
        # (or as the bare key when the value is None).
        attrs = []
        if self.attrs:
            for key, val in sorted(self.attrs.items()):
                if val is None:
                    decoded = key
                else:
                    if isinstance(val, list) or isinstance(val, tuple):
                        # Multi-valued attribute: join with spaces.
                        val = ' '.join(val)
                    elif not isinstance(val, str):
                        val = str(val)
                    elif (
                        isinstance(val, AttributeValueWithCharsetSubstitution)
                        and eventual_encoding is not None):
                        # Let the value substitute in the eventual encoding.
                        val = val.encode(eventual_encoding)

                    text = self.format_string(val, formatter)
                    decoded = (
                        str(key) + '='
                        + EntitySubstitution.quoted_attribute_value(text))
                attrs.append(decoded)
        # ``close`` is the '/' of an empty-element tag; ``closeTag`` is
        # the full closing tag. Exactly one of them ends up non-empty.
        close = ''
        closeTag = ''

        prefix = ''
        if self.prefix:
            prefix = self.prefix + ":"

        if self.is_empty_element:
            close = '/'
        else:
            closeTag = '</%s%s>' % (prefix, self.name)

        pretty_print = self._should_pretty_print(indent_level)
        # ``indent_space`` precedes the opening tag; ``space`` precedes
        # the closing tag and stays empty unless pretty-printing.
        space = ''
        indent_space = ''
        if indent_level is not None:
            indent_space = (' ' * (indent_level - 1))
        if pretty_print:
            space = indent_space
            indent_contents = indent_level + 1
        else:
            # Children of a tag that is not pretty-printed are rendered
            # without any indentation.
            indent_contents = None
        contents = self.decode_contents(
            indent_contents, eventual_encoding, formatter)

        if self.hidden:
            # This is the 'document root' object.
            s = contents
        else:
            s = []
            attribute_string = ''
            if attrs:
                attribute_string = ' ' + ' '.join(attrs)
            if indent_level is not None:
                # Even if this particular tag is not pretty-printed,
                # we should indent up to the start of the tag.
                s.append(indent_space)
            s.append('<%s%s%s%s>' % (
                    prefix, self.name, attribute_string, close))
            if pretty_print:
                s.append("\n")
            s.append(contents)
            if pretty_print and contents and contents[-1] != "\n":
                s.append("\n")
            if pretty_print and closeTag:
                s.append(space)
            s.append(closeTag)
            if indent_level is not None and closeTag and self.next_sibling:
                # Even if this particular tag is not pretty-printed,
                # we're now done with the tag, and we should add a
                # newline if appropriate.
                s.append("\n")
            s = ''.join(s)
        return s
				1156
    def prettify(self, encoding=None, formatter="minimal"):
        """Render the tag with pretty-printing switched on.

        :param encoding: If None, the result of ``decode()`` is
            returned. Otherwise the result of ``encode()`` in that
            encoding is returned.
        :param formatter: Forwarded to ``decode()`` / ``encode()``.
        """
        if encoding is not None:
            return self.encode(encoding, True, formatter=formatter)
        return self.decode(True, formatter=formatter)
				1162
    def decode_contents(self, indent_level=None,
                        eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                        formatter="minimal"):
        """Renders the contents of this tag as a Unicode string.

        :param indent_level: If not None, strings are pretty-printed:
           outside a 'pre' tag each non-empty string is put on its own
           line, preceded by ``indent_level - 1`` spaces. Child tags
           receive the same value through ``decode()``.

        :param eventual_encoding: The tag is destined to be
           encoded into this encoding. This method is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.

        :param formatter: The output formatter: a callable, or a name
           resolved to one with ``_formatter_for_name``.
        """
        # Resolve a formatter name once, so the lookup does not happen
        # again for every child.
        if not isinstance(formatter, collections.abc.Callable):
            formatter = self._formatter_for_name(formatter)

        pretty_print = indent_level is not None
        pieces = []
        for child in self:
            text = None
            if isinstance(child, NavigableString):
                text = child.output_ready(formatter)
            elif isinstance(child, Tag):
                pieces.append(child.decode(indent_level, eventual_encoding,
                                           formatter))
            if not text:
                continue
            outside_pre = not self.name == 'pre'
            if indent_level and outside_pre:
                text = text.strip()
                if not text:
                    continue
            indent_here = pretty_print and outside_pre
            if indent_here:
                pieces.append(" " * (indent_level - 1))
            pieces.append(text)
            if indent_here:
                pieces.append("\n")
        return ''.join(pieces)
				1204
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
        """Renders the contents of this tag as a bytestring.

        :param indent_level: Forwarded to ``decode_contents()``.

        :param encoding: The bytestring will be in this encoding. It is
           also passed to ``decode_contents()`` as the eventual encoding.

        :param formatter: Forwarded to ``decode_contents()``.
        """
        return self.decode_contents(
            indent_level, encoding, formatter).encode(encoding)
				1221
				1222	# Old method for BS3 compatibility
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """BS3-era wrapper around ``encode_contents()``.

        :param encoding: Forwarded as ``encoding``.
        :param prettyPrint: If false, ``indentLevel`` is ignored and no
            indentation is requested.
        :param indentLevel: Forwarded as ``indent_level`` when
            ``prettyPrint`` is true.
        """
        effective_indent = indentLevel if prettyPrint else None
        return self.encode_contents(
            indent_level=effective_indent, encoding=encoding)
				1229
				1230	#Soup methods
				1231
    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria, or None if nothing matches.

        The arguments are forwarded to ``find_all()`` with a limit of 1.
        """
        matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
        return matches[0] if matches else None
    findChild = find
				1242
    def find_all(self, name=None, attrs={}, recursive=True, text=None,
                 limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        :param recursive: If true, all descendants are searched;
            otherwise only the direct children.
        """
        generator = self.descendants if recursive else self.children
        return self._find_all(name, attrs, text, limit, generator, **kwargs)
    findAll = find_all       # BS3
    findChildren = find_all  # BS2
				1261
				1262	#Generator methods
    @property
    def children(self):
        """An iterator over this tag's direct children (``contents``)."""
        # return iter() to make the purpose of the method clear
        return iter(self.contents)  # XXX This seems to be untested.
				1267
    @property
    def descendants(self):
        """Yield every node below this tag.

        Starting at the first child, the ``next_element`` chain is
        followed until the node that comes after this tag's last
        descendant is reached. A tag without contents yields nothing.
        """
        if len(self.contents) == 0:
            return
        sentinel = self._last_descendant().next_element
        node = self.contents[0]
        while node is not sentinel:
            yield node
            node = node.next_element
				1277
				1278	# CSS selector code
				1279
    # Tokens that select() treats as combinators: each one consumes the
    # token that follows it.
    _selector_combinators = ['>', '+', '~']
    # When true, select() prints a trace of its matching steps.
    _select_debug = False
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1282	def select_one(self, selector):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1283	"""Perform a CSS selection operation on the current element."""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1284	value = self.select(selector, limit=1)
				1285	if value:
				1286	return value[0]
				1287	return None
				1288
    def select(self, selector, _candidate_generator=None, limit=None):
        """Perform a CSS selection operation on the current element.

        :param selector: A CSS selector string. If it contains commas,
            each comma-separated part is selected on its own and the
            results are merged without duplicates. Otherwise it is split
            on whitespace into tokens that are applied left to right.
        :param _candidate_generator: Internal. A callable that takes a
            tag and returns the candidates to test against a token; it
            is supplied when select() calls itself for the token that
            follows a combinator.
        :param limit: If truthy, matching for a token stops once this
            many tags have been collected.
        :return: A list of the matching tags.
        :raises ValueError: For an empty group, a trailing combinator, a
            pseudo-class without a tag name, an nth-of-type value below
            1, or an unsupported token.
        :raises NotImplementedError: For a pseudo-class other than
            nth-of-type, or a non-numeric nth-of-type value.
        """

        # Handle grouping selectors if ',' exists, ie: p,a
        if ',' in selector:
            context = []
            for partial_selector in selector.split(','):
                partial_selector = partial_selector.strip()
                if partial_selector == '':
                    raise ValueError('Invalid group selection syntax: %s' % selector)
                candidates = self.select(partial_selector, limit=limit)
                for candidate in candidates:
                    if candidate not in context:
                        context.append(candidate)

                if limit and len(context) >= limit:
                    break
            return context

        tokens = selector.split()
        current_context = [self]

        if tokens[-1] in self._selector_combinators:
            raise ValueError(
                'Final combinator "%s" is missing an argument.' % tokens[-1])

        if self._select_debug:
            print('Running CSS selector "%s"' % selector)

        for index, token in enumerate(tokens):
            new_context = []
            new_context_ids = set()

            if tokens[index-1] in self._selector_combinators:
                # This token was consumed by the previous combinator. Skip it.
                if self._select_debug:
                    print(' Token was consumed by the previous combinator.')
                continue

            if self._select_debug:
                print(' Considering token "%s"' % token)
            recursive_candidate_generator = None
            tag_name = None

            # Each operation corresponds to a checker function, a rule
            # for determining whether a candidate matches the
            # selector. Candidates are generated by the active
            # iterator.
            checker = None

            m = self.attribselect_re.match(token)
            if m is not None:
                # Attribute selector
                tag_name, attribute, operator, value = m.groups()
                checker = self._attribute_checker(operator, attribute, value)

            elif '#' in token:
                # ID selector
                tag_name, tag_id = token.split('#', 1)
                def id_matches(tag):
                    return tag.get('id', None) == tag_id
                checker = id_matches

            elif '.' in token:
                # Class selector
                tag_name, klass = token.split('.', 1)
                classes = set(klass.split('.'))
                def classes_match(candidate):
                    return classes.issubset(candidate.get('class', []))
                checker = classes_match

            elif ':' in token:
                # Pseudo-class
                tag_name, pseudo = token.split(':', 1)
                if tag_name == '':
                    raise ValueError(
                        "A pseudo-class must be prefixed with a tag name.")
                # Split "name(value)" into its two parts. The
                # parentheses are literal and must be escaped.
                pseudo_attributes = re.match(
                    r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
                if pseudo_attributes is None:
                    pseudo_type = pseudo
                    pseudo_value = None
                else:
                    pseudo_type, pseudo_value = pseudo_attributes.groups()
                if pseudo_type == 'nth-of-type':
                    try:
                        pseudo_value = int(pseudo_value)
                    except (TypeError, ValueError):
                        # TypeError: no "(value)" part was given at all.
                        # ValueError: the value is not an integer.
                        raise NotImplementedError(
                            'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                    if pseudo_value < 1:
                        raise ValueError(
                            'nth-of-type pseudo-class value must be at least 1.')
                    class Counter(object):
                        def __init__(self, destination):
                            self.count = 0
                            self.destination = destination

                        def nth_child_of_type(self, tag):
                            self.count += 1
                            if self.count == self.destination:
                                return True
                            if self.count > self.destination:
                                # Stop the generator that's sending us
                                # these things.
                                raise StopIteration()
                            return False
                    checker = Counter(pseudo_value).nth_child_of_type
                else:
                    raise NotImplementedError(
                        'Only the following pseudo-classes are implemented: nth-of-type.')

            elif token == '*':
                # Star selector -- matches everything
                pass
            elif token == '>':
                # Run the next token as a CSS selector against the
                # direct children of each tag in the current context.
                recursive_candidate_generator = lambda tag: tag.children
            elif token == '~':
                # Run the next token as a CSS selector against the
                # siblings of each tag in the current context.
                recursive_candidate_generator = lambda tag: tag.next_siblings
            elif token == '+':
                # For each tag in the current context, run the next
                # token as a CSS selector against the tag's next
                # sibling that's a tag.
                def next_tag_sibling(tag):
                    yield tag.find_next_sibling(True)
                recursive_candidate_generator = next_tag_sibling

            elif self.tag_name_re.match(token):
                # Just a tag name.
                tag_name = token
            else:
                raise ValueError(
                    'Unsupported or invalid CSS selector: "%s"' % token)
            if recursive_candidate_generator:
                # This happens when the selector looks like  "> foo".
                #
                # The generator calls select() recursively on every
                # member of the current context, passing in a different
                # candidate generator and a different selector.
                #
                # In the case of "> foo", the candidate generator is
                # one that yields a tag's direct children (">"), and
                # the selector is "foo".
                next_token = tokens[index+1]
                def recursive_select(tag):
                    if self._select_debug:
                        print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
                        print('-' * 40)
                    for i in tag.select(next_token, recursive_candidate_generator):
                        if self._select_debug:
                            print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                        yield i
                    if self._select_debug:
                        print('-' * 40)
                _use_candidate_generator = recursive_select
            elif _candidate_generator is None:
                # By default, a tag's candidates are all of its
                # descendants. If tag_name is defined, only yield tags
                # with that name.
                if self._select_debug:
                    # Show the tag name being filtered on, or "[any]"
                    # when the token does not restrict the name.
                    check = tag_name if tag_name else "[any]"
                    print(' Default candidate generator, tag name="%s"' % check)
                if self._select_debug:
                    # This is redundant with later code, but it stops
                    # a bunch of bogus tags from cluttering up the
                    # debug log.
                    def default_candidate_generator(tag):
                        for child in tag.descendants:
                            if not isinstance(child, Tag):
                                continue
                            if tag_name and not child.name == tag_name:
                                continue
                            yield child
                    _use_candidate_generator = default_candidate_generator
                else:
                    _use_candidate_generator = lambda tag: tag.descendants
            else:
                _use_candidate_generator = _candidate_generator

            for tag in current_context:
                if self._select_debug:
                    print(" Running candidate generator on %s %s" % (
                        tag.name, repr(tag.attrs)))
                for candidate in _use_candidate_generator(tag):
                    if not isinstance(candidate, Tag):
                        continue
                    if tag_name and candidate.name != tag_name:
                        continue
                    if checker is not None:
                        try:
                            result = checker(candidate)
                        except StopIteration:
                            # The checker has decided we should no longer
                            # run the generator.
                            break
                    if checker is None or result:
                        if self._select_debug:
                            print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                        if id(candidate) not in new_context_ids:
                            # If a tag matches a selector more than once,
                            # don't include it in the context more than once.
                            new_context.append(candidate)
                            new_context_ids.add(id(candidate))
                            if limit and len(new_context) >= limit:
                                break
                    elif self._select_debug:
                        print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

            current_context = new_context

        if self._select_debug:
            print("Final verdict:")
            for i in current_context:
                print(" %s %s" % (i.name, i.attrs))
        return current_context
				1513
				1514	# Old names for backwards compatibility
				1515	def childGenerator(self):
				1516	return self.children
				1517
				1518	def recursiveChildGenerator(self):
				1519	return self.descendants
				1520
				1521	def has_key(self, key):
				1522	"""This was kind of misleading because has_key() (attributes)
				1523	was different from __in__ (contents). has_key() is gone in
				1524	Python 3, anyway."""
				1525	warnings.warn('has_key is deprecated. Use has_attr("%s") instead.' % (
				1526	key))
				1527	return self.has_attr(key)
				1528
				1529	# Next, a couple classes to represent queries and their results.
				1530	class SoupStrainer(object):
				1531	"""Encapsulates a number of ways of matching a markup element (tag or
				1532	text)."""
				1533
				1534	def __init__(self, name=None, attrs={}, text=None, **kwargs):
				1535	self.name = self._normalize_search_value(name)
				1536	if not isinstance(attrs, dict):
				1537	# Treat a non-dict value for attrs as a search for the 'class'
				1538	# attribute.
				1539	kwargs['class'] = attrs
				1540	attrs = None
				1541
				1542	if 'class_' in kwargs:
				1543	# Treat class_="foo" as a search for the 'class'
				1544	# attribute, overriding any non-dict value for attrs.
				1545	kwargs['class'] = kwargs['class_']
				1546	del kwargs['class_']
				1547
				1548	if kwargs:
				1549	if attrs:
				1550	attrs = attrs.copy()
				1551	attrs.update(kwargs)
				1552	else:
				1553	attrs = kwargs
				1554	normalized_attrs = {}
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1555	for key, value in list(attrs.items()):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1556	normalized_attrs[key] = self._normalize_search_value(value)
				1557
				1558	self.attrs = normalized_attrs
				1559	self.text = self._normalize_search_value(text)
				1560
				1561	def _normalize_search_value(self, value):
				1562	# Leave it alone if it's a Unicode string, a callable, a
				1563	# regular expression, a boolean, or None.
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	1564	if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match')
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1565	or isinstance(value, bool) or value is None):
				1566	return value
				1567
				1568	# If it's a bytestring, convert it to Unicode, treating it as UTF-8.
				1569	if isinstance(value, bytes):
				1570	return value.decode("utf8")
				1571
				1572	# If it's listlike, convert it into a list of strings.
				1573	if hasattr(value, '__iter__'):
				1574	new_value = []
				1575	for v in value:
				1576	if (hasattr(v, '__iter__') and not isinstance(v, bytes)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1577	and not isinstance(v, str)):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1578	# This is almost certainly the user's mistake. In the
				1579	# interests of avoiding infinite loops, we'll let
				1580	# it through as-is rather than doing a recursive call.
				1581	new_value.append(v)
				1582	else:
				1583	new_value.append(self._normalize_search_value(v))
				1584	return new_value
				1585
				1586	# Otherwise, convert it into a Unicode string.
				1587	# The unicode(str()) thing is so this will do the same thing on Python 2
				1588	# and Python 3.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1589	return str(str(value))
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1590
				1591	def __str__(self):
				1592	if self.text:
				1593	return self.text
				1594	else:
				1595	return "%s\|%s" % (self.name, self.attrs)
				1596
				1597	def search_tag(self, markup_name=None, markup_attrs={}):
				1598	found = None
				1599	markup = None
				1600	if isinstance(markup_name, Tag):
				1601	markup = markup_name
				1602	markup_attrs = markup
				1603	call_function_with_tag_data = (
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	1604	isinstance(self.name, collections.abc.Callable)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1605	and not isinstance(markup_name, Tag))
				1606
				1607	if ((not self.name)
				1608	or call_function_with_tag_data
				1609	or (markup and self._matches(markup, self.name))
				1610	or (not markup and self._matches(markup_name, self.name))):
				1611	if call_function_with_tag_data:
				1612	match = self.name(markup_name, markup_attrs)
				1613	else:
				1614	match = True
				1615	markup_attr_map = None
				1616	for attr, match_against in list(self.attrs.items()):
				1617	if not markup_attr_map:
				1618	if hasattr(markup_attrs, 'get'):
				1619	markup_attr_map = markup_attrs
				1620	else:
				1621	markup_attr_map = {}
				1622	for k, v in markup_attrs:
				1623	markup_attr_map[k] = v
				1624	attr_value = markup_attr_map.get(attr)
				1625	if not self._matches(attr_value, match_against):
				1626	match = False
				1627	break
				1628	if match:
				1629	if markup:
				1630	found = markup
				1631	else:
				1632	found = markup_name
				1633	if found and self.text and not self._matches(found.string, self.text):
				1634	found = None
				1635	return found
				1636	searchTag = search_tag
				1637
				1638	def search(self, markup):
				1639	# print 'looking for %s in %s' % (self, markup)
				1640	found = None
				1641	# If given a list of items, scan it for a text element that
				1642	# matches.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1643	if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1644	for element in markup:
				1645	if isinstance(element, NavigableString) \
				1646	and self.search(element):
				1647	found = element
				1648	break
				1649	# If it's a Tag, make sure its name or attributes match.
				1650	# Don't bother with Tags if we're searching for text.
				1651	elif isinstance(markup, Tag):
				1652	if not self.text or self.name or self.attrs:
				1653	found = self.search_tag(markup)
				1654	# If it's text, make sure the text matches.
				1655	elif isinstance(markup, NavigableString) or \
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1656	isinstance(markup, str):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1657	if not self.name and not self.attrs and self._matches(markup, self.text):
				1658	found = markup
				1659	else:
				1660	raise Exception(
				1661	"I don't know how to match against a %s" % markup.__class__)
				1662	return found
				1663
				1664	def _matches(self, markup, match_against):
				1665	# print u"Matching %s against %s" % (markup, match_against)
				1666	result = False
				1667	if isinstance(markup, list) or isinstance(markup, tuple):
				1668	# This should only happen when searching a multi-valued attribute
				1669	# like 'class'.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1670	if (isinstance(match_against, str)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1671	and ' ' in match_against):
				1672	# A bit of a special case. If they try to match "foo
				1673	# bar" on a multivalue attribute's value, only accept
				1674	# the literal value "foo bar"
				1675	#
				1676	# XXX This is going to be pretty slow because we keep
				1677	# splitting match_against. But it shouldn't come up
				1678	# too often.
				1679	return (whitespace_re.split(match_against) == markup)
				1680	else:
				1681	for item in markup:
				1682	if self._matches(item, match_against):
				1683	return True
				1684	return False
				1685
				1686	if match_against is True:
				1687	# True matches any non-None value.
				1688	return markup is not None
				1689
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	1690	if isinstance(match_against, collections.abc.Callable):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1691	return match_against(markup)
				1692
				1693	# Custom callables take the tag as an argument, but all
				1694	# other ways of matching match the tag name as a string.
				1695	if isinstance(markup, Tag):
				1696	markup = markup.name
				1697
				1698	# Ensure that `markup` is either a Unicode string, or None.
				1699	markup = self._normalize_search_value(markup)
				1700
				1701	if markup is None:
				1702	# None matches None, False, an empty string, an empty list, and so on.
				1703	return not match_against
				1704
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	1705	if isinstance(match_against, str):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1706	# Exact string match
				1707	return markup == match_against
				1708
				1709	if hasattr(match_against, 'match'):
				1710	# Regexp match
				1711	return match_against.search(markup)
				1712
				1713	if hasattr(match_against, '__iter__'):
				1714	# The markup must be an exact match against something
				1715	# in the iterable.
				1716	return markup in match_against
				1717
				1718
				1719	class ResultSet(list):
				1720	"""A ResultSet is just a list that keeps track of the SoupStrainer
				1721	that created it."""
				1722	def __init__(self, source, result=()):
				1723	super(ResultSet, self).__init__(result)
				1724	self.source = source