Blame - poky/bitbake/lib/bs4/__init__.py - stefanberger/openbmc

blob: f6fdfd50b125bede34b1f31312f1ea95ee444761 [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	"""Beautiful Soup
				2	Elixir and Tonic
				3	"The Screen-Scraper's Friend"
				4	http://www.crummy.com/software/BeautifulSoup/
				5
				6	Beautiful Soup uses a pluggable XML or HTML parser to parse a
				7	(possibly invalid) document into a tree representation. Beautiful Soup
				8	provides methods and Pythonic idioms that make it easy to
				9	navigate, search, and modify the parse tree.
				10
				11	Beautiful Soup works with Python 2.6 and up. It works better if lxml
				12	and/or html5lib is installed.
				13
				14	For more than you ever wanted to know about Beautiful Soup, see the
				15	documentation:
				16	http://www.crummy.com/software/BeautifulSoup/bs4/doc/
				17	"""
				18
				19	__author__ = "Leonard Richardson (leonardr@segfault.org)"
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	20	__version__ = "4.4.1"
				21	__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	22	__license__ = "MIT"
				23
				24	__all__ = ['BeautifulSoup']
				25
				26	import os
				27	import re
				28	import warnings
				29
				30	from .builder import builder_registry, ParserRejectedMarkup
				31	from .dammit import UnicodeDammit
				32	from .element import (
				33	CData,
				34	Comment,
				35	DEFAULT_OUTPUT_ENCODING,
				36	Declaration,
				37	Doctype,
				38	NavigableString,
				39	PageElement,
				40	ProcessingInstruction,
				41	ResultSet,
				42	SoupStrainer,
				43	Tag,
				44	)
				45
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
#
# NOTE(review): as written, the statement below only compares two string
# literals with `!=` and discards the result, so it raises nothing and has
# no effect at import time. Presumably it produced an error only in the
# unconverted source -- confirm before relying on it as a guard.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	49
				50	class BeautifulSoup(Tag):
				51	"""
				52	This class defines the basic interface called by the tree builders.
				53
				54	These methods will be called by the parser:
				55	reset()
				56	feed(markup)
				57
				58	The tree builder may call these methods from its feed() implementation:
				59	handle_starttag(name, attrs) # See note about return value
				60	handle_endtag(name)
				61	handle_data(data) # Appends to the current data node
				62	endData(containerClass=NavigableString) # Ends the current data node
				63
				64	No matter how complicated the underlying parser is, you should be
				65	able to build a tree using 'start tag' events, 'end tag' events,
				66	'data' events, and "done with data" events.
				67
				68	If you encounter an empty-element tag (aka a self-closing tag,
				69	like HTML's <br> tag), call handle_starttag and then
				70	handle_endtag.
				71	"""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	72	ROOT_TAG_NAME = '[document]'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	73
				74	# If the end-user gives no indication which tree builder they
				75	# want, look for one with these features.
				76	DEFAULT_BUILDER_FEATURES = ['html', 'fast']
				77
				78	ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
				79
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	80	NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
				81
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	82	def __init__(self, markup="", features=None, builder=None,
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	83	parse_only=None, from_encoding=None, exclude_encodings=None,
				84	**kwargs):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	85	"""The Soup object is initialized as the 'root tag', and the
				86	provided markup (which can be a string or a file-like object)
				87	is fed into the underlying parser."""
				88
				89	if 'convertEntities' in kwargs:
				90	warnings.warn(
				91	"BS4 does not respect the convertEntities argument to the "
				92	"BeautifulSoup constructor. Entities are always converted "
				93	"to Unicode characters.")
				94
				95	if 'markupMassage' in kwargs:
				96	del kwargs['markupMassage']
				97	warnings.warn(
				98	"BS4 does not respect the markupMassage argument to the "
				99	"BeautifulSoup constructor. The tree builder is responsible "
				100	"for any necessary markup massage.")
				101
				102	if 'smartQuotesTo' in kwargs:
				103	del kwargs['smartQuotesTo']
				104	warnings.warn(
				105	"BS4 does not respect the smartQuotesTo argument to the "
				106	"BeautifulSoup constructor. Smart quotes are always converted "
				107	"to Unicode characters.")
				108
				109	if 'selfClosingTags' in kwargs:
				110	del kwargs['selfClosingTags']
				111	warnings.warn(
				112	"BS4 does not respect the selfClosingTags argument to the "
				113	"BeautifulSoup constructor. The tree builder is responsible "
				114	"for understanding self-closing tags.")
				115
				116	if 'isHTML' in kwargs:
				117	del kwargs['isHTML']
				118	warnings.warn(
				119	"BS4 does not respect the isHTML argument to the "
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	120	"BeautifulSoup constructor. Suggest you use "
				121	"features='lxml' for HTML and features='lxml-xml' for "
				122	"XML.")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	123
				124	def deprecated_argument(old_name, new_name):
				125	if old_name in kwargs:
				126	warnings.warn(
				127	'The "%s" argument to the BeautifulSoup constructor '
				128	'has been renamed to "%s."' % (old_name, new_name))
				129	value = kwargs[old_name]
				130	del kwargs[old_name]
				131	return value
				132	return None
				133
				134	parse_only = parse_only or deprecated_argument(
				135	"parseOnlyThese", "parse_only")
				136
				137	from_encoding = from_encoding or deprecated_argument(
				138	"fromEncoding", "from_encoding")
				139
				140	if len(kwargs) > 0:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	141	arg = list(kwargs.keys()).pop()
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	142	raise TypeError(
				143	"__init__() got an unexpected keyword argument '%s'" % arg)
				144
				145	if builder is None:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	146	original_features = features
				147	if isinstance(features, str):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	148	features = [features]
				149	if features is None or len(features) == 0:
				150	features = self.DEFAULT_BUILDER_FEATURES
				151	builder_class = builder_registry.lookup(*features)
				152	if builder_class is None:
				153	raise FeatureNotFound(
				154	"Couldn't find a tree builder with the features you "
				155	"requested: %s. Do you need to install a parser library?"
				156	% ",".join(features))
				157	builder = builder_class()
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	158	if not (original_features == builder.NAME or
				159	original_features in builder.ALTERNATE_NAMES):
				160	if builder.is_xml:
				161	markup_type = "XML"
				162	else:
				163	markup_type = "HTML"
				164	warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
				165	parser=builder.NAME,
				166	markup_type=markup_type))
				167
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	168	self.builder = builder
				169	self.is_xml = builder.is_xml
				170	self.builder.soup = self
				171
				172	self.parse_only = parse_only
				173
				174	if hasattr(markup, 'read'): # It's a file-type object.
				175	markup = markup.read()
				176	elif len(markup) <= 256:
				177	# Print out warnings for a couple beginner problems
				178	# involving passing non-markup to Beautiful Soup.
				179	# Beautiful Soup will still parse the input as markup,
				180	# just in case that's what the user really wants.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	181	if (isinstance(markup, str)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	182	and not os.path.supports_unicode_filenames):
				183	possible_filename = markup.encode("utf8")
				184	else:
				185	possible_filename = markup
				186	is_file = False
				187	try:
				188	is_file = os.path.exists(possible_filename)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	189	except Exception as e:
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	190	# This is almost certainly a problem involving
				191	# characters not valid in filenames on this
				192	# system. Just let it go.
				193	pass
				194	if is_file:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	195	if isinstance(markup, str):
				196	markup = markup.encode("utf8")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	197	warnings.warn(
				198	'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
				199	if markup[:5] == "http:" or markup[:6] == "https:":
				200	# TODO: This is ugly but I couldn't get it to work in
				201	# Python 3 otherwise.
				202	if ((isinstance(markup, bytes) and not b' ' in markup)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	203	or (isinstance(markup, str) and not ' ' in markup)):
				204	if isinstance(markup, str):
				205	markup = markup.encode("utf8")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	206	warnings.warn(
				207	'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
				208
				209	for (self.markup, self.original_encoding, self.declared_html_encoding,
				210	self.contains_replacement_characters) in (
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	211	self.builder.prepare_markup(
				212	markup, from_encoding, exclude_encodings=exclude_encodings)):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	213	self.reset()
				214	try:
				215	self._feed()
				216	break
				217	except ParserRejectedMarkup:
				218	pass
				219
				220	# Clear out the markup and remove the builder's circular
				221	# reference to this object.
				222	self.markup = None
				223	self.builder.soup = None
				224
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	225	def __copy__(self):
				226	return type(self)(self.encode(), builder=self.builder)
				227
				228	def __getstate__(self):
				229	# Frequently a tree builder can't be pickled.
				230	d = dict(self.__dict__)
				231	if 'builder' in d and not self.builder.picklable:
				232	del d['builder']
				233	return d
				234
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	235	def _feed(self):
				236	# Convert the document to Unicode.
				237	self.builder.reset()
				238
				239	self.builder.feed(self.markup)
				240	# Close out any unfinished strings and close all the open tags.
				241	self.endData()
				242	while self.currentTag.name != self.ROOT_TAG_NAME:
				243	self.popTag()
				244
				245	def reset(self):
				246	Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
				247	self.hidden = 1
				248	self.builder.reset()
				249	self.current_data = []
				250	self.currentTag = None
				251	self.tagStack = []
				252	self.preserve_whitespace_tag_stack = []
				253	self.pushTag(self)
				254
				255	def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
				256	"""Create a new tag associated with this soup."""
				257	return Tag(None, self.builder, name, namespace, nsprefix, attrs)
				258
				259	def new_string(self, s, subclass=NavigableString):
				260	"""Create a new NavigableString associated with this soup."""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	261	return subclass(s)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	262
				263	def insert_before(self, successor):
				264	raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
				265
				266	def insert_after(self, successor):
				267	raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
				268
				269	def popTag(self):
				270	tag = self.tagStack.pop()
				271	if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
				272	self.preserve_whitespace_tag_stack.pop()
				273	#print "Pop", tag.name
				274	if self.tagStack:
				275	self.currentTag = self.tagStack[-1]
				276	return self.currentTag
				277
				278	def pushTag(self, tag):
				279	#print "Push", tag.name
				280	if self.currentTag:
				281	self.currentTag.contents.append(tag)
				282	self.tagStack.append(tag)
				283	self.currentTag = self.tagStack[-1]
				284	if tag.name in self.builder.preserve_whitespace_tags:
				285	self.preserve_whitespace_tag_stack.append(tag)
				286
				287	def endData(self, containerClass=NavigableString):
				288	if self.current_data:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	289	current_data = ''.join(self.current_data)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	290	# If whitespace is not preserved, and this string contains
				291	# nothing but ASCII spaces, replace it with a single space
				292	# or newline.
				293	if not self.preserve_whitespace_tag_stack:
				294	strippable = True
				295	for i in current_data:
				296	if i not in self.ASCII_SPACES:
				297	strippable = False
				298	break
				299	if strippable:
				300	if '\n' in current_data:
				301	current_data = '\n'
				302	else:
				303	current_data = ' '
				304
				305	# Reset the data collector.
				306	self.current_data = []
				307
				308	# Should we add this string to the tree at all?
				309	if self.parse_only and len(self.tagStack) <= 1 and \
				310	(not self.parse_only.text or \
				311	not self.parse_only.search(current_data)):
				312	return
				313
				314	o = containerClass(current_data)
				315	self.object_was_parsed(o)
				316
				317	def object_was_parsed(self, o, parent=None, most_recent_element=None):
				318	"""Add an object to the parse tree."""
				319	parent = parent or self.currentTag
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	320	previous_element = most_recent_element or self._most_recent_element
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	321
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	322	next_element = previous_sibling = next_sibling = None
				323	if isinstance(o, Tag):
				324	next_element = o.next_element
				325	next_sibling = o.next_sibling
				326	previous_sibling = o.previous_sibling
				327	if not previous_element:
				328	previous_element = o.previous_element
				329
				330	o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
				331
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	332	self._most_recent_element = o
				333	parent.contents.append(o)
				334
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	335	if parent.next_sibling:
				336	# This node is being inserted into an element that has
				337	# already been parsed. Deal with any dangling references.
				338	index = parent.contents.index(o)
				339	if index == 0:
				340	previous_element = parent
				341	previous_sibling = None
				342	else:
				343	previous_element = previous_sibling = parent.contents[index-1]
				344	if index == len(parent.contents)-1:
				345	next_element = parent.next_sibling
				346	next_sibling = None
				347	else:
				348	next_element = next_sibling = parent.contents[index+1]
				349
				350	o.previous_element = previous_element
				351	if previous_element:
				352	previous_element.next_element = o
				353	o.next_element = next_element
				354	if next_element:
				355	next_element.previous_element = o
				356	o.next_sibling = next_sibling
				357	if next_sibling:
				358	next_sibling.previous_sibling = o
				359	o.previous_sibling = previous_sibling
				360	if previous_sibling:
				361	previous_sibling.next_sibling = o
				362
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	363	def _popToTag(self, name, nsprefix=None, inclusivePop=True):
				364	"""Pops the tag stack up to and including the most recent
				365	instance of the given tag. If inclusivePop is false, pops the tag
				366	stack up to but not including the most recent instqance of
				367	the given tag."""
				368	#print "Popping to %s" % name
				369	if name == self.ROOT_TAG_NAME:
				370	# The BeautifulSoup object itself can never be popped.
				371	return
				372
				373	most_recently_popped = None
				374
				375	stack_size = len(self.tagStack)
				376	for i in range(stack_size - 1, 0, -1):
				377	t = self.tagStack[i]
				378	if (name == t.name and nsprefix == t.prefix):
				379	if inclusivePop:
				380	most_recently_popped = self.popTag()
				381	break
				382	most_recently_popped = self.popTag()
				383
				384	return most_recently_popped
				385
				386	def handle_starttag(self, name, namespace, nsprefix, attrs):
				387	"""Push a start tag on to the stack.
				388
				389	If this method returns None, the tag was rejected by the
				390	SoupStrainer. You should proceed as if the tag had not occured
				391	in the document. For instance, if this was a self-closing tag,
				392	don't call handle_endtag.
				393	"""
				394
				395	# print "Start tag %s: %s" % (name, attrs)
				396	self.endData()
				397
				398	if (self.parse_only and len(self.tagStack) <= 1
				399	and (self.parse_only.text
				400	or not self.parse_only.search_tag(name, attrs))):
				401	return None
				402
				403	tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
				404	self.currentTag, self._most_recent_element)
				405	if tag is None:
				406	return tag
				407	if self._most_recent_element:
				408	self._most_recent_element.next_element = tag
				409	self._most_recent_element = tag
				410	self.pushTag(tag)
				411	return tag
				412
				413	def handle_endtag(self, name, nsprefix=None):
				414	#print "End tag: " + name
				415	self.endData()
				416	self._popToTag(name, nsprefix)
				417
				418	def handle_data(self, data):
				419	self.current_data.append(data)
				420
				421	def decode(self, pretty_print=False,
				422	eventual_encoding=DEFAULT_OUTPUT_ENCODING,
				423	formatter="minimal"):
				424	"""Returns a string or Unicode representation of this document.
				425	To get Unicode, pass None for encoding."""
				426
				427	if self.is_xml:
				428	# Print the XML declaration
				429	encoding_part = ''
				430	if eventual_encoding != None:
				431	encoding_part = ' encoding="%s"' % eventual_encoding
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	432	prefix = '<?xml version="1.0"%s?>\n' % encoding_part
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	433	else:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	434	prefix = ''
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	435	if not pretty_print:
				436	indent_level = None
				437	else:
				438	indent_level = 0
				439	return prefix + super(BeautifulSoup, self).decode(
				440	indent_level, eventual_encoding, formatter)
				441
				442	# Alias to make it easier to type import: 'from bs4 import _soup'
				443	_s = BeautifulSoup
				444	_soup = BeautifulSoup
				445
				446	class BeautifulStoneSoup(BeautifulSoup):
				447	"""Deprecated interface to an XML parser."""
				448
				449	def __init__(self, args, *kwargs):
				450	kwargs['features'] = 'xml'
				451	warnings.warn(
				452	'The BeautifulStoneSoup class is deprecated. Instead of using '
				453	'it, pass features="xml" into the BeautifulSoup constructor.')
				454	super(BeautifulStoneSoup, self).__init__(args, *kwargs)
				455
				456
				457	class StopParsing(Exception):
				458	pass
				459
				460	class FeatureNotFound(ValueError):
				461	pass
				462
				463
				464	#By default, act as an HTML pretty-printer.
				465	if __name__ == '__main__':
				466	import sys
				467	soup = BeautifulSoup(sys.stdin)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	468	print(soup.prettify())