Blame - yocto-poky/bitbake/lib/bs4/__init__.py - mdmillerii/openbmc

blob: 7ba34269af71fbfbb2b53272866275684f08170f [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	"""Beautiful Soup
				2	Elixir and Tonic
				3	"The Screen-Scraper's Friend"
				4	http://www.crummy.com/software/BeautifulSoup/
				5
				6	Beautiful Soup uses a pluggable XML or HTML parser to parse a
				7	(possibly invalid) document into a tree representation. Beautiful Soup
				8	provides methods and Pythonic idioms that make it easy to
				9	navigate, search, and modify the parse tree.
				10
				11	Beautiful Soup works with Python 2.6 and up. It works better if lxml
				12	and/or html5lib is installed.
				13
				14	For more than you ever wanted to know about Beautiful Soup, see the
				15	documentation:
				16	http://www.crummy.com/software/BeautifulSoup/bs4/doc/
				17	"""
				18
				19	__author__ = "Leonard Richardson (leonardr@segfault.org)"
				20	__version__ = "4.3.2"
				21	__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
				22	__license__ = "MIT"
				23
				24	__all__ = ['BeautifulSoup']
				25
				26	import os
				27	import re
				28	import warnings
				29
				30	from .builder import builder_registry, ParserRejectedMarkup
				31	from .dammit import UnicodeDammit
				32	from .element import (
				33	CData,
				34	Comment,
				35	DEFAULT_OUTPUT_ENCODING,
				36	Declaration,
				37	Doctype,
				38	NavigableString,
				39	PageElement,
				40	ProcessingInstruction,
				41	ResultSet,
				42	SoupStrainer,
				43	Tag,
				44	)
				45
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
#
# The u'' prefix is rejected by Python 3.0-3.2 (it was only restored
# in Python 3.3), so on those interpreters importing this module fails
# with a SyntaxError whose report echoes the offending source line --
# i.e. the message below.  Keep the whole message on ONE physical line;
# splitting it would truncate what the user sees.
# NOTE(review): on Python 3.3+ this line parses fine and the name is
# simply an unused module-level string.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
				49
				50	class BeautifulSoup(Tag):
				51	"""
				52	This class defines the basic interface called by the tree builders.
				53
				54	These methods will be called by the parser:
				55	reset()
				56	feed(markup)
				57
				58	The tree builder may call these methods from its feed() implementation:
				59	handle_starttag(name, attrs) # See note about return value
				60	handle_endtag(name)
				61	handle_data(data) # Appends to the current data node
				62	endData(containerClass=NavigableString) # Ends the current data node
				63
				64	No matter how complicated the underlying parser is, you should be
				65	able to build a tree using 'start tag' events, 'end tag' events,
				66	'data' events, and "done with data" events.
				67
				68	If you encounter an empty-element tag (aka a self-closing tag,
				69	like HTML's <br> tag), call handle_starttag and then
				70	handle_endtag.
				71	"""
				72	ROOT_TAG_NAME = u'[document]'
				73
				74	# If the end-user gives no indication which tree builder they
				75	# want, look for one with these features.
				76	DEFAULT_BUILDER_FEATURES = ['html', 'fast']
				77
				78	ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
				79
				80	def __init__(self, markup="", features=None, builder=None,
				81	parse_only=None, from_encoding=None, **kwargs):
				82	"""The Soup object is initialized as the 'root tag', and the
				83	provided markup (which can be a string or a file-like object)
				84	is fed into the underlying parser."""
				85
				86	if 'convertEntities' in kwargs:
				87	warnings.warn(
				88	"BS4 does not respect the convertEntities argument to the "
				89	"BeautifulSoup constructor. Entities are always converted "
				90	"to Unicode characters.")
				91
				92	if 'markupMassage' in kwargs:
				93	del kwargs['markupMassage']
				94	warnings.warn(
				95	"BS4 does not respect the markupMassage argument to the "
				96	"BeautifulSoup constructor. The tree builder is responsible "
				97	"for any necessary markup massage.")
				98
				99	if 'smartQuotesTo' in kwargs:
				100	del kwargs['smartQuotesTo']
				101	warnings.warn(
				102	"BS4 does not respect the smartQuotesTo argument to the "
				103	"BeautifulSoup constructor. Smart quotes are always converted "
				104	"to Unicode characters.")
				105
				106	if 'selfClosingTags' in kwargs:
				107	del kwargs['selfClosingTags']
				108	warnings.warn(
				109	"BS4 does not respect the selfClosingTags argument to the "
				110	"BeautifulSoup constructor. The tree builder is responsible "
				111	"for understanding self-closing tags.")
				112
				113	if 'isHTML' in kwargs:
				114	del kwargs['isHTML']
				115	warnings.warn(
				116	"BS4 does not respect the isHTML argument to the "
				117	"BeautifulSoup constructor. You can pass in features='html' "
				118	"or features='xml' to get a builder capable of handling "
				119	"one or the other.")
				120
				121	def deprecated_argument(old_name, new_name):
				122	if old_name in kwargs:
				123	warnings.warn(
				124	'The "%s" argument to the BeautifulSoup constructor '
				125	'has been renamed to "%s."' % (old_name, new_name))
				126	value = kwargs[old_name]
				127	del kwargs[old_name]
				128	return value
				129	return None
				130
				131	parse_only = parse_only or deprecated_argument(
				132	"parseOnlyThese", "parse_only")
				133
				134	from_encoding = from_encoding or deprecated_argument(
				135	"fromEncoding", "from_encoding")
				136
				137	if len(kwargs) > 0:
				138	arg = kwargs.keys().pop()
				139	raise TypeError(
				140	"__init__() got an unexpected keyword argument '%s'" % arg)
				141
				142	if builder is None:
				143	if isinstance(features, basestring):
				144	features = [features]
				145	if features is None or len(features) == 0:
				146	features = self.DEFAULT_BUILDER_FEATURES
				147	builder_class = builder_registry.lookup(*features)
				148	if builder_class is None:
				149	raise FeatureNotFound(
				150	"Couldn't find a tree builder with the features you "
				151	"requested: %s. Do you need to install a parser library?"
				152	% ",".join(features))
				153	builder = builder_class()
				154	self.builder = builder
				155	self.is_xml = builder.is_xml
				156	self.builder.soup = self
				157
				158	self.parse_only = parse_only
				159
				160	if hasattr(markup, 'read'): # It's a file-type object.
				161	markup = markup.read()
				162	elif len(markup) <= 256:
				163	# Print out warnings for a couple beginner problems
				164	# involving passing non-markup to Beautiful Soup.
				165	# Beautiful Soup will still parse the input as markup,
				166	# just in case that's what the user really wants.
				167	if (isinstance(markup, unicode)
				168	and not os.path.supports_unicode_filenames):
				169	possible_filename = markup.encode("utf8")
				170	else:
				171	possible_filename = markup
				172	is_file = False
				173	try:
				174	is_file = os.path.exists(possible_filename)
				175	except Exception, e:
				176	# This is almost certainly a problem involving
				177	# characters not valid in filenames on this
				178	# system. Just let it go.
				179	pass
				180	if is_file:
				181	warnings.warn(
				182	'"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
				183	if markup[:5] == "http:" or markup[:6] == "https:":
				184	# TODO: This is ugly but I couldn't get it to work in
				185	# Python 3 otherwise.
				186	if ((isinstance(markup, bytes) and not b' ' in markup)
				187	or (isinstance(markup, unicode) and not u' ' in markup)):
				188	warnings.warn(
				189	'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
				190
				191	for (self.markup, self.original_encoding, self.declared_html_encoding,
				192	self.contains_replacement_characters) in (
				193	self.builder.prepare_markup(markup, from_encoding)):
				194	self.reset()
				195	try:
				196	self._feed()
				197	break
				198	except ParserRejectedMarkup:
				199	pass
				200
				201	# Clear out the markup and remove the builder's circular
				202	# reference to this object.
				203	self.markup = None
				204	self.builder.soup = None
				205
				206	def _feed(self):
				207	# Convert the document to Unicode.
				208	self.builder.reset()
				209
				210	self.builder.feed(self.markup)
				211	# Close out any unfinished strings and close all the open tags.
				212	self.endData()
				213	while self.currentTag.name != self.ROOT_TAG_NAME:
				214	self.popTag()
				215
				216	def reset(self):
				217	Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
				218	self.hidden = 1
				219	self.builder.reset()
				220	self.current_data = []
				221	self.currentTag = None
				222	self.tagStack = []
				223	self.preserve_whitespace_tag_stack = []
				224	self.pushTag(self)
				225
				226	def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
				227	"""Create a new tag associated with this soup."""
				228	return Tag(None, self.builder, name, namespace, nsprefix, attrs)
				229
				230	def new_string(self, s, subclass=NavigableString):
				231	"""Create a new NavigableString associated with this soup."""
				232	navigable = subclass(s)
				233	navigable.setup()
				234	return navigable
				235
				236	def insert_before(self, successor):
				237	raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
				238
				239	def insert_after(self, successor):
				240	raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
				241
				242	def popTag(self):
				243	tag = self.tagStack.pop()
				244	if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
				245	self.preserve_whitespace_tag_stack.pop()
				246	#print "Pop", tag.name
				247	if self.tagStack:
				248	self.currentTag = self.tagStack[-1]
				249	return self.currentTag
				250
				251	def pushTag(self, tag):
				252	#print "Push", tag.name
				253	if self.currentTag:
				254	self.currentTag.contents.append(tag)
				255	self.tagStack.append(tag)
				256	self.currentTag = self.tagStack[-1]
				257	if tag.name in self.builder.preserve_whitespace_tags:
				258	self.preserve_whitespace_tag_stack.append(tag)
				259
				260	def endData(self, containerClass=NavigableString):
				261	if self.current_data:
				262	current_data = u''.join(self.current_data)
				263	# If whitespace is not preserved, and this string contains
				264	# nothing but ASCII spaces, replace it with a single space
				265	# or newline.
				266	if not self.preserve_whitespace_tag_stack:
				267	strippable = True
				268	for i in current_data:
				269	if i not in self.ASCII_SPACES:
				270	strippable = False
				271	break
				272	if strippable:
				273	if '\n' in current_data:
				274	current_data = '\n'
				275	else:
				276	current_data = ' '
				277
				278	# Reset the data collector.
				279	self.current_data = []
				280
				281	# Should we add this string to the tree at all?
				282	if self.parse_only and len(self.tagStack) <= 1 and \
				283	(not self.parse_only.text or \
				284	not self.parse_only.search(current_data)):
				285	return
				286
				287	o = containerClass(current_data)
				288	self.object_was_parsed(o)
				289
				290	def object_was_parsed(self, o, parent=None, most_recent_element=None):
				291	"""Add an object to the parse tree."""
				292	parent = parent or self.currentTag
				293	most_recent_element = most_recent_element or self._most_recent_element
				294	o.setup(parent, most_recent_element)
				295
				296	if most_recent_element is not None:
				297	most_recent_element.next_element = o
				298	self._most_recent_element = o
				299	parent.contents.append(o)
				300
				301	def _popToTag(self, name, nsprefix=None, inclusivePop=True):
				302	"""Pops the tag stack up to and including the most recent
				303	instance of the given tag. If inclusivePop is false, pops the tag
				304	stack up to but not including the most recent instqance of
				305	the given tag."""
				306	#print "Popping to %s" % name
				307	if name == self.ROOT_TAG_NAME:
				308	# The BeautifulSoup object itself can never be popped.
				309	return
				310
				311	most_recently_popped = None
				312
				313	stack_size = len(self.tagStack)
				314	for i in range(stack_size - 1, 0, -1):
				315	t = self.tagStack[i]
				316	if (name == t.name and nsprefix == t.prefix):
				317	if inclusivePop:
				318	most_recently_popped = self.popTag()
				319	break
				320	most_recently_popped = self.popTag()
				321
				322	return most_recently_popped
				323
				324	def handle_starttag(self, name, namespace, nsprefix, attrs):
				325	"""Push a start tag on to the stack.
				326
				327	If this method returns None, the tag was rejected by the
				328	SoupStrainer. You should proceed as if the tag had not occured
				329	in the document. For instance, if this was a self-closing tag,
				330	don't call handle_endtag.
				331	"""
				332
				333	# print "Start tag %s: %s" % (name, attrs)
				334	self.endData()
				335
				336	if (self.parse_only and len(self.tagStack) <= 1
				337	and (self.parse_only.text
				338	or not self.parse_only.search_tag(name, attrs))):
				339	return None
				340
				341	tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
				342	self.currentTag, self._most_recent_element)
				343	if tag is None:
				344	return tag
				345	if self._most_recent_element:
				346	self._most_recent_element.next_element = tag
				347	self._most_recent_element = tag
				348	self.pushTag(tag)
				349	return tag
				350
				351	def handle_endtag(self, name, nsprefix=None):
				352	#print "End tag: " + name
				353	self.endData()
				354	self._popToTag(name, nsprefix)
				355
				356	def handle_data(self, data):
				357	self.current_data.append(data)
				358
				359	def decode(self, pretty_print=False,
				360	eventual_encoding=DEFAULT_OUTPUT_ENCODING,
				361	formatter="minimal"):
				362	"""Returns a string or Unicode representation of this document.
				363	To get Unicode, pass None for encoding."""
				364
				365	if self.is_xml:
				366	# Print the XML declaration
				367	encoding_part = ''
				368	if eventual_encoding != None:
				369	encoding_part = ' encoding="%s"' % eventual_encoding
				370	prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
				371	else:
				372	prefix = u''
				373	if not pretty_print:
				374	indent_level = None
				375	else:
				376	indent_level = 0
				377	return prefix + super(BeautifulSoup, self).decode(
				378	indent_level, eventual_encoding, formatter)
				379
				380	# Alias to make it easier to type import: 'from bs4 import _soup'
				381	_s = BeautifulSoup
				382	_soup = BeautifulSoup
				383
				384	class BeautifulStoneSoup(BeautifulSoup):
				385	"""Deprecated interface to an XML parser."""
				386
				387	def __init__(self, args, *kwargs):
				388	kwargs['features'] = 'xml'
				389	warnings.warn(
				390	'The BeautifulStoneSoup class is deprecated. Instead of using '
				391	'it, pass features="xml" into the BeautifulSoup constructor.')
				392	super(BeautifulStoneSoup, self).__init__(args, *kwargs)
				393
				394
				395	class StopParsing(Exception):
				396	pass
				397
				398	class FeatureNotFound(ValueError):
				399	pass
				400
				401
				402	#By default, act as an HTML pretty-printer.
				403	if __name__ == '__main__':
				404	import sys
				405	soup = BeautifulSoup(sys.stdin)
				406	print soup.prettify()