Blame - poky/bitbake/lib/bs4/diagnose.py - mdmillerii/openbmc

blob: 083395fb46ce803fd2679d875f15481564627f6e [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	"""Diagnostic functions, mainly for use when doing tech support."""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	2
				3	__license__ = "MIT"
				4
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	5	import cProfile
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	6	from io import StringIO
				7	from html.parser import HTMLParser
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	8	import bs4
				9	from bs4 import BeautifulSoup, __version__
				10	from bs4.builder import builder_registry
				11
				12	import os
				13	import pstats
				14	import random
				15	import tempfile
				16	import time
				17	import traceback
				18	import sys
				19	import cProfile
				20
				21	def diagnose(data):
				22	"""Diagnostic suite for isolating common problems."""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	23	print("Diagnostic running on Beautiful Soup %s" % __version__)
				24	print("Python version %s" % sys.version)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	25
				26	basic_parsers = ["html.parser", "html5lib", "lxml"]
				27	for name in basic_parsers:
				28	for builder in builder_registry.builders:
				29	if name in builder.features:
				30	break
				31	else:
				32	basic_parsers.remove(name)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	33	print((
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	34	"I noticed that %s is not installed. Installing it may help." %
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	35	name))
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	36
				37	if 'lxml' in basic_parsers:
				38	basic_parsers.append(["lxml", "xml"])
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	39	try:
				40	from lxml import etree
				41	print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
				42	except ImportError as e:
				43	print (
				44	"lxml is not installed or couldn't be imported.")
				45
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	46
				47	if 'html5lib' in basic_parsers:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	48	try:
				49	import html5lib
				50	print("Found html5lib version %s" % html5lib.__version__)
				51	except ImportError as e:
				52	print (
				53	"html5lib is not installed or couldn't be imported.")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	54
				55	if hasattr(data, 'read'):
				56	data = data.read()
				57	elif os.path.exists(data):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	58	print('"%s" looks like a filename. Reading data from the file.' % data)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	59	data = open(data).read()
				60	elif data.startswith("http:") or data.startswith("https:"):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	61	print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
				62	print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	63	return
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	64	print()
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	65
				66	for parser in basic_parsers:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	67	print("Trying to parse your markup with %s" % parser)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	68	success = False
				69	try:
				70	soup = BeautifulSoup(data, parser)
				71	success = True
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	72	except Exception as e:
				73	print("%s could not parse the markup." % parser)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	74	traceback.print_exc()
				75	if success:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	76	print("Here's what %s did with the markup:" % parser)
				77	print(soup.prettify())
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	78
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	79	print("-" * 80)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	80
				81	def lxml_trace(data, html=True, **kwargs):
				82	"""Print out the lxml events that occur during parsing.
				83
				84	This lets you see how lxml parses a document when no Beautiful
				85	Soup code is running.
				86	"""
				87	from lxml import etree
				88	for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	89	print(("%s, %4s, %s" % (event, element.tag, element.text)))
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	90
				91	class AnnouncingParser(HTMLParser):
				92	"""Announces HTMLParser parse events, without doing anything else."""
				93
				94	def _p(self, s):
				95	print(s)
				96
				97	def handle_starttag(self, name, attrs):
				98	self._p("%s START" % name)
				99
				100	def handle_endtag(self, name):
				101	self._p("%s END" % name)
				102
				103	def handle_data(self, data):
				104	self._p("%s DATA" % data)
				105
				106	def handle_charref(self, name):
				107	self._p("%s CHARREF" % name)
				108
				109	def handle_entityref(self, name):
				110	self._p("%s ENTITYREF" % name)
				111
				112	def handle_comment(self, data):
				113	self._p("%s COMMENT" % data)
				114
				115	def handle_decl(self, data):
				116	self._p("%s DECL" % data)
				117
				118	def unknown_decl(self, data):
				119	self._p("%s UNKNOWN-DECL" % data)
				120
				121	def handle_pi(self, data):
				122	self._p("%s PI" % data)
				123
				124	def htmlparser_trace(data):
				125	"""Print out the HTMLParser events that occur during parsing.
				126
				127	This lets you see how HTMLParser parses a document when no
				128	Beautiful Soup code is running.
				129	"""
				130	parser = AnnouncingParser()
				131	parser.feed(data)
				132
				133	_vowels = "aeiou"
				134	_consonants = "bcdfghjklmnpqrstvwxyz"
				135
				136	def rword(length=5):
				137	"Generate a random word-like string."
				138	s = ''
				139	for i in range(length):
				140	if i % 2 == 0:
				141	t = _consonants
				142	else:
				143	t = _vowels
				144	s += random.choice(t)
				145	return s
				146
				147	def rsentence(length=4):
				148	"Generate a random sentence-like string."
				149	return " ".join(rword(random.randint(4,9)) for i in range(length))
				150
				151	def rdoc(num_elements=1000):
				152	"""Randomly generate an invalid HTML document."""
				153	tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
				154	elements = []
				155	for i in range(num_elements):
				156	choice = random.randint(0,3)
				157	if choice == 0:
				158	# New tag.
				159	tag_name = random.choice(tag_names)
				160	elements.append("<%s>" % tag_name)
				161	elif choice == 1:
				162	elements.append(rsentence(random.randint(1,4)))
				163	elif choice == 2:
				164	# Close a tag.
				165	tag_name = random.choice(tag_names)
				166	elements.append("</%s>" % tag_name)
				167	return "<html>" + "\n".join(elements) + "</html>"
				168
				169	def benchmark_parsers(num_elements=100000):
				170	"""Very basic head-to-head performance benchmark."""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	171	print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	172	data = rdoc(num_elements)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	173	print("Generated a large invalid HTML document (%d bytes)." % len(data))
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	174
				175	for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
				176	success = False
				177	try:
				178	a = time.time()
				179	soup = BeautifulSoup(data, parser)
				180	b = time.time()
				181	success = True
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	182	except Exception as e:
				183	print("%s could not parse the markup." % parser)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	184	traceback.print_exc()
				185	if success:
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	186	print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	187
				188	from lxml import etree
				189	a = time.time()
				190	etree.HTML(data)
				191	b = time.time()
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	192	print("Raw lxml parsed the markup in %.2fs." % (b-a))
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	193
				194	import html5lib
				195	parser = html5lib.HTMLParser()
				196	a = time.time()
				197	parser.parse(data)
				198	b = time.time()
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	199	print("Raw html5lib parsed the markup in %.2fs." % (b-a))
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	200
				201	def profile(num_elements=100000, parser="lxml"):
				202
				203	filehandle = tempfile.NamedTemporaryFile()
				204	filename = filehandle.name
				205
				206	data = rdoc(num_elements)
				207	vars = dict(bs4=bs4, data=data, parser=parser)
				208	cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
				209
				210	stats = pstats.Stats(filename)
				211	# stats.strip_dirs()
				212	stats.sort_stats("cumulative")
				213	stats.print_stats('_html5lib\|bs4', 50)
				214
				215	if __name__ == '__main__':
				216	diagnose(sys.stdin.read())