blob: 4d0b00afaddf9ff6b005f3755ae28e0922d52a86 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001"""Diagnostic functions, mainly for use when doing tech support."""
2import cProfile
3from StringIO import StringIO
4from HTMLParser import HTMLParser
5import bs4
6from bs4 import BeautifulSoup, __version__
7from bs4.builder import builder_registry
8
9import os
10import pstats
11import random
12import tempfile
13import time
14import traceback
15import sys
16import cProfile
17
def diagnose(data):
    """Diagnostic suite for isolating common problems.

    Prints the Beautiful Soup and Python versions, reports which of the
    basic parsers (html.parser, html5lib, lxml) have no tree builder in
    the builder registry, and then tries to parse the markup with every
    parser that is available, printing either the prettified result or
    the traceback of the failure.

    :param data: The markup to diagnose. May be a string of markup, an
        object with a ``read()`` method, or the name of an existing file.
        A string starting with "http:" or "https:" is reported as a URL
        and nothing is parsed.
    :return: None. Everything is printed to standard output.
    """
    # Every print below takes exactly one argument, so the output is the
    # same whether ``print`` is a statement or a function.
    print("Diagnostic running on Beautiful Soup %s" % __version__)
    print("Python version %s" % sys.version)

    basic_parsers = ["html.parser", "html5lib", "lxml"]
    # Iterate over a copy. Removing an entry from the list that is being
    # iterated makes the loop skip the entry that follows it, which could
    # leave an uninstalled parser in the list (and later trigger an
    # ImportError on the lxml/html5lib imports below).
    for name in list(basic_parsers):
        installed = any(
            name in builder.features
            for builder in builder_registry.builders)
        if not installed:
            basic_parsers.remove(name)
            print(
                "I noticed that %s is not installed. Installing it may help." %
                name)

    if 'lxml' in basic_parsers:
        # Also exercise lxml's XML parser, selected by a feature list.
        basic_parsers.append(["lxml", "xml"])
        from lxml import etree
        print("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION)))

    if 'html5lib' in basic_parsers:
        import html5lib
        print("Found html5lib version %s" % html5lib.__version__)

    if hasattr(data, 'read'):
        data = data.read()
    elif os.path.exists(data):
        print('"%s" looks like a filename. Reading data from the file.' % data)
        # Close the file deterministically instead of leaking the handle.
        with open(data) as markup_file:
            data = markup_file.read()
    elif data.startswith(("http:", "https:")):
        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
        return
    # Blank separator line.
    print("")

    for parser in basic_parsers:
        print("Trying to parse your markup with %s" % parser)
        try:
            soup = BeautifulSoup(data, parser)
        except Exception:
            # Deliberately broad: this is a diagnostic tool, so any parser
            # failure is reported and the next parser is still tried.
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            print("Here's what %s did with the markup:" % parser)
            print(soup.prettify())

        print("-" * 80)
68
def lxml_trace(data, html=True, **kwargs):
    """Show the raw lxml parse events for a document.

    No Beautiful Soup code runs here, so the output shows how lxml
    itself handles the markup.

    :param data: The markup, as a string.
    :param html: Passed to ``etree.iterparse`` as its ``html`` argument.
    :param kwargs: Extra keyword arguments forwarded to ``etree.iterparse``.
    """
    from lxml import etree
    events = etree.iterparse(StringIO(data), html=html, **kwargs)
    for kind, node in events:
        line = "%s, %4s, %s" % (kind, node.tag, node.text)
        print(line)
78
class AnnouncingParser(HTMLParser):
    """HTMLParser subclass that prints one line per parse event and
    does nothing else."""

    def _p(self, s):
        # Single output hook used by every announcement.
        print(s)

    def _announce(self, template, value):
        # Fill the event template with the event payload and emit it.
        self._p(template % value)

    def handle_starttag(self, name, attrs):
        self._announce("%s START", name)

    def handle_endtag(self, name):
        self._announce("%s END", name)

    def handle_data(self, data):
        self._announce("%s DATA", data)

    def handle_charref(self, name):
        self._announce("%s CHARREF", name)

    def handle_entityref(self, name):
        self._announce("%s ENTITYREF", name)

    def handle_comment(self, data):
        self._announce("%s COMMENT", data)

    def handle_decl(self, data):
        self._announce("%s DECL", data)

    def unknown_decl(self, data):
        self._announce("%s UNKNOWN-DECL", data)

    def handle_pi(self, data):
        self._announce("%s PI", data)
111
def htmlparser_trace(data):
    """Show the raw HTMLParser parse events for a document.

    No Beautiful Soup code runs here, so the output shows how
    HTMLParser itself handles the markup.

    :param data: The markup to feed to the parser.
    """
    AnnouncingParser().feed(data)
120
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"

def rword(length=5):
    "Generate a random word-like string."
    # Even positions draw from the consonants, odd positions from the vowels.
    alphabets = (_consonants, _vowels)
    return ''.join(random.choice(alphabets[i % 2]) for i in range(length))
134
def rsentence(length=4):
    "Generate a random sentence-like string."
    # Each word is between 4 and 9 characters long.
    words = [rword(random.randint(4, 9)) for _ in range(length)]
    return " ".join(words)
138
def rdoc(num_elements=1000):
    """Randomly generate an invalid HTML document.

    Runs ``num_elements`` rounds; each round rolls 0-3 and appends an
    opening tag (0), a random sentence (1), a closing tag (2), or
    nothing at all (3). The pieces are joined with newlines and wrapped
    in ``<html>``...``</html>``.
    """
    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
    pieces = []
    for _ in range(num_elements):
        roll = random.randint(0, 3)
        if roll == 1:
            # Text content of one to four words.
            pieces.append(rsentence(random.randint(1, 4)))
        elif roll in (0, 2):
            # 0 opens a random tag, 2 closes a random tag.
            template = "<%s>" if roll == 0 else "</%s>"
            pieces.append(template % random.choice(tag_names))
    body = "\n".join(pieces)
    return "<html>" + body + "</html>"
156
def benchmark_parsers(num_elements=100000):
    """Very basic head-to-head performance benchmark.

    Generates a random invalid document with ``rdoc(num_elements)``,
    times Beautiful Soup on it with each parser in turn, then times raw
    lxml and raw html5lib on the same document. Timings are printed.
    """
    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
    data = rdoc(num_elements)
    print("Generated a large invalid HTML document (%d bytes)." % len(data))

    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
        try:
            started = time.time()
            # Keep the result bound so the tree is still alive when the
            # clock is read, as in the timed region's original shape.
            soup = BeautifulSoup(data, parser)
            finished = time.time()
        except Exception:
            print("%s could not parse the markup." % parser)
            traceback.print_exc()
        else:
            elapsed = finished - started
            print("BS4+%s parsed the markup in %.2fs." % (parser, elapsed))

    from lxml import etree
    started = time.time()
    etree.HTML(data)
    finished = time.time()
    print("Raw lxml parsed the markup in %.2fs." % (finished - started))

    import html5lib
    raw_parser = html5lib.HTMLParser()
    started = time.time()
    raw_parser.parse(data)
    finished = time.time()
    print("Raw html5lib parsed the markup in %.2fs." % (finished - started))
188
def profile(num_elements=100000, parser="lxml"):
    """Profile Beautiful Soup while it parses a random invalid document.

    Generates a document with ``rdoc(num_elements)``, runs
    ``bs4.BeautifulSoup(data, parser)`` under cProfile, and prints the
    statistics sorted by cumulative time, restricted to entries matching
    ``'_html5lib|bs4'`` and to at most 50 lines.

    :param num_elements: Passed to ``rdoc`` to size the document.
    :param parser: The parser (features) argument given to BeautifulSoup.
    :return: None. The statistics are printed to standard output.
    """
    data = rdoc(num_elements)
    namespace = dict(bs4=bs4, data=data, parser=parser)

    # The context manager closes (and thereby removes) the temporary file
    # as soon as the statistics have been printed, instead of leaving the
    # cleanup to garbage collection.
    # NOTE(review): the profile is written by re-opening the file by name
    # while it is still open; the tempfile docs say this does not work on
    # every platform. The original code had the same limitation.
    with tempfile.NamedTemporaryFile() as filehandle:
        filename = filehandle.name
        cProfile.runctx(
            'bs4.BeautifulSoup(data, parser)', namespace, namespace, filename)

        stats = pstats.Stats(filename)
        stats.sort_stats("cumulative")
        stats.print_stats('_html5lib|bs4', 50)
202
if __name__ == "__main__":
    # Run as a script: diagnose whatever markup arrives on standard input.
    diagnose(sys.stdin.read())