blob: ca8d8b892bf3689117befadb0359fd2f3ca859c6 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001"""Use the HTMLParser library to parse HTML files that aren't too bad."""
2
3__all__ = [
4 'HTMLParserTreeBuilder',
5 ]
6
7from HTMLParser import (
8 HTMLParser,
9 HTMLParseError,
10 )
11import sys
12import warnings
13
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# The 'strict' argument was deprecated in Python 3.3 and removed in
# Python 3.5, where passing it raises a TypeError. The flag is
# therefore only set for 3.2.3 up to (but not including) 3.5.
#
# At the end of this file, HTMLParser is monkeypatched on Python 3.2
# releases older than 3.2.3, and the flag is then switched on for
# those releases as well.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (3, 2, 3) <= (major, minor, release) < (3, 5)
26
27from bs4.element import (
28 CData,
29 Comment,
30 Declaration,
31 Doctype,
32 ProcessingInstruction,
33 )
34from bs4.dammit import EntitySubstitution, UnicodeDammit
35
36from bs4.builder import (
37 HTML,
38 HTMLTreeBuilder,
39 STRICT,
40 )
41
42
# Feature string listed in HTMLParserTreeBuilder.features, next to the
# HTML and STRICT constants imported from bs4.builder.
HTMLPARSER = 'html.parser'
44
class BeautifulSoupHTMLParser(HTMLParser):
    """HTMLParser subclass that forwards every parse event to ``self.soup``.

    This class never sets ``soup`` itself; the attribute must be assigned
    on the instance before any markup is fed to it.
    """

    def handle_starttag(self, name, attrs):
        """Pass a start tag and its attributes on to the soup object.

        :param name: The tag name.
        :param attrs: An iterable of (key, value) pairs. A value of None
            is replaced with the empty string. If a key occurs more than
            once, the last value wins.
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            attr_dict[key] = value
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        """Pass an end tag on to the soup object."""
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Pass a run of character data on to the soup object."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Turn a numeric character reference into text data.

        :param name: The reference without its ``&#`` prefix and ``;``
            suffix: decimal digits, or ``x``/``X`` followed by hex digits.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError):
            # The code point cannot be represented as a character.
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Turn a named entity reference into text data.

        Names missing from EntitySubstitution.HTML_ENTITY_TO_CHARACTER
        are emitted literally as ``&name;``.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Emit ``data`` as a Comment, after flushing any pending text."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Emit a declaration as a Doctype, minus the DOCTYPE keyword."""
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Emit an unrecognized declaration as CData or Declaration.

        Data starting with ``CDATA[`` (compared case-insensitively) has
        that prefix removed and is emitted as CData. Anything else is
        emitted unchanged as a Declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Emit ``data`` as a ProcessingInstruction."""
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
126
127
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder that parses markup with BeautifulSoupHTMLParser."""

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        """Store the arguments used to construct each parser in feed().

        When CONSTRUCTOR_TAKES_STRICT is true, ``strict=False`` is forced
        into the keyword arguments.
        """
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Yield the markup in a form suitable for feed().

        Unicode markup is yielded unchanged. Anything else is run through
        UnicodeDammit, trying the user-specified encoding first and the
        document-declared encoding second.

        :return: A generator yielding exactly one 4-tuple (markup,
         original encoding, encoding declared within markup, whether any
         characters had to be replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse ``markup`` with a fresh BeautifulSoupHTMLParser.

        The parser is built from the arguments saved by __init__ and is
        pointed at ``self.soup`` before it is fed.

        :raises HTMLParseError: Re-raised after a RuntimeWarning has been
         issued, when the built-in parser cannot handle the document.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # A bare raise keeps the traceback of the original failure.
            raise
165
166# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
167# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
168# string.
169#
170# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Parse the start tag beginning at index ``i`` of ``self.rawdata``.

        Installed below as BeautifulSoupHTMLParser.parse_starttag.

        :param i: Index in ``self.rawdata`` where the tag starts.
        :return: The index just past the tag, or the negative value
            returned by check_for_whole_start_tag().
        """
        # This function is defined outside any class body, so Python does
        # not mangle double-underscore names here. The mangled name is
        # spelled out so that the attribute set is the one that
        # HTMLParser.get_starttag_text() reads.
        self._HTMLParser__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self._HTMLParser__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "=value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Drop the surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Leftover junk inside the tag: an error in strict mode,
            # otherwise the whole tag is passed through as text.
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Make the closing tag of ``elem`` the next interesting token.

        Installed below as BeautifulSoupHTMLParser.set_cdata_mode.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the patch in place, the tree builder's constructor passes
    # strict=False on these releases as well.
    CONSTRUCTOR_TAKES_STRICT = True
258 CONSTRUCTOR_TAKES_STRICT = True