Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | """Use the HTMLParser library to parse HTML files that aren't too bad.""" |
| 2 | |
| 3 | __all__ = [ |
| 4 | 'HTMLParserTreeBuilder', |
| 5 | ] |
| 6 | |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 7 | from html.parser import HTMLParser |
| 8 | |
try:
    from html.parser import HTMLParseError
except ImportError:
    # HTMLParseError was removed from the standard library in Python
    # 3.5. Nothing can raise it on those versions, so an empty
    # stand-in keeps the `except HTMLParseError` clause below valid.
    class HTMLParseError(Exception):
        pass
| 16 | |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 17 | import sys |
| 18 | import warnings |
| 19 | |
| 20 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict' |
| 21 | # argument, which we'd like to set to False. Unfortunately, |
| 22 | # http://bugs.python.org/issue13273 makes strict=True a better bet |
| 23 | # before Python 3.2.3. |
| 24 | # |
| 25 | # At the end of this file, we monkeypatch HTMLParser so that |
| 26 | # strict=True works well on Python 3.2.2. |
# (major, minor, micro) of the running interpreter, e.g. (3, 4, 2).
major, minor, release = sys.version_info[:3]
# True only on Python 3.2.3+, where strict=True stopped being necessary
# (see http://bugs.python.org/issue13273 above).
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
# Python 3.3 deprecated the 'strict' constructor argument, so it must
# not be passed there.
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
# Python 3.4 added 'convert_charrefs'; the builder below passes False
# because this file does its own character-reference handling.
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
| 31 | |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 32 | |
| 33 | from bs4.element import ( |
| 34 | CData, |
| 35 | Comment, |
| 36 | Declaration, |
| 37 | Doctype, |
| 38 | ProcessingInstruction, |
| 39 | ) |
| 40 | from bs4.dammit import EntitySubstitution, UnicodeDammit |
| 41 | |
| 42 | from bs4.builder import ( |
| 43 | HTML, |
| 44 | HTMLTreeBuilder, |
| 45 | STRICT, |
| 46 | ) |
| 47 | |
| 48 | |
# Name of this tree builder, used as an entry in its 'features' list.
HTMLPARSER = 'html.parser'
| 50 | |
class BeautifulSoupHTMLParser(HTMLParser):
    """Translate html.parser callbacks into calls on a BeautifulSoup object.

    The tree builder assigns the BeautifulSoup object to ``self.soup``
    before feeding this parser any markup.
    """

    def handle_starttag(self, name, attrs):
        """Forward an opening tag and its attributes to the soup.

        :param name: The tag name.
        :param attrs: A list of (attribute name, value) pairs as
            delivered by html.parser; values may be None.
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            attr_dict[key] = value
        # html.parser has no namespace support, so namespace and
        # namespace prefix are always None.
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        """Forward a closing tag to the soup."""
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Forward a run of textual data to the soup."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Convert a numeric character reference (&#...;) to a string.

        :param name: The reference body, e.g. '65', 'x41' or 'X41'.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith(('x', 'X')):
            # Hexadecimal reference; skip the single leading x/X.
            real_name = int(name[1:], 16)
        else:
            real_name = int(name)

        try:
            data = chr(real_name)
        except (ValueError, OverflowError):
            # Code point out of range; substitute U+FFFD.
            data = "\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        """Convert a named entity reference (&amp; etc.) to a string.

        Unrecognized entities pass through verbatim as '&name;'.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Store a comment (<!-- ... -->) as a Comment object."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Store a <!DOCTYPE ...> declaration as a Doctype object."""
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Store a CDATA section or other unknown declaration."""
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Store a processing instruction (<?...>)."""
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)
| 125 | |
| 126 | |
class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """Tree builder that drives Python's built-in html.parser module."""

    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    def __init__(self, *args, **kwargs):
        """Store the arguments used to construct a parser in feed().

        Keyword arguments are adjusted to suit the running Python
        version (see the CONSTRUCTOR_* flags above).
        """
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            # BeautifulSoupHTMLParser handles character references
            # itself (handle_charref/handle_entityref), so disable the
            # stdlib's automatic conversion.
            kwargs['convert_charrefs'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :yield: 4-tuples (markup, original encoding, encoding declared
            within markup, whether any characters had to be replaced
            with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, str):
            # Already Unicode; no encoding detection needed.
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
                               exclude_encodings=exclude_encodings)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        """Parse the markup, building the tree on self.soup.

        :raises HTMLParseError: (pre-3.5 only) if html.parser cannot
            handle the document; a RuntimeWarning is issued first.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            # Bare raise re-raises the original exception with its
            # traceback intact.
            raise
| 169 | |
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Tolerant attribute-matching regex backported from Python 3.2.3's
    # html.parser: accepts slightly malformed attributes.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Backported regex locating the end of a start tag.
    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    # NOTE(review): tagfind and attrfind are private helpers of
    # html.parser; they exist on 3.2 but were removed from later
    # versions, which is why this import lives inside the version guard.
    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Backport of Python 3.2.3's HTMLParser.parse_starttag.

        Parses the start tag beginning at index i of self.rawdata and
        returns the index just past it, or a negative value if the tag
        is incomplete and more data is needed.
        """
        # NOTE(review): this function is defined at module level, so
        # '__starttag_text' is NOT name-mangled the way it would be
        # inside the HTMLParser class body — confirm nothing else reads
        # the mangled attribute.
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            # Incomplete tag; caller should wait for more data.
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                # Non-strict mode uses the tolerant regex above.
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Valueless attribute, e.g. <input disabled>.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Malformed start tag: compute its position for the error
            # message, then (in non-strict mode) emit it as plain data.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                     - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # <script>, <style>, etc.: suspend tag parsing until
                # the matching end tag.
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Treat everything up to the closing </elem> tag as raw data."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the backported code in place, strict=True is safe on this
    # interpreter after all.
    CONSTRUCTOR_TAKES_STRICT = True