"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]
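
# A minimal usage sketch: BeautifulSoup selects this builder when the
# 'html.parser' feature is requested:
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup("<a href='http://foo.com/'>link</a>", "html.parser")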

from HTMLParser import (
    HTMLParser,
    HTMLParseError,
    )
import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
    major > 3
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))
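
# For example, sys.version_info[:3] is (3, 2, 2) on Python 3.2.2, so
# CONSTRUCTOR_TAKES_STRICT is False there but True on 3.2.3 and later.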

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'

class BeautifulSoupHTMLParser(HTMLParser):
    def handle_starttag(self, name, attrs):
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
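            # For example, the valueless attribute in '<input disabled>'
            # arrives here as ('disabled', None) and is stored as
            # {'disabled': ''}.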
            if value is None:
                value = ''
            attr_dict[key] = value
        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
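        #
        # For example, '&#x41;' arrives here as name='x41' and '&#65;'
        # as name='65'; both should produce u'A'.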
        try:
            if name.startswith('x') or name.startswith('X'):
                real_name = int(name[1:], 16)
            else:
                real_name = int(name)
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            # A malformed or out-of-range reference becomes U+FFFD.
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
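        # For example, name='amp' becomes u'&'; an unrecognized entity
        # such as '&bogus;' is passed through as the literal text
        # '&bogus;'.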
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
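        # For example, '<!DOCTYPE html>' arrives here as
        # data='DOCTYPE html'; the 'DOCTYPE ' prefix is stripped so the
        # Doctype node contains just 'html'.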
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        elif data == 'DOCTYPE':
            # i.e. "<!DOCTYPE>"
            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
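        # For example, '<![CDATA[x < y]]>' should arrive here as
        # data='CDATA[x < y'; the 'CDATA[' prefix is stripped and the
        # rest becomes a CData node.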
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
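            #
            # For example, '<?xml version="1.0"?>' arrives here as
            # data='xml version="1.0"?' and is stored without the
            # trailing '?'.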
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :yield: A 4-tuple (markup, original encoding, encoding
            declared within markup, whether any characters had to be
            replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            yield (markup, None, None, False)
            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)

    def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
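
    # Illustrative example, assuming the tolerant regex behaves like
    # the 3.2.3 version: attrfind_tolerant.match(' id=main') captures
    # attrname 'id' and the unquoted attrvalue 'main'.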

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
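
    # The patched set_cdata_mode above matches closing tags
    # case-insensitively and tolerates whitespace, so e.g. '</SCRIPT >'
    # is still recognized as the end of a script block.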

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True