| Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- | 
|  | 2 | """Beautiful Soup bonus library: Unicode, Dammit | 
|  | 3 |  | 
|  | 4 | This library converts a bytestream to Unicode through any means | 
|  | 5 | necessary. It is heavily based on code from Mark Pilgrim's Universal | 
|  | 6 | Feed Parser. It works best on HTML and XML, but it does not rewrite the | 
|  | 7 | XML or HTML to reflect a new encoding; that's the tree builder's job. | 
|  | 8 | """ | 
|  | 9 |  | 
|  | 10 | import codecs | 
|  | 11 | from htmlentitydefs import codepoint2name | 
|  | 12 | import re | 
|  | 13 | import logging | 
|  | 14 | import string | 
|  | 15 |  | 
# Pick a character-encoding autodetection library, best one first.
# Whichever branch is taken defines chardet_dammit(s), which returns the
# encoding name guessed for the bytestring s, or None when no guess is
# available.
chardet_type = None
try:
    # Preferred: the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
    except ImportError:
        # No detector is installed, so never offer a guess.
        def chardet_dammit(s):
            return None
    else:
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
else:
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
|  | 38 |  | 
|  | 39 | # Available from http://cjkpython.i18n.org/. | 
|  | 40 | try: | 
|  | 41 | import iconv_codec | 
|  | 42 | except ImportError: | 
|  | 43 | pass | 
|  | 44 |  | 
# Matches the encoding named by an XML declaration at the very start of
# a bytestring; group 1 is the encoding name. Written as a raw bytes
# literal so that regex escapes such as \? are not (mis)read as string
# escapes.
xml_encoding_re = re.compile(
    br'''^<\?.*encoding=['"](.*?)['"].*\?>''', re.I)
# Matches a charset=... setting inside an HTML <meta> tag; group 1 is
# the charset name.
html_meta_re = re.compile(
    br'''<\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]''', re.I)
|  | 49 |  | 
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables used by this class.

        Called once, while the class body is being executed.

        :return: A 3-tuple of (dict mapping a character to its entity
         name, dict mapping an entity name to its character, compiled
         regular expression matching any one character that should be
         replaced by a named HTML entity).
        """
        try:
            to_character = unichr  # Python 2
        except NameError:
            to_character = chr  # Python 3
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = to_character(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # An angle bracket, or an ampersand that does not begin something
    # shaped like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the named
        HTML entity for the matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quotation marks.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with every character that has a named HTML
         entity (other than the quotation mark) replaced by it.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
|  | 193 |  | 
|  | 194 |  | 
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """Set up the detector and strip any byte-order mark.

        :param markup: The document, normally a bytestring.
        :param override_encodings: Encoding names to suggest before
         any others.
        :param is_html: If True, a <meta> tag is also searched for a
         declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Report whether an encoding is worth suggesting.

        :param encoding: An encoding name, or None.
        :param tried: Set of lower-cased names already suggested; a
         usable name is added to it.
        :return: True if `encoding` is not None and has not been
         suggested before, otherwise False.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, normally a bytestring.
        :return: A 2-tuple (data without the byte-order mark, encoding
         name or None if no byte-order mark was recognized).
        """
        encoding = None
        # The UTF-16 marks are prefixes of (or share bytes with) the
        # UTF-32 marks, so a UTF-16 mark followed by two NUL bytes is
        # left for the UTF-32 checks below. The comparison must be
        # against a bytestring: comparing bytes to a text string is
        # always unequal on Python 3.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True and no XML declaration is found, also
         look for a <meta> tag.
        :param search_entire_document: If True, search the whole
         document rather than only its beginning.
        :return: The declared encoding name in lower case, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # 'replace' keeps a garbage (non-ASCII) declaration from
            # raising; such a name simply won't resolve to a codec.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
|  | 313 |  | 
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Detect the encoding of `markup` and convert it to Unicode.

        The result is stored in `unicode_markup` (None if every
        encoding failed) and the encoding that worked in
        `original_encoding`.

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encoding names to try before any
         others. None means no overrides.
        :param smart_quotes_to: None to leave Microsoft smart quotes
         alone; 'ascii' to replace them with ASCII approximations;
         'xml' to replace them with numeric character references; any
         other value to replace them with named HTML entities.
        :param is_html: If True, the document is treated as HTML when
         looking for a declared encoding.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(markup, override_encodings, is_html)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A regular expression match whose group 1 is the
         single byte to replace.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode the markup using one proposed encoding.

        On success, `markup` becomes the decoded string and
        `original_encoding` the codec name that worked.

        :param proposed: An encoding name.
        :param errors: Error handling scheme passed to the decoder.
        :return: The decoded string, or None if the encoding is
         unknown, was already tried with this `errors` value, or
         failed to decode the markup.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this encoding
            # is not the right one, and the caller moves on to the next.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return unicode(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding found declared in the document by the detector,
        or None if this document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name to a lower-cased codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores. If Python knows none of
        those, the name itself is returned, lower-cased.

        :param charset: A charset name, or None.
        :return: A lower-cased name, or None if `charset` is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name.

        :return: `charset` unchanged if it is empty or names a known
         codec, otherwise None.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (entity name, hexadecimal code point); plain
    # strings are used verbatim.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # LATIN CAPITAL LETTER Y WITH DIAERESIS is U+0178; an
                # empty code here would produce the invalid "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :param in_bytes: The bytestring to repair.
        :param main_encoding: Must be 'utf8' or 'utf-8' (any case).
        :param embedded_encoding: Must be 'windows-1252' (any case,
         hyphen or underscore).
        :raise NotImplementedError: For any other pair of encodings.
        """
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
|  | 829 |  |