Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | """Beautiful Soup bonus library: Unicode, Dammit |
| 3 | |
| 4 | This library converts a bytestream to Unicode through any means |
| 5 | necessary. It is heavily based on code from Mark Pilgrim's Universal |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 7 | XML or HTML to reflect a new encoding; that's the tree builder's job. |
| 8 | """ |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 9 | __license__ = "MIT" |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 10 | |
| 11 | import codecs |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 12 | from html.entities import codepoint2name |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 13 | import re |
| 14 | import logging |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 15 | |
| 16 | # Import a library to autodetect character encodings. |
# Choose a backend for chardet_dammit(), the module's hook into
# third-party character-encoding autodetection.
chardet_type = None
try:
    # Preferred: the fast C implementation.
    # PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
    except ImportError:
        # Neither library is installed, so there is never a guess to offer.
        def chardet_dammit(s):
            """Stand-in detector used when no detection library exists."""
            return None
    else:
        def chardet_dammit(s):
            """Return chardet's guess at the encoding of bytestring `s`."""
            guess = chardet.detect(s)
            return guess['encoding']
else:
    def chardet_dammit(s):
        """Return cchardet's guess at the encoding of bytestring `s`."""
        guess = cchardet.detect(s)
        return guess['encoding']
| 38 | |
# Captures the value of the encoding pseudo-attribute in an XML
# declaration that opens the (byte) document.
xml_encoding_re = re.compile(
    rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.IGNORECASE)
# Captures the charset named inside an HTML <meta> tag of a (byte) document.
html_meta_re = re.compile(
    rb'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]', re.IGNORECASE)
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 43 | |
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables and their matching regex.

        Called once while the class body executes (hence no `self`/`cls`).

        :return: A 3-tuple of (character -> entity name,
            entity name -> character, compiled regular expression
            matching any single character that has a named entity).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined entities in XML.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not begin
    # something shaped like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    # Matches any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for a matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to process.
        :return: The string with every character that has a named HTML
            entity (other than the double quote) replaced by that entity.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
| 188 | |
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Set up the detector and strip any byte-order mark.

        :param markup: The document, normally a bytestring.
        :param override_encodings: Encoding names to suggest before
            any others.
        :param is_html: If True, a <meta> tag is also searched for a
            declared encoding.
        :param exclude_encodings: Encoding names that must never be
            suggested; compared case-insensitively.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether `encoding` should be suggested.

        An encoding is usable when it is not None, not excluded, and
        has not been suggested before. A usable encoding is recorded
        in `tried` (lowercased) as a side effect.

        :param encoding: An encoding name, or None.
        :param tried: The set of lowercased names already suggested.
        :return: True if the encoding should be yielded to the caller.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document; a `str` is returned untouched.
        :return: A 2-tuple (data without the byte-order mark, implied
            encoding name or None).
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 marks are prefixes of the UTF-32LE mark
        # (ff fe 00 00), so a UTF-16 mark followed by two NUL bytes is
        # left for the UTF-32 checks below. The comparison must be
        # bytes-to-bytes: a bytes slice never equals a str.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, fall back to looking for a <meta> tag
            when no XML declaration is found.
        :param search_entire_document: If True, search all of `markup`
            rather than only its beginning.
        :return: The declared encoding, lowercased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Only look near the top: the first 1024 bytes for an XML
            # declaration; the first 2048 bytes or 5% of the document,
            # whichever is larger, for a <meta> tag.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # The patterns match bytes; non-ASCII junk in the name is
            # replaced rather than allowed to raise.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
| 315 | |
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    # Matches one byte in the 0x80-0x9f range, where Windows-1252 keeps
    # its smart quotes and other typographic characters. Compiled once
    # here rather than on every conversion attempt.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Detect the encoding of `markup` and convert it to Unicode.

        On return, `unicode_markup` holds the converted document (or
        None if every candidate encoding failed) and
        `original_encoding` the encoding that worked (or None).

        :param markup: The document, as a bytestring or a `str`.
        :param override_encodings: Encoding names to try first.
        :param smart_quotes_to: How to rewrite bytes 0x80-0x9f when the
            candidate encoding is one of ENCODINGS_WITH_SMART_QUOTES:
            'ascii', 'xml', any other non-None value for named HTML
            entities, or None to leave them alone.
        :param is_html: If True, the document is treated as HTML when
            looking for a declared encoding.
        :param exclude_encodings: Encoding names never to try.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test against None, not truthiness: an empty document decodes
        # successfully to '' and must not be treated as a failure.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match object whose group 1 is a single byte.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            return self.MS_CHARS_TO_ASCII[orig].encode()
        sub = self.MS_CHARS[orig]
        if isinstance(sub, tuple):
            # (HTML entity name, hexadecimal code point)
            html_name, xml_code = sub
            if self.smart_quotes_to == 'xml':
                return b'&#x' + xml_code.encode() + b';'
            return b'&' + html_name.encode() + b';'
        # Bytes with no entity equivalent map to a plain string.
        return sub.encode()

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode the markup using one proposed encoding.

        On success `self.markup` is replaced by the decoded string and
        `self.original_encoding` records the codec name that worked.

        :param proposed: An encoding name; resolved through find_codec.
        :param errors: The error-handling scheme passed to the decoder.
        :return: The decoded string, or None if the encoding is
            unknown, was already tried with the same `errors`, or
            failed to decode the markup.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means "this
            # encoding is not the right one".
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases"""
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document,
        or None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Turn a charset name into a lowercased codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores. If Python knows none of
        those, the name is returned lowercased anyway.

        :param charset: A charset name, or None.
        :return: A lowercased name, or None for an empty `charset`.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name.

        :return: `charset` unchanged when it is empty or is a known
            codec name; None otherwise.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point); plain
    # strings are used for bytes that have no entity equivalent.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178; an empty code here would emit the invalid "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # A plain string like every other entry, so .encode() works on it.
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents. Keys are byte values
    # (ints); values are the UTF-8 encoding of the character that byte
    # stands for in Windows-1252.
    #
    # The table is derived from Python's own codec rather than typed
    # out by hand, so no entry can be mistyped.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252, so they are skipped.
    WINDOWS_1252_TO_UTF8 = {
        byte: bytes([byte]).decode('windows-1252').encode('utf-8')
        for byte in range(0x80, 0x100)
        if byte not in (0x81, 0x8d, 0x8f, 0x90, 0x9d)
    }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :param in_bytes: The bytestring to repair.
        :param main_encoding: Must be 'utf8' or 'utf-8' (any case).
        :param embedded_encoding: Must be 'windows-1252' or
            'windows_1252' (any case).
        :raise NotImplementedError: For any other encoding names.
        """
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing gave a one-character string rather than an
                # int (Python 2.x behavior).
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                # NOTE(review): the continuation bytes are not
                # validated, so a Windows-1252 byte in this range is
                # taken for a UTF-8 lead byte and skipped.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
| 832 | |