# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
__license__ = "MIT"

# Standard-library modules used throughout this file.
import codecs
import logging
import re
from html.entities import codepoint2name

# Import a library to autodetect character encodings.
#
# Whichever branch succeeds defines chardet_dammit(s), which returns the
# detector's guess at the encoding of `s`, or always None when no
# detection library can be imported.
chardet_type = None
try:
    # First try the fast C implementation.
    # PyPI package: cchardet
    import cchardet

    def chardet_dammit(s):
        """Return cchardet's guess at the encoding of `s`."""
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet

        def chardet_dammit(s):
            """Return chardet's guess at the encoding of `s`."""
            return chardet.detect(s)['encoding']
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Return None: no encoding detection library is installed."""
            return None

# Both patterns are bytes patterns: they are meant to be run against
# markup that has not been decoded yet.

# Captures the encoding named in an XML declaration at the very start of
# the document, e.g. <?xml version="1.0" encoding="utf-8"?>.
xml_encoding_re = re.compile(
    rb'^<\?.*encoding=[\'"](.*?)[\'"].*\?>', re.I)

# Captures the value of a charset= declaration inside a <meta> tag.
html_meta_re = re.compile(
    rb'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]', re.I)

class EntitySubstitution:
    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity tables used by this class.

        Called once, while the class body is being executed.

        :return: A 3-tuple of (character -> entity name,
            entity name -> character, compiled regex matching any single
            character that has a named HTML entity). The quotation mark
            is deliberately absent from the first mapping and from the
            regex.
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        # Escape each character so that none of them can be interpreted
        # as regex syntax inside the character class.
        re_definition = "[%s]" % "".join(
            re.escape(character) for character in characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # An angle bracket, or an ampersand that does not start something
    # shaped like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    # Any angle bracket or ampersand at all.
    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for the matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quotation marks.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to process.
        :return: `s` with every character that has a named HTML entity
            (other than the quotation mark) replaced by that entity.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    3. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    4. UTF-8.

    5. Windows-1252.
    """

    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Set up the detector and strip any byte-order mark.

        :param markup: The document, as a bytestring (a Unicode string
            is stored unchanged).
        :param override_encodings: Encoding names to suggest first.
        :param is_html: If True, a <meta> charset declaration is also
            looked for when searching for a declared encoding.
        :param exclude_encodings: Encoding names that must never be
            suggested; compared case-insensitively.
        """
        self.override_encodings = override_encodings or []
        self.exclude_encodings = {
            name.lower() for name in exclude_encodings or []}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether `encoding` should be suggested now.

        :param encoding: An encoding name, or None.
        :param tried: Set of lower-cased names already suggested; a
            usable encoding is added to it.
        :return: True if the encoding is not None, not excluded, and
            has not been suggested before.
        """
        if encoding is None:
            return False
        encoding = encoding.lower()
        if encoding in self.exclude_encodings or encoding in tried:
            return False
        tried.add(encoding)
        return True

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, as a bytestring or a Unicode string.
        :return: A 2-tuple (data without the byte-order mark, encoding
            name or None).
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The trailing-bytes checks must compare bytes with bytes: a
        # UTF-32LE mark (ff fe 00 00) begins with the UTF-16LE mark
        # (ff fe), and only the two NUL bytes tell them apart.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, fall back to looking for a <meta>
            charset declaration when no XML declaration is found.
        :param search_entire_document: If False, only the beginning of
            the document is searched.
        :return: The declared encoding, lower-cased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None

class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {
        "macintosh": "mac-roman",
        "x-sjis": "shift-jis",
    }

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    # Matches one byte in the range covered by MS_CHARS. Compiled once
    # here rather than on every conversion attempt.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Convert `markup` to Unicode, trying the detector's encodings in order.

        On return, `unicode_markup` holds the converted document (None
        if no encoding worked) and `original_encoding` the codec that
        produced it.

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encoding names to try first.
        :param smart_quotes_to: 'ascii', 'xml', or any other non-None
            value (treated as HTML) to replace Microsoft smart quotes;
            None leaves them alone.
        :param is_html: Passed to the EncodingDetector.
        :param exclude_encodings: Encoding names never to try.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test for None explicitly: an empty document decodes to '',
        # which is a success, not a reason to retry with replacement.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match of _SMART_QUOTES_RE; group 1 is the byte.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            return self.MS_CHARS_TO_ASCII.get(orig).encode()

        sub = self.MS_CHARS.get(orig)
        if not isinstance(sub, tuple):
            # Plain-string entries are used as they are.
            return sub.encode()
        if self.smart_quotes_to == 'xml':
            # Second element: hexadecimal code point.
            return b'&#x' + sub[1].encode() + b';'
        # First element: HTML entity name.
        return b'&' + sub[0].encode() + b';'

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the proposed encoding.

        On success self.markup is replaced by the decoded string and
        self.original_encoding records the codec used.

        :param proposed: An encoding name.
        :param errors: Error handling scheme passed to the decoder.
        :return: The decoded string, or None if the codec is unknown,
            was already tried with these `errors`, or decoding failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately best-effort: any failure just means "this
            # encoding didn't work, try the next one".
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document,
        or None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name to a codec name Python can look up.

        Tries, in order: the CHARSET_ALIASES translation, the name with
        hyphens removed, and the name with hyphens turned into
        underscores. If none is a known codec, the name itself is used.

        :param charset: An encoding name, or None.
        :return: A lower-cased codec name, or None if `charset` is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None.

        An empty or None `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point);
    # plain-string values are substituted literally.
    MS_CHARS = {
        b'\x80': ('euro', '20AC'),
        b'\x81': ' ',
        b'\x82': ('sbquo', '201A'),
        b'\x83': ('fnof', '192'),
        b'\x84': ('bdquo', '201E'),
        b'\x85': ('hellip', '2026'),
        b'\x86': ('dagger', '2020'),
        b'\x87': ('Dagger', '2021'),
        b'\x88': ('circ', '2C6'),
        b'\x89': ('permil', '2030'),
        b'\x8A': ('Scaron', '160'),
        b'\x8B': ('lsaquo', '2039'),
        b'\x8C': ('OElig', '152'),
        b'\x8D': '?',
        b'\x8E': ('#x17D', '17D'),
        b'\x8F': '?',
        b'\x90': '?',
        b'\x91': ('lsquo', '2018'),
        b'\x92': ('rsquo', '2019'),
        b'\x93': ('ldquo', '201C'),
        b'\x94': ('rdquo', '201D'),
        b'\x95': ('bull', '2022'),
        b'\x96': ('ndash', '2013'),
        b'\x97': ('mdash', '2014'),
        b'\x98': ('tilde', '2DC'),
        b'\x99': ('trade', '2122'),
        b'\x9a': ('scaron', '161'),
        b'\x9b': ('rsaquo', '203A'),
        b'\x9c': ('oelig', '153'),
        b'\x9d': '?',
        b'\x9e': ('#x17E', '17E'),
        # U+0178; an empty code point here would produce "&#x;".
        b'\x9f': ('Yuml', '178'),
    }

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80': 'EUR',
        b'\x81': ' ',
        b'\x82': ',',
        b'\x83': 'f',
        b'\x84': ',,',
        b'\x85': '...',
        b'\x86': '+',
        b'\x87': '++',
        b'\x88': '^',
        b'\x89': '%',
        b'\x8a': 'S',
        b'\x8b': '<',
        b'\x8c': 'OE',
        b'\x8d': '?',
        b'\x8e': 'Z',
        b'\x8f': '?',
        b'\x90': '?',
        b'\x91': "'",
        b'\x92': "'",
        b'\x93': '"',
        b'\x94': '"',
        b'\x95': '*',
        b'\x96': '-',
        b'\x97': '--',
        b'\x98': '~',
        b'\x99': '(TM)',
        b'\x9a': 's',
        b'\x9b': '>',
        b'\x9c': 'oe',
        b'\x9d': '?',
        b'\x9e': 'z',
        b'\x9f': 'Y',
        b'\xa0': ' ',
        b'\xa1': '!',
        b'\xa2': 'c',
        b'\xa3': 'GBP',
        b'\xa4': '$',  # This approximation is especially parochial--this is the
                       # generic currency symbol.
        b'\xa5': 'YEN',
        b'\xa6': '|',
        b'\xa7': 'S',
        b'\xa8': '..',
        b'\xa9': '',
        b'\xaa': '(th)',
        b'\xab': '<<',
        b'\xac': '!',
        b'\xad': ' ',
        b'\xae': '(R)',
        b'\xaf': '-',
        b'\xb0': 'o',
        b'\xb1': '+-',
        b'\xb2': '2',
        b'\xb3': '3',
        # NOTE: the only tuple in this table, kept as-is for
        # compatibility. _sub_ms_char never reads it, because
        # _SMART_QUOTES_RE only matches \x80-\x9f.
        b'\xb4': ("'", 'acute'),
        b'\xb5': 'u',
        b'\xb6': 'P',
        b'\xb7': '*',
        b'\xb8': ',',
        b'\xb9': '1',
        b'\xba': '(th)',
        b'\xbb': '>>',
        b'\xbc': '1/4',
        b'\xbd': '1/2',
        b'\xbe': '3/4',
        b'\xbf': '?',
        b'\xc0': 'A',
        b'\xc1': 'A',
        b'\xc2': 'A',
        b'\xc3': 'A',
        b'\xc4': 'A',
        b'\xc5': 'A',
        b'\xc6': 'AE',
        b'\xc7': 'C',
        b'\xc8': 'E',
        b'\xc9': 'E',
        b'\xca': 'E',
        b'\xcb': 'E',
        b'\xcc': 'I',
        b'\xcd': 'I',
        b'\xce': 'I',
        b'\xcf': 'I',
        b'\xd0': 'D',
        b'\xd1': 'N',
        b'\xd2': 'O',
        b'\xd3': 'O',
        b'\xd4': 'O',
        b'\xd5': 'O',
        b'\xd6': 'O',
        b'\xd7': '*',
        b'\xd8': 'O',
        b'\xd9': 'U',
        b'\xda': 'U',
        b'\xdb': 'U',
        b'\xdc': 'U',
        b'\xdd': 'Y',
        b'\xde': 'b',
        b'\xdf': 'B',
        b'\xe0': 'a',
        b'\xe1': 'a',
        b'\xe2': 'a',
        b'\xe3': 'a',
        b'\xe4': 'a',
        b'\xe5': 'a',
        b'\xe6': 'ae',
        b'\xe7': 'c',
        b'\xe8': 'e',
        b'\xe9': 'e',
        b'\xea': 'e',
        b'\xeb': 'e',
        b'\xec': 'i',
        b'\xed': 'i',
        b'\xee': 'i',
        b'\xef': 'i',
        b'\xf0': 'o',
        b'\xf1': 'n',
        b'\xf2': 'o',
        b'\xf3': 'o',
        b'\xf4': 'o',
        b'\xf5': 'o',
        b'\xf6': 'o',
        b'\xf7': '/',
        b'\xf8': 'o',
        b'\xf9': 'u',
        b'\xfa': 'u',
        b'\xfb': 'u',
        b'\xfc': 'u',
        b'\xfd': 'y',
        b'\xfe': 'b',
        b'\xff': 'y',
    }

    def _build_windows_1252_to_utf8():
        """Map each defined Windows-1252 byte 0x80-0xff to its UTF-8 bytes.

        Called once, while the class body is being executed. Deriving
        the table from the standard codec avoids hand-typed mistakes.
        Bytes the codec leaves undefined get no entry.
        """
        table = {}
        for byte in range(0x80, 0x100):
            try:
                character = bytes([byte]).decode("windows-1252")
            except UnicodeDecodeError:
                continue
            table[byte] = character.encode("utf-8")
        return table

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = _build_windows_1252_to_utf8()

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2),  # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3),  # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4),  # 4-byte characters start with F0-F4
    ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raises NotImplementedError: If either encoding is not one of
            the supported ones.
        """
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing did not give an int (e.g. a str was passed).
                byte = ord(byte)
            if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if start <= byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
832