blob: 68d419feb5dfb0957742156581e1a3dedda94b91 [file] [log] [blame]
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
Patrick Williamsc0f7c042017-02-23 20:41:17 -06009__license__ = "MIT"
Patrick Williamsc124f4f2015-09-15 14:41:29 -050010
Patrick Williamsc0f7c042017-02-23 20:41:17 -060011from pdb import set_trace
Patrick Williamsc124f4f2015-09-15 14:41:29 -050012import codecs
Patrick Williamsc0f7c042017-02-23 20:41:17 -060013from html.entities import codepoint2name
Patrick Williamsc124f4f2015-09-15 14:41:29 -050014import re
15import logging
16import string
17
# Pick a character-encoding autodetection backend, if one is installed.
# Whatever happens, ``chardet_dammit(s)`` ends up defined: it returns the
# name of the detected encoding, or None when no detector is available.
chardet_type = None
try:
    # Preferred: the fast C implementation.
    # PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet
    except ImportError:
        # Neither library is installed, so no guess can be made.
        def chardet_dammit(s):
            return None
    else:
        def chardet_dammit(s):
            return chardet.detect(s)['encoding']
else:
    def chardet_dammit(s):
        return cchardet.detect(s)['encoding']
40
41# Available from http://cjkpython.i18n.org/.
42try:
43 import iconv_codec
44except ImportError:
45 pass
46
# Matches the encoding named in an XML declaration at the very start of
# a bytestring, e.g. b'<?xml version="1.0" encoding="utf-8"?>'.
# Group 1 is the encoding name.
#
# Raw bytes literals are used so that the regex escapes (\? and \s) are
# not treated as (invalid) string escapes by the Python compiler.
xml_encoding_re = re.compile(
    rb'''^<\?.*encoding=['"](.*?)['"].*\?>''', re.I)

# Matches the charset named in an HTML <meta> tag, e.g.
# b'<meta charset="utf-8">' or
# b'<meta http-equiv="Content-type" content="text/html; charset=utf-8">'.
# Group 1 is the charset name.
html_meta_re = re.compile(
    rb'''<\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]''', re.I)
51
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity tables from html.entities.codepoint2name.

        Called once, while the class body is being executed.

        :return: A 3-tuple ``(lookup, reverse_lookup, regex)`` where
         `lookup` maps a character to its entity name, `reverse_lookup`
         maps an entity name to its character, and `regex` matches any
         single character that has an entry in `lookup`.
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Matches an angle bracket, or an ampersand that does not begin a
    # decimal, hexadecimal, or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    # Matches any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for a matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The string to quote.
        :return: The string, wrapped in quote characters.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: The string to process.
        :return: The string with every character that has a named HTML
         entity (other than the double quote) replaced by that entity.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
195
196
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Constructor.

        :param markup: The document whose encoding is to be guessed.
        :param override_encodings: Encodings to suggest before any other.
        :param is_html: If True, a <meta> tag is also consulted when
         looking for a declared encoding.
        :param exclude_encodings: Encodings that must never be suggested
         (compared case-insensitively).
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether `encoding` should be suggested.

        :param encoding: An encoding name, or None.
        :param tried: The set of lowercased names already suggested; a
         newly accepted name is added to it.
        :return: True if the encoding is not None, not excluded, and has
         not been suggested before; False otherwise.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: A bytestring, or a Unicode string (returned as is).
        :return: A 2-tuple ``(data, encoding)``. `encoding` is None when
         no byte-order mark was found.
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks come first, so they must refuse a mark that
        # is followed by two NUL bytes: FF FE 00 00 is the UTF-32LE
        # mark. The comparison has to be bytes-to-bytes; comparing
        # against a str is always unequal in Python 3.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, fall back to looking for a <meta> tag
         when there is no XML declaration.
        :param search_entire_document: If False, only the start of the
         document is searched (the first 1024 bytes for an XML
         declaration; the first 2048 bytes or 5% of the document,
         whichever is larger, for a <meta> tag).
        :return: The lowercased encoding name, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.group(1).decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
323
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    # Matches a single byte in the range where Windows-1252 keeps its
    # "smart" punctuation. Compiled once rather than on every call.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Convert `markup` to Unicode, trying a series of encodings.

        On return, `unicode_markup` holds the converted document (or
        None if nothing worked), `original_encoding` the encoding that
        worked (None if the input was already Unicode, if nothing
        worked, or if the result is empty), and
        `contains_replacement_characters` whether the conversion only
        succeeded by substituting REPLACEMENT CHARACTER.

        :param markup: A bytestring, or an already-decoded string.
        :param override_encodings: Encodings to try before any other.
        :param smart_quotes_to: None to leave Windows-1252 smart
         punctuation alone, or 'ascii', 'xml', or anything else (e.g.
         'html') to replace it with ASCII text, XML numeric references,
         or HTML named entities respectively.
        :param is_html: If True, the document is treated as HTML when
         looking for a declared encoding.
        :param exclude_encodings: Encodings that must never be tried.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test against None, not truthiness: an empty document decodes
        # successfully to '' and must not trigger the lossy retry.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match object whose group 1 is a single byte.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            return self.MS_CHARS_TO_ASCII.get(orig).encode()
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            # (HTML entity name, hexadecimal code point)
            if self.smart_quotes_to == 'xml':
                return b'&#x' + sub[1].encode() + b';'
            return b'&' + sub[0].encode() + b';'
        # A plain string is used as is, whatever the target.
        return sub.encode()

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode `self.markup` using the proposed encoding.

        On success, `self.markup` is replaced by the decoded string and
        `self.original_encoding` is set to the codec name.

        :param proposed: The name of an encoding.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded string, or None if the encoding is
         unusable, was already tried with the same `errors`, or failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this encoding
            # is not the right one, and the caller moves on to the next.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document, or
        None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name itself),
        the name with hyphens removed, and the name with hyphens turned
        into underscores. If Python knows none of them, the name itself
        is returned.

        :param charset: The name of a character set, or None.
        :return: A lowercased name, or None if `charset` is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name.

        :return: `charset` itself when it is empty or a known codec
         name; None otherwise.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # A tuple is (HTML entity name, hexadecimal code point); a plain
    # string is a literal replacement.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # LATIN CAPITAL LETTER Y WITH DIAERESIS is U+0178.
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # NOTE(review): the only non-string value in this table. It is
        # never reached by _sub_ms_char (which only sees \x80-\x9f);
        # kept as is in case outside code relies on it.
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    def _build_windows_1252_to_utf8():
        """Map each Windows-1252 byte value in 0x80-0xFF to the UTF-8
        encoding of the character it stands for.

        Called once, while the class body is being executed. Deriving
        the table from the standard codec keeps every entry consistent
        with the codec.
        """
        table = {}
        for byte in range(0x80, 0x100):
            try:
                character = bytes([byte]).decode("windows-1252")
            except UnicodeDecodeError:
                # \x81, \x8d, \x8f, \x90, and \x9d are undefined in
                # Windows-1252.
                continue
            table[byte] = character.encode("utf-8")
        return table

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents. Keys are integer byte
    # values; values are UTF-8 bytestrings, e.g. 0x93 -> b'\xe2\x80\x9c'.
    WINDOWS_1252_TO_UTF8 = _build_windows_1252_to_utf8()

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        Any byte in the range C2-F4 is assumed to start a UTF-8
        multibyte character and is skipped along with the bytes that
        follow it; no check is made that those bytes are really
        continuation bytes.

        :param in_bytes: The bytestring to fix.
        :param main_encoding: Must be 'utf8' or 'utf-8' (any case).
        :param embedded_encoding: Must be 'windows-1252' or
         'windows_1252' (any case).
        :raise NotImplementedError: If either encoding is unsupported.
        :return: The fixed bytestring; `in_bytes` itself if nothing
         needed fixing.
        """
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing gave a one-character string rather than an
                # integer (Python 2.x).
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
840