# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
9
10import codecs
11from htmlentitydefs import codepoint2name
12import re
13import logging
14import string
15
# Pick a library for autodetecting character encodings, fastest first.
# Whichever one imports successfully supplies chardet_dammit().
chardet_type = None
try:
    # Preferred: the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Second choice: the pure Python implementation.
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
    except ImportError:
        # Neither detector is installed.
        def chardet_dammit(s):
            """Decline to guess: no detection library is available."""
            return None
    else:
        def chardet_dammit(s):
            """Return chardet's guess at the encoding of the bytestring `s`."""
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
else:
    def chardet_dammit(s):
        """Return cchardet's guess at the encoding of the bytestring `s`."""
        return cchardet.detect(s)['encoding']
38
# Optional extra codecs. Available from http://cjkpython.i18n.org/.
# The import is attempted only for its side effects; it is fine for it
# to be missing.
try:
    import iconv_codec
except ImportError:
    pass

# Both patterns are compiled from bytes because they are run against
# undecoded markup. Raw string literals are used so that the regex
# escapes (\?, \s) reach the re module untouched instead of being
# treated as (invalid) string escape sequences.

# Matches an XML declaration at the very start of the document and
# captures the value of its encoding="..." pseudo-attribute.
xml_encoding_re = re.compile(
    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)

# Matches a <meta ... charset=...> tag and captures the charset value.
html_meta_re = re.compile(
    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
49
class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables from `codepoint2name`.

        This runs once, while the class body is being executed.

        :return: A 3-tuple of (character -> entity name,
         entity name -> character, compiled regular expression that
         matches any single character that has a named entity). The
         double quote is left out of the first and third items.
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in codepoint2name.items():
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined entities in XML.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches an angle bracket, or an ampersand that does not begin
    # something that looks like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           r")")

    # Matches any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate named HTML entity for a matched character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with every character that has a named
         HTML entity (other than the double quote) replaced by that
         entity.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
193
194
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False):
        """Remember the markup (minus any byte-order mark) and the hints.

        :param markup: The document, as a bytestring.
        :param override_encodings: Encoding names to try before
         anything else.
        :param is_html: If True, <meta> tags are also searched for a
         declared encoding.
        """
        self.override_encodings = override_encodings or []
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Return True the first time a given encoding name is seen.

        Names are compared case-insensitively; `tried` (a set) is
        updated as a side effect. None is never usable.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, normally a bytestring.
        :return: A 2-tuple (data without the byte-order mark, encoding
         name or None).
        """
        encoding = None
        if isinstance(data, type(u'')):
            # Text that has already been decoded cannot carry a
            # byte-order mark; don't compare it against bytes.
            return data, encoding
        # The UTF-16 checks exclude a following pair of null bytes
        # because FF FE 00 00 is the UTF-32LE mark. The comparison must
        # be bytes-to-bytes: against a text literal it is always
        # unequal wherever bytes and text are distinct types, which
        # made UTF-32LE documents look like UTF-16LE.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document, as a bytestring.
        :param is_html: If True, fall back to looking for a <meta> tag
         when there is no XML declaration.
        :param search_entire_document: If False, only the first 1024
         bytes are searched for an XML declaration, and only the first
         2048 bytes (or 5% of the document, if that is more) for a
         <meta> tag.
        :return: The declared encoding, lowercased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # A junk declaration may contain non-ASCII bytes. Replace
            # them rather than raising: the resulting name simply
            # won't match any codec.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None
313
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Smart-quote replacement is only attempted when decoding from one
    # of these codecs.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    # Matches one byte in the range where Windows-1252 keeps its
    # "smart" punctuation. Compiled once rather than on every
    # conversion attempt.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False):
        """Convert `markup` to Unicode, trying the likeliest encodings first.

        On return, `unicode_markup` holds the decoded document (or None
        if every attempt failed) and `original_encoding` holds the
        codec that worked (or None).

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encoding names to try before
         anything else.
        :param smart_quotes_to: None to leave Windows-1252 smart
         punctuation alone, or one of 'ascii', 'xml' or 'html' to
         replace it before decoding.
        :param is_html: If True, <meta> tags are also searched for a
         declared encoding.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html

        self.detector = EncodingDetector(markup, override_encodings, is_html)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test against None, not truthiness: a document that decodes
        # to the empty string (for example one consisting only of a
        # byte-order mark) was converted successfully.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    logging.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER.")
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match of _SMART_QUOTES_RE against a bytestring.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (HTML entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = b'&#x' + sub[1].encode() + b';'
                else:
                    sub = b'&' + sub[0].encode() + b';'
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the proposed encoding.

        Each (codec, errors) combination is attempted at most once per
        instance. On success self.markup is replaced by the decoded
        text and self.original_encoding is set.

        :return: The decoded text, or None if the attempt was skipped
         or failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
                and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            #print "Trying to convert document to %s (errors=%s)" % (
            #    proposed, errors)
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure (bad bytes, unknown
            # codec, ...) just means this guess was wrong.
            #print "That didn't work!"
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return unicode(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The encoding the detector found declared in the document, or
        None if the document is not being treated as HTML."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Turn a charset name into a lowercase codec name.

        Tries, in order: the CHARSET_ALIASES translation, the name with
        hyphens removed, and the name with hyphens turned into
        underscores. If Python knows none of them the name itself is
        returned, lowercased. Returns None for an empty name.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None.

        A false value (None or '') is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of the Windows-1252 bytes \x80-\x9f to either
    # a plain replacement string or a tuple of
    # (HTML entity name, hexadecimal Unicode code point).
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8a': ('Scaron', '160'),
                b'\x8b': ('lsaquo', '2039'),
                b'\x8c': ('OElig', '152'),
                b'\x8d': '?',
                b'\x8e': ('#x17D', '17D'),
                b'\x8f': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178. The code point used to be empty, which made
                # the XML form the invalid reference "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        # NOTE(review): the copyright sign maps to nothing; '(c)' may
        # have been intended -- confirm before changing.
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # Every value here must be a plain string, because
        # _sub_ms_char() calls .encode() on it.
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        0xff : b'\xc3\xbf',     # ÿ
        }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :param in_bytes: The document, as a bytestring.
        :param main_encoding: Must name UTF-8 ('utf8' or 'utf-8').
        :param embedded_encoding: Must name Windows-1252
         ('windows-1252' or 'windows_1252').
        :raise NotImplementedError: If either encoding is anything else.
        """
        # After underscores are normalized to hyphens there is only
        # one spelling left to accept.
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end. (The continuation bytes are not checked.)
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
829