Blame - import-layers/yocto-poky/bitbake/lib/bs4/dammit.py - stefanberger/openbmc

blob: 59640b7ce3a0f1386fdca863cd7eb95a3942a3ee [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	# -- coding: utf-8 --
				2	"""Beautiful Soup bonus library: Unicode, Dammit
				3
				4	This library converts a bytestream to Unicode through any means
				5	necessary. It is heavily based on code from Mark Pilgrim's Universal
				6	Feed Parser. It works best on XML and HTML, but it does not rewrite the
				7	XML or HTML to reflect a new encoding; that's the tree builder's job.
				8	"""
				9
				10	import codecs
				11	from htmlentitydefs import codepoint2name
				12	import re
				13	import logging
				14	import string
				15
				16	# Import a library to autodetect character encodings.
				17	chardet_type = None
				18	try:
				19	# First try the fast C implementation.
				20	# PyPI package: cchardet
				21	import cchardet
				22	def chardet_dammit(s):
				23	return cchardet.detect(s)['encoding']
				24	except ImportError:
				25	try:
				26	# Fall back to the pure Python implementation
				27	# Debian package: python-chardet
				28	# PyPI package: chardet
				29	import chardet
				30	def chardet_dammit(s):
				31	return chardet.detect(s)['encoding']
				32	#import chardet.constants
				33	#chardet.constants._debug = 1
				34	except ImportError:
				35	# No chardet available.
				36	def chardet_dammit(s):
				37	return None
				38
				39	# Available from http://cjkpython.i18n.org/.
				40	try:
				41	import iconv_codec
				42	except ImportError:
				43	pass
				44
				45	xml_encoding_re = re.compile(
				46	'^<\?.encoding=[\'"](.?)[\'"].*\?>'.encode(), re.I)
				47	html_meta_re = re.compile(
				48	'<\smeta[^>]+charset\s=\s["\']?([^>]?)[ /;\'">]'.encode(), re.I)
				49
				50	class EntitySubstitution(object):
				51
				52	"""Substitute XML or HTML entities for the corresponding characters."""
				53
				54	def _populate_class_variables():
				55	lookup = {}
				56	reverse_lookup = {}
				57	characters_for_re = []
				58	for codepoint, name in list(codepoint2name.items()):
				59	character = unichr(codepoint)
				60	if codepoint != 34:
				61	# There's no point in turning the quotation mark into
				62	# ", unless it happens within an attribute value, which
				63	# is handled elsewhere.
				64	characters_for_re.append(character)
				65	lookup[character] = name
				66	# But we do want to turn " into the quotation mark.
				67	reverse_lookup[name] = character
				68	re_definition = "[%s]" % "".join(characters_for_re)
				69	return lookup, reverse_lookup, re.compile(re_definition)
				70	(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
				71	CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
				72
				73	CHARACTER_TO_XML_ENTITY = {
				74	"'": "apos",
				75	'"': "quot",
				76	"&": "amp",
				77	"<": "lt",
				78	">": "gt",
				79	}
				80
				81	BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]\|"
				82	"&(?!#\d+;\|#x[0-9a-fA-F]+;\|\w+;)"
				83	")")
				84
				85	AMPERSAND_OR_BRACKET = re.compile("([<>&])")
				86
				87	@classmethod
				88	def _substitute_html_entity(cls, matchobj):
				89	entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
				90	return "&%s;" % entity
				91
				92	@classmethod
				93	def _substitute_xml_entity(cls, matchobj):
				94	"""Used with a regular expression to substitute the
				95	appropriate XML entity for an XML special character."""
				96	entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
				97	return "&%s;" % entity
				98
				99	@classmethod
				100	def quoted_attribute_value(self, value):
				101	"""Make a value into a quoted XML attribute, possibly escaping it.
				102
				103	Most strings will be quoted using double quotes.
				104
				105	Bob's Bar -> "Bob's Bar"
				106
				107	If a string contains double quotes, it will be quoted using
				108	single quotes.
				109
				110	Welcome to "my bar" -> 'Welcome to "my bar"'
				111
				112	If a string contains both single and double quotes, the
				113	double quotes will be escaped, and the string will be quoted
				114	using double quotes.
				115
				116	Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
				117	"""
				118	quote_with = '"'
				119	if '"' in value:
				120	if "'" in value:
				121	# The string contains both single and double
				122	# quotes. Turn the double quotes into
				123	# entities. We quote the double quotes rather than
				124	# the single quotes because the entity name is
				125	# """ whether this is HTML or XML. If we
				126	# quoted the single quotes, we'd have to decide
				127	# between ' and &squot;.
				128	replace_with = """
				129	value = value.replace('"', replace_with)
				130	else:
				131	# There are double quotes but no single quotes.
				132	# We can use single quotes to quote the attribute.
				133	quote_with = "'"
				134	return quote_with + value + quote_with
				135
				136	@classmethod
				137	def substitute_xml(cls, value, make_quoted_attribute=False):
				138	"""Substitute XML entities for special XML characters.
				139
				140	:param value: A string to be substituted. The less-than sign
				141	will become <, the greater-than sign will become >,
				142	and any ampersands will become &. If you want ampersands
				143	that appear to be part of an entity definition to be left
				144	alone, use substitute_xml_containing_entities() instead.
				145
				146	:param make_quoted_attribute: If True, then the string will be
				147	quoted, as befits an attribute value.
				148	"""
				149	# Escape angle brackets and ampersands.
				150	value = cls.AMPERSAND_OR_BRACKET.sub(
				151	cls._substitute_xml_entity, value)
				152
				153	if make_quoted_attribute:
				154	value = cls.quoted_attribute_value(value)
				155	return value
				156
				157	@classmethod
				158	def substitute_xml_containing_entities(
				159	cls, value, make_quoted_attribute=False):
				160	"""Substitute XML entities for special XML characters.
				161
				162	:param value: A string to be substituted. The less-than sign will
				163	become <, the greater-than sign will become >, and any
				164	ampersands that are not part of an entity defition will
				165	become &.
				166
				167	:param make_quoted_attribute: If True, then the string will be
				168	quoted, as befits an attribute value.
				169	"""
				170	# Escape angle brackets, and ampersands that aren't part of
				171	# entities.
				172	value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
				173	cls._substitute_xml_entity, value)
				174
				175	if make_quoted_attribute:
				176	value = cls.quoted_attribute_value(value)
				177	return value
				178
				179	@classmethod
				180	def substitute_html(cls, s):
				181	"""Replace certain Unicode characters with named HTML entities.
				182
				183	This differs from data.encode(encoding, 'xmlcharrefreplace')
				184	in that the goal is to make the result more readable (to those
				185	with ASCII displays) rather than to recover from
				186	errors. There's absolutely nothing wrong with a UTF-8 string
				187	containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
				188	character with "é" will make it more readable to some
				189	people.
				190	"""
				191	return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
				192	cls._substitute_html_entity, s)
				193
				194
				195	class EncodingDetector:
				196	"""Suggests a number of possible encodings for a bytestring.
				197
				198	Order of precedence:
				199
				200	1. Encodings you specifically tell EncodingDetector to try first
				201	(the override_encodings argument to the constructor).
				202
				203	2. An encoding declared within the bytestring itself, either in an
				204	XML declaration (if the bytestring is to be interpreted as an XML
				205	document), or in a <meta> tag (if the bytestring is to be
				206	interpreted as an HTML document.)
				207
				208	3. An encoding detected through textual analysis by chardet,
				209	cchardet, or a similar external library.
				210
				211	4. UTF-8.
				212
				213	5. Windows-1252.
				214	"""
				215	def __init__(self, markup, override_encodings=None, is_html=False):
				216	self.override_encodings = override_encodings or []
				217	self.chardet_encoding = None
				218	self.is_html = is_html
				219	self.declared_encoding = None
				220
				221	# First order of business: strip a byte-order mark.
				222	self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
				223
				224	def _usable(self, encoding, tried):
				225	if encoding is not None:
				226	encoding = encoding.lower()
				227	if encoding not in tried:
				228	tried.add(encoding)
				229	return True
				230	return False
				231
				232	@property
				233	def encodings(self):
				234	"""Yield a number of encodings that might work for this markup."""
				235	tried = set()
				236	for e in self.override_encodings:
				237	if self._usable(e, tried):
				238	yield e
				239
				240	# Did the document originally start with a byte-order mark
				241	# that indicated its encoding?
				242	if self._usable(self.sniffed_encoding, tried):
				243	yield self.sniffed_encoding
				244
				245	# Look within the document for an XML or HTML encoding
				246	# declaration.
				247	if self.declared_encoding is None:
				248	self.declared_encoding = self.find_declared_encoding(
				249	self.markup, self.is_html)
				250	if self._usable(self.declared_encoding, tried):
				251	yield self.declared_encoding
				252
				253	# Use third-party character set detection to guess at the
				254	# encoding.
				255	if self.chardet_encoding is None:
				256	self.chardet_encoding = chardet_dammit(self.markup)
				257	if self._usable(self.chardet_encoding, tried):
				258	yield self.chardet_encoding
				259
				260	# As a last-ditch effort, try utf-8 and windows-1252.
				261	for e in ('utf-8', 'windows-1252'):
				262	if self._usable(e, tried):
				263	yield e
				264
				265	@classmethod
				266	def strip_byte_order_mark(cls, data):
				267	"""If a byte-order mark is present, strip it and return the encoding it implies."""
				268	encoding = None
				269	if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
				270	and (data[2:4] != '\x00\x00'):
				271	encoding = 'utf-16be'
				272	data = data[2:]
				273	elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
				274	and (data[2:4] != '\x00\x00'):
				275	encoding = 'utf-16le'
				276	data = data[2:]
				277	elif data[:3] == b'\xef\xbb\xbf':
				278	encoding = 'utf-8'
				279	data = data[3:]
				280	elif data[:4] == b'\x00\x00\xfe\xff':
				281	encoding = 'utf-32be'
				282	data = data[4:]
				283	elif data[:4] == b'\xff\xfe\x00\x00':
				284	encoding = 'utf-32le'
				285	data = data[4:]
				286	return data, encoding
				287
				288	@classmethod
				289	def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
				290	"""Given a document, tries to find its declared encoding.
				291
				292	An XML encoding is declared at the beginning of the document.
				293
				294	An HTML encoding is declared in a <meta> tag, hopefully near the
				295	beginning of the document.
				296	"""
				297	if search_entire_document:
				298	xml_endpos = html_endpos = len(markup)
				299	else:
				300	xml_endpos = 1024
				301	html_endpos = max(2048, int(len(markup) * 0.05))
				302
				303	declared_encoding = None
				304	declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
				305	if not declared_encoding_match and is_html:
				306	declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
				307	if declared_encoding_match is not None:
				308	declared_encoding = declared_encoding_match.groups()[0].decode(
				309	'ascii')
				310	if declared_encoding:
				311	return declared_encoding.lower()
				312	return None
				313
				314	class UnicodeDammit:
				315	"""A class for detecting the encoding of a *ML document and
				316	converting it to a Unicode string. If the source encoding is
				317	windows-1252, can replace MS smart quotes with their HTML or XML
				318	equivalents."""
				319
				320	# This dictionary maps commonly seen values for "charset" in HTML
				321	# meta tags to the corresponding Python codec names. It only covers
				322	# values that aren't in Python's aliases and can't be determined
				323	# by the heuristics in find_codec.
				324	CHARSET_ALIASES = {"macintosh": "mac-roman",
				325	"x-sjis": "shift-jis"}
				326
				327	ENCODINGS_WITH_SMART_QUOTES = [
				328	"windows-1252",
				329	"iso-8859-1",
				330	"iso-8859-2",
				331	]
				332
				333	def __init__(self, markup, override_encodings=[],
				334	smart_quotes_to=None, is_html=False):
				335	self.smart_quotes_to = smart_quotes_to
				336	self.tried_encodings = []
				337	self.contains_replacement_characters = False
				338	self.is_html = is_html
				339
				340	self.detector = EncodingDetector(markup, override_encodings, is_html)
				341
				342	# Short-circuit if the data is in Unicode to begin with.
				343	if isinstance(markup, unicode) or markup == '':
				344	self.markup = markup
				345	self.unicode_markup = unicode(markup)
				346	self.original_encoding = None
				347	return
				348
				349	# The encoding detector may have stripped a byte-order mark.
				350	# Use the stripped markup from this point on.
				351	self.markup = self.detector.markup
				352
				353	u = None
				354	for encoding in self.detector.encodings:
				355	markup = self.detector.markup
				356	u = self._convert_from(encoding)
				357	if u is not None:
				358	break
				359
				360	if not u:
				361	# None of the encodings worked. As an absolute last resort,
				362	# try them again with character replacement.
				363
				364	for encoding in self.detector.encodings:
				365	if encoding != "ascii":
				366	u = self._convert_from(encoding, "replace")
				367	if u is not None:
				368	logging.warning(
				369	"Some characters could not be decoded, and were "
				370	"replaced with REPLACEMENT CHARACTER.")
				371	self.contains_replacement_characters = True
				372	break
				373
				374	# If none of that worked, we could at this point force it to
				375	# ASCII, but that would destroy so much data that I think
				376	# giving up is better.
				377	self.unicode_markup = u
				378	if not u:
				379	self.original_encoding = None
				380
				381	def _sub_ms_char(self, match):
				382	"""Changes a MS smart quote character to an XML or HTML
				383	entity, or an ASCII character."""
				384	orig = match.group(1)
				385	if self.smart_quotes_to == 'ascii':
				386	sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
				387	else:
				388	sub = self.MS_CHARS.get(orig)
				389	if type(sub) == tuple:
				390	if self.smart_quotes_to == 'xml':
				391	sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
				392	else:
				393	sub = '&'.encode() + sub[0].encode() + ';'.encode()
				394	else:
				395	sub = sub.encode()
				396	return sub
				397
				398	def _convert_from(self, proposed, errors="strict"):
				399	proposed = self.find_codec(proposed)
				400	if not proposed or (proposed, errors) in self.tried_encodings:
				401	return None
				402	self.tried_encodings.append((proposed, errors))
				403	markup = self.markup
				404	# Convert smart quotes to HTML if coming from an encoding
				405	# that might have them.
				406	if (self.smart_quotes_to is not None
				407	and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
				408	smart_quotes_re = b"([\x80-\x9f])"
				409	smart_quotes_compiled = re.compile(smart_quotes_re)
				410	markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
				411
				412	try:
				413	#print "Trying to convert document to %s (errors=%s)" % (
				414	# proposed, errors)
				415	u = self._to_unicode(markup, proposed, errors)
				416	self.markup = u
				417	self.original_encoding = proposed
				418	except Exception as e:
				419	#print "That didn't work!"
				420	#print e
				421	return None
				422	#print "Correct encoding: %s" % proposed
				423	return self.markup
				424
				425	def _to_unicode(self, data, encoding, errors="strict"):
				426	'''Given a string and its encoding, decodes the string into Unicode.
				427	%encoding is a string recognized by encodings.aliases'''
				428	return unicode(data, encoding, errors)
				429
				430	@property
				431	def declared_html_encoding(self):
				432	if not self.is_html:
				433	return None
				434	return self.detector.declared_encoding
				435
				436	def find_codec(self, charset):
				437	value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
				438	or (charset and self._codec(charset.replace("-", "")))
				439	or (charset and self._codec(charset.replace("-", "_")))
				440	or (charset and charset.lower())
				441	or charset
				442	)
				443	if value:
				444	return value.lower()
				445	return None
				446
				447	def _codec(self, charset):
				448	if not charset:
				449	return charset
				450	codec = None
				451	try:
				452	codecs.lookup(charset)
				453	codec = charset
				454	except (LookupError, ValueError):
				455	pass
				456	return codec
				457
				458
				459	# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
				460	MS_CHARS = {b'\x80': ('euro', '20AC'),
				461	b'\x81': ' ',
				462	b'\x82': ('sbquo', '201A'),
				463	b'\x83': ('fnof', '192'),
				464	b'\x84': ('bdquo', '201E'),
				465	b'\x85': ('hellip', '2026'),
				466	b'\x86': ('dagger', '2020'),
				467	b'\x87': ('Dagger', '2021'),
				468	b'\x88': ('circ', '2C6'),
				469	b'\x89': ('permil', '2030'),
				470	b'\x8A': ('Scaron', '160'),
				471	b'\x8B': ('lsaquo', '2039'),
				472	b'\x8C': ('OElig', '152'),
				473	b'\x8D': '?',
				474	b'\x8E': ('#x17D', '17D'),
				475	b'\x8F': '?',
				476	b'\x90': '?',
				477	b'\x91': ('lsquo', '2018'),
				478	b'\x92': ('rsquo', '2019'),
				479	b'\x93': ('ldquo', '201C'),
				480	b'\x94': ('rdquo', '201D'),
				481	b'\x95': ('bull', '2022'),
				482	b'\x96': ('ndash', '2013'),
				483	b'\x97': ('mdash', '2014'),
				484	b'\x98': ('tilde', '2DC'),
				485	b'\x99': ('trade', '2122'),
				486	b'\x9a': ('scaron', '161'),
				487	b'\x9b': ('rsaquo', '203A'),
				488	b'\x9c': ('oelig', '153'),
				489	b'\x9d': '?',
				490	b'\x9e': ('#x17E', '17E'),
				491	b'\x9f': ('Yuml', ''),}
				492
				493	# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
				494	# horrors like stripping diacritical marks to turn á into a, but also
				495	# contains non-horrors like turning “ into ".
				496	MS_CHARS_TO_ASCII = {
				497	b'\x80' : 'EUR',
				498	b'\x81' : ' ',
				499	b'\x82' : ',',
				500	b'\x83' : 'f',
				501	b'\x84' : ',,',
				502	b'\x85' : '...',
				503	b'\x86' : '+',
				504	b'\x87' : '++',
				505	b'\x88' : '^',
				506	b'\x89' : '%',
				507	b'\x8a' : 'S',
				508	b'\x8b' : '<',
				509	b'\x8c' : 'OE',
				510	b'\x8d' : '?',
				511	b'\x8e' : 'Z',
				512	b'\x8f' : '?',
				513	b'\x90' : '?',
				514	b'\x91' : "'",
				515	b'\x92' : "'",
				516	b'\x93' : '"',
				517	b'\x94' : '"',
				518	b'\x95' : '*',
				519	b'\x96' : '-',
				520	b'\x97' : '--',
				521	b'\x98' : '~',
				522	b'\x99' : '(TM)',
				523	b'\x9a' : 's',
				524	b'\x9b' : '>',
				525	b'\x9c' : 'oe',
				526	b'\x9d' : '?',
				527	b'\x9e' : 'z',
				528	b'\x9f' : 'Y',
				529	b'\xa0' : ' ',
				530	b'\xa1' : '!',
				531	b'\xa2' : 'c',
				532	b'\xa3' : 'GBP',
				533	b'\xa4' : '$', #This approximation is especially parochial--this is the
				534	#generic currency symbol.
				535	b'\xa5' : 'YEN',
				536	b'\xa6' : '\|',
				537	b'\xa7' : 'S',
				538	b'\xa8' : '..',
				539	b'\xa9' : '',
				540	b'\xaa' : '(th)',
				541	b'\xab' : '<<',
				542	b'\xac' : '!',
				543	b'\xad' : ' ',
				544	b'\xae' : '(R)',
				545	b'\xaf' : '-',
				546	b'\xb0' : 'o',
				547	b'\xb1' : '+-',
				548	b'\xb2' : '2',
				549	b'\xb3' : '3',
				550	b'\xb4' : ("'", 'acute'),
				551	b'\xb5' : 'u',
				552	b'\xb6' : 'P',
				553	b'\xb7' : '*',
				554	b'\xb8' : ',',
				555	b'\xb9' : '1',
				556	b'\xba' : '(th)',
				557	b'\xbb' : '>>',
				558	b'\xbc' : '1/4',
				559	b'\xbd' : '1/2',
				560	b'\xbe' : '3/4',
				561	b'\xbf' : '?',
				562	b'\xc0' : 'A',
				563	b'\xc1' : 'A',
				564	b'\xc2' : 'A',
				565	b'\xc3' : 'A',
				566	b'\xc4' : 'A',
				567	b'\xc5' : 'A',
				568	b'\xc6' : 'AE',
				569	b'\xc7' : 'C',
				570	b'\xc8' : 'E',
				571	b'\xc9' : 'E',
				572	b'\xca' : 'E',
				573	b'\xcb' : 'E',
				574	b'\xcc' : 'I',
				575	b'\xcd' : 'I',
				576	b'\xce' : 'I',
				577	b'\xcf' : 'I',
				578	b'\xd0' : 'D',
				579	b'\xd1' : 'N',
				580	b'\xd2' : 'O',
				581	b'\xd3' : 'O',
				582	b'\xd4' : 'O',
				583	b'\xd5' : 'O',
				584	b'\xd6' : 'O',
				585	b'\xd7' : '*',
				586	b'\xd8' : 'O',
				587	b'\xd9' : 'U',
				588	b'\xda' : 'U',
				589	b'\xdb' : 'U',
				590	b'\xdc' : 'U',
				591	b'\xdd' : 'Y',
				592	b'\xde' : 'b',
				593	b'\xdf' : 'B',
				594	b'\xe0' : 'a',
				595	b'\xe1' : 'a',
				596	b'\xe2' : 'a',
				597	b'\xe3' : 'a',
				598	b'\xe4' : 'a',
				599	b'\xe5' : 'a',
				600	b'\xe6' : 'ae',
				601	b'\xe7' : 'c',
				602	b'\xe8' : 'e',
				603	b'\xe9' : 'e',
				604	b'\xea' : 'e',
				605	b'\xeb' : 'e',
				606	b'\xec' : 'i',
				607	b'\xed' : 'i',
				608	b'\xee' : 'i',
				609	b'\xef' : 'i',
				610	b'\xf0' : 'o',
				611	b'\xf1' : 'n',
				612	b'\xf2' : 'o',
				613	b'\xf3' : 'o',
				614	b'\xf4' : 'o',
				615	b'\xf5' : 'o',
				616	b'\xf6' : 'o',
				617	b'\xf7' : '/',
				618	b'\xf8' : 'o',
				619	b'\xf9' : 'u',
				620	b'\xfa' : 'u',
				621	b'\xfb' : 'u',
				622	b'\xfc' : 'u',
				623	b'\xfd' : 'y',
				624	b'\xfe' : 'b',
				625	b'\xff' : 'y',
				626	}
				627
				628	# A map used when removing rogue Windows-1252/ISO-8859-1
				629	# characters in otherwise UTF-8 documents.
				630	#
				631	# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
				632	# Windows-1252.
				633	WINDOWS_1252_TO_UTF8 = {
				634	0x80 : b'\xe2\x82\xac', # €
				635	0x82 : b'\xe2\x80\x9a', # ‚
				636	0x83 : b'\xc6\x92', # ƒ
				637	0x84 : b'\xe2\x80\x9e', # „
				638	0x85 : b'\xe2\x80\xa6', # …
				639	0x86 : b'\xe2\x80\xa0', # †
				640	0x87 : b'\xe2\x80\xa1', # ‡
				641	0x88 : b'\xcb\x86', # ˆ
				642	0x89 : b'\xe2\x80\xb0', # ‰
				643	0x8a : b'\xc5\xa0', # Š
				644	0x8b : b'\xe2\x80\xb9', # ‹
				645	0x8c : b'\xc5\x92', # Œ
				646	0x8e : b'\xc5\xbd', # Ž
				647	0x91 : b'\xe2\x80\x98', # ‘
				648	0x92 : b'\xe2\x80\x99', # ’
				649	0x93 : b'\xe2\x80\x9c', # “
				650	0x94 : b'\xe2\x80\x9d', # ”
				651	0x95 : b'\xe2\x80\xa2', # •
				652	0x96 : b'\xe2\x80\x93', # –
				653	0x97 : b'\xe2\x80\x94', # —
				654	0x98 : b'\xcb\x9c', # ˜
				655	0x99 : b'\xe2\x84\xa2', # ™
				656	0x9a : b'\xc5\xa1', # š
				657	0x9b : b'\xe2\x80\xba', # ›
				658	0x9c : b'\xc5\x93', # œ
				659	0x9e : b'\xc5\xbe', # ž
				660	0x9f : b'\xc5\xb8', # Ÿ
				661	0xa0 : b'\xc2\xa0', #
				662	0xa1 : b'\xc2\xa1', # ¡
				663	0xa2 : b'\xc2\xa2', # ¢
				664	0xa3 : b'\xc2\xa3', # £
				665	0xa4 : b'\xc2\xa4', # ¤
				666	0xa5 : b'\xc2\xa5', # ¥
				667	0xa6 : b'\xc2\xa6', # ¦
				668	0xa7 : b'\xc2\xa7', # §
				669	0xa8 : b'\xc2\xa8', # ¨
				670	0xa9 : b'\xc2\xa9', # ©
				671	0xaa : b'\xc2\xaa', # ª
				672	0xab : b'\xc2\xab', # «
				673	0xac : b'\xc2\xac', # ¬
				674	0xad : b'\xc2\xad', #
				675	0xae : b'\xc2\xae', # ®
				676	0xaf : b'\xc2\xaf', # ¯
				677	0xb0 : b'\xc2\xb0', # °
				678	0xb1 : b'\xc2\xb1', # ±
				679	0xb2 : b'\xc2\xb2', # ²
				680	0xb3 : b'\xc2\xb3', # ³
				681	0xb4 : b'\xc2\xb4', # ´
				682	0xb5 : b'\xc2\xb5', # µ
				683	0xb6 : b'\xc2\xb6', # ¶
				684	0xb7 : b'\xc2\xb7', # ·
				685	0xb8 : b'\xc2\xb8', # ¸
				686	0xb9 : b'\xc2\xb9', # ¹
				687	0xba : b'\xc2\xba', # º
				688	0xbb : b'\xc2\xbb', # »
				689	0xbc : b'\xc2\xbc', # ¼
				690	0xbd : b'\xc2\xbd', # ½
				691	0xbe : b'\xc2\xbe', # ¾
				692	0xbf : b'\xc2\xbf', # ¿
				693	0xc0 : b'\xc3\x80', # À
				694	0xc1 : b'\xc3\x81', # Á
				695	0xc2 : b'\xc3\x82', # Â
				696	0xc3 : b'\xc3\x83', # Ã
				697	0xc4 : b'\xc3\x84', # Ä
				698	0xc5 : b'\xc3\x85', # Å
				699	0xc6 : b'\xc3\x86', # Æ
				700	0xc7 : b'\xc3\x87', # Ç
				701	0xc8 : b'\xc3\x88', # È
				702	0xc9 : b'\xc3\x89', # É
				703	0xca : b'\xc3\x8a', # Ê
				704	0xcb : b'\xc3\x8b', # Ë
				705	0xcc : b'\xc3\x8c', # Ì
				706	0xcd : b'\xc3\x8d', # Í
				707	0xce : b'\xc3\x8e', # Î
				708	0xcf : b'\xc3\x8f', # Ï
				709	0xd0 : b'\xc3\x90', # Ð
				710	0xd1 : b'\xc3\x91', # Ñ
				711	0xd2 : b'\xc3\x92', # Ò
				712	0xd3 : b'\xc3\x93', # Ó
				713	0xd4 : b'\xc3\x94', # Ô
				714	0xd5 : b'\xc3\x95', # Õ
				715	0xd6 : b'\xc3\x96', # Ö
				716	0xd7 : b'\xc3\x97', # ×
				717	0xd8 : b'\xc3\x98', # Ø
				718	0xd9 : b'\xc3\x99', # Ù
				719	0xda : b'\xc3\x9a', # Ú
				720	0xdb : b'\xc3\x9b', # Û
				721	0xdc : b'\xc3\x9c', # Ü
				722	0xdd : b'\xc3\x9d', # Ý
				723	0xde : b'\xc3\x9e', # Þ
				724	0xdf : b'\xc3\x9f', # ß
				725	0xe0 : b'\xc3\xa0', # à
				726	0xe1 : b'\xa1', # á
				727	0xe2 : b'\xc3\xa2', # â
				728	0xe3 : b'\xc3\xa3', # ã
				729	0xe4 : b'\xc3\xa4', # ä
				730	0xe5 : b'\xc3\xa5', # å
				731	0xe6 : b'\xc3\xa6', # æ
				732	0xe7 : b'\xc3\xa7', # ç
				733	0xe8 : b'\xc3\xa8', # è
				734	0xe9 : b'\xc3\xa9', # é
				735	0xea : b'\xc3\xaa', # ê
				736	0xeb : b'\xc3\xab', # ë
				737	0xec : b'\xc3\xac', # ì
				738	0xed : b'\xc3\xad', # í
				739	0xee : b'\xc3\xae', # î
				740	0xef : b'\xc3\xaf', # ï
				741	0xf0 : b'\xc3\xb0', # ð
				742	0xf1 : b'\xc3\xb1', # ñ
				743	0xf2 : b'\xc3\xb2', # ò
				744	0xf3 : b'\xc3\xb3', # ó
				745	0xf4 : b'\xc3\xb4', # ô
				746	0xf5 : b'\xc3\xb5', # õ
				747	0xf6 : b'\xc3\xb6', # ö
				748	0xf7 : b'\xc3\xb7', # ÷
				749	0xf8 : b'\xc3\xb8', # ø
				750	0xf9 : b'\xc3\xb9', # ù
				751	0xfa : b'\xc3\xba', # ú
				752	0xfb : b'\xc3\xbb', # û
				753	0xfc : b'\xc3\xbc', # ü
				754	0xfd : b'\xc3\xbd', # ý
				755	0xfe : b'\xc3\xbe', # þ
				756	}
				757
				758	MULTIBYTE_MARKERS_AND_SIZES = [
				759	(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
				760	(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
				761	(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
				762	]
				763
				764	FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
				765	LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
				766
				767	@classmethod
				768	def detwingle(cls, in_bytes, main_encoding="utf8",
				769	embedded_encoding="windows-1252"):
				770	"""Fix characters from one encoding embedded in some other encoding.
				771
				772	Currently the only situation supported is Windows-1252 (or its
				773	subset ISO-8859-1), embedded in UTF-8.
				774
				775	The input must be a bytestring. If you've already converted
				776	the document to Unicode, you're too late.
				777
				778	The output is a bytestring in which `embedded_encoding`
				779	characters have been converted to their `main_encoding`
				780	equivalents.
				781	"""
				782	if embedded_encoding.replace('_', '-').lower() not in (
				783	'windows-1252', 'windows_1252'):
				784	raise NotImplementedError(
				785	"Windows-1252 and ISO-8859-1 are the only currently supported "
				786	"embedded encodings.")
				787
				788	if main_encoding.lower() not in ('utf8', 'utf-8'):
				789	raise NotImplementedError(
				790	"UTF-8 is the only currently supported main encoding.")
				791
				792	byte_chunks = []
				793
				794	chunk_start = 0
				795	pos = 0
				796	while pos < len(in_bytes):
				797	byte = in_bytes[pos]
				798	if not isinstance(byte, int):
				799	# Python 2.x
				800	byte = ord(byte)
				801	if (byte >= cls.FIRST_MULTIBYTE_MARKER
				802	and byte <= cls.LAST_MULTIBYTE_MARKER):
				803	# This is the start of a UTF-8 multibyte character. Skip
				804	# to the end.
				805	for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
				806	if byte >= start and byte <= end:
				807	pos += size
				808	break
				809	elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
				810	# We found a Windows-1252 character!
				811	# Save the string up to this point as a chunk.
				812	byte_chunks.append(in_bytes[chunk_start:pos])
				813
				814	# Now translate the Windows-1252 character into UTF-8
				815	# and add it as another, one-byte chunk.
				816	byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
				817	pos += 1
				818	chunk_start = pos
				819	else:
				820	# Go on to the next character.
				821	pos += 1
				822	if chunk_start == 0:
				823	# The string is unchanged.
				824	return in_bytes
				825	else:
				826	# Store the final chunk.
				827	byte_chunks.append(in_bytes[chunk_start:])
				828	return b''.join(byte_chunks)
				829