Blame - poky/bitbake/lib/bs4/dammit.py - mdmillerii/openbmc

blob: 805aa908a5576d05954c84ef326439a9ac7c32be [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	# -- coding: utf-8 --
				2	"""Beautiful Soup bonus library: Unicode, Dammit
				3
				4	This library converts a bytestream to Unicode through any means
				5	necessary. It is heavily based on code from Mark Pilgrim's Universal
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	6	Feed Parser. It works best on XML and HTML, but it does not rewrite the
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	7	XML or HTML to reflect a new encoding; that's the tree builder's job.
				8	"""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	9	__license__ = "MIT"
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	10
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	11	from pdb import set_trace
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	12	import codecs
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	13	from html.entities import codepoint2name
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	14	import re
				15	import logging
				16	import string
				17
				18	# Import a library to autodetect character encodings.
				19	chardet_type = None
				20	try:
				21	# First try the fast C implementation.
				22	# PyPI package: cchardet
				23	import cchardet
				24	def chardet_dammit(s):
				25	return cchardet.detect(s)['encoding']
				26	except ImportError:
				27	try:
				28	# Fall back to the pure Python implementation
				29	# Debian package: python-chardet
				30	# PyPI package: chardet
				31	import chardet
				32	def chardet_dammit(s):
				33	return chardet.detect(s)['encoding']
				34	#import chardet.constants
				35	#chardet.constants._debug = 1
				36	except ImportError:
				37	# No chardet available.
				38	def chardet_dammit(s):
				39	return None
				40
				41	# Available from http://cjkpython.i18n.org/.
				42	try:
				43	import iconv_codec
				44	except ImportError:
				45	pass
				46
				47	xml_encoding_re = re.compile(
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	48	r'^<\?.encoding=[\'"](.?)[\'"].*\?>'.encode(), re.I)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	49	html_meta_re = re.compile(
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	50	r'<\smeta[^>]+charset\s=\s["\']?([^>]?)[ /;\'">]'.encode(), re.I)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	51
				52	class EntitySubstitution(object):
				53
				54	"""Substitute XML or HTML entities for the corresponding characters."""
				55
				56	def _populate_class_variables():
				57	lookup = {}
				58	reverse_lookup = {}
				59	characters_for_re = []
				60	for codepoint, name in list(codepoint2name.items()):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	61	character = chr(codepoint)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	62	if codepoint != 34:
				63	# There's no point in turning the quotation mark into
				64	# ", unless it happens within an attribute value, which
				65	# is handled elsewhere.
				66	characters_for_re.append(character)
				67	lookup[character] = name
				68	# But we do want to turn " into the quotation mark.
				69	reverse_lookup[name] = character
				70	re_definition = "[%s]" % "".join(characters_for_re)
				71	return lookup, reverse_lookup, re.compile(re_definition)
				72	(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
				73	CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
				74
				75	CHARACTER_TO_XML_ENTITY = {
				76	"'": "apos",
				77	'"': "quot",
				78	"&": "amp",
				79	"<": "lt",
				80	">": "gt",
				81	}
				82
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	83	BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]\|"
				84	r"&(?!#\d+;\|#x[0-9a-fA-F]+;\|\w+;)"
				85	r")")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	86
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	87	AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	88
				89	@classmethod
				90	def _substitute_html_entity(cls, matchobj):
				91	entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
				92	return "&%s;" % entity
				93
				94	@classmethod
				95	def _substitute_xml_entity(cls, matchobj):
				96	"""Used with a regular expression to substitute the
				97	appropriate XML entity for an XML special character."""
				98	entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
				99	return "&%s;" % entity
				100
				101	@classmethod
				102	def quoted_attribute_value(self, value):
				103	"""Make a value into a quoted XML attribute, possibly escaping it.
				104
				105	Most strings will be quoted using double quotes.
				106
				107	Bob's Bar -> "Bob's Bar"
				108
				109	If a string contains double quotes, it will be quoted using
				110	single quotes.
				111
				112	Welcome to "my bar" -> 'Welcome to "my bar"'
				113
				114	If a string contains both single and double quotes, the
				115	double quotes will be escaped, and the string will be quoted
				116	using double quotes.
				117
				118	Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
				119	"""
				120	quote_with = '"'
				121	if '"' in value:
				122	if "'" in value:
				123	# The string contains both single and double
				124	# quotes. Turn the double quotes into
				125	# entities. We quote the double quotes rather than
				126	# the single quotes because the entity name is
				127	# """ whether this is HTML or XML. If we
				128	# quoted the single quotes, we'd have to decide
				129	# between ' and &squot;.
				130	replace_with = """
				131	value = value.replace('"', replace_with)
				132	else:
				133	# There are double quotes but no single quotes.
				134	# We can use single quotes to quote the attribute.
				135	quote_with = "'"
				136	return quote_with + value + quote_with
				137
				138	@classmethod
				139	def substitute_xml(cls, value, make_quoted_attribute=False):
				140	"""Substitute XML entities for special XML characters.
				141
				142	:param value: A string to be substituted. The less-than sign
				143	will become <, the greater-than sign will become >,
				144	and any ampersands will become &. If you want ampersands
				145	that appear to be part of an entity definition to be left
				146	alone, use substitute_xml_containing_entities() instead.
				147
				148	:param make_quoted_attribute: If True, then the string will be
				149	quoted, as befits an attribute value.
				150	"""
				151	# Escape angle brackets and ampersands.
				152	value = cls.AMPERSAND_OR_BRACKET.sub(
				153	cls._substitute_xml_entity, value)
				154
				155	if make_quoted_attribute:
				156	value = cls.quoted_attribute_value(value)
				157	return value
				158
				159	@classmethod
				160	def substitute_xml_containing_entities(
				161	cls, value, make_quoted_attribute=False):
				162	"""Substitute XML entities for special XML characters.
				163
				164	:param value: A string to be substituted. The less-than sign will
				165	become <, the greater-than sign will become >, and any
				166	ampersands that are not part of an entity defition will
				167	become &.
				168
				169	:param make_quoted_attribute: If True, then the string will be
				170	quoted, as befits an attribute value.
				171	"""
				172	# Escape angle brackets, and ampersands that aren't part of
				173	# entities.
				174	value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
				175	cls._substitute_xml_entity, value)
				176
				177	if make_quoted_attribute:
				178	value = cls.quoted_attribute_value(value)
				179	return value
				180
				181	@classmethod
				182	def substitute_html(cls, s):
				183	"""Replace certain Unicode characters with named HTML entities.
				184
				185	This differs from data.encode(encoding, 'xmlcharrefreplace')
				186	in that the goal is to make the result more readable (to those
				187	with ASCII displays) rather than to recover from
				188	errors. There's absolutely nothing wrong with a UTF-8 string
				189	containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
				190	character with "é" will make it more readable to some
				191	people.
				192	"""
				193	return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
				194	cls._substitute_html_entity, s)
				195
				196
				197	class EncodingDetector:
				198	"""Suggests a number of possible encodings for a bytestring.
				199
				200	Order of precedence:
				201
				202	1. Encodings you specifically tell EncodingDetector to try first
				203	(the override_encodings argument to the constructor).
				204
				205	2. An encoding declared within the bytestring itself, either in an
				206	XML declaration (if the bytestring is to be interpreted as an XML
				207	document), or in a <meta> tag (if the bytestring is to be
				208	interpreted as an HTML document.)
				209
				210	3. An encoding detected through textual analysis by chardet,
				211	cchardet, or a similar external library.
				212
				213	4. UTF-8.
				214
				215	5. Windows-1252.
				216	"""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	217	def __init__(self, markup, override_encodings=None, is_html=False,
				218	exclude_encodings=None):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	219	self.override_encodings = override_encodings or []
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	220	exclude_encodings = exclude_encodings or []
				221	self.exclude_encodings = set([x.lower() for x in exclude_encodings])
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	222	self.chardet_encoding = None
				223	self.is_html = is_html
				224	self.declared_encoding = None
				225
				226	# First order of business: strip a byte-order mark.
				227	self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
				228
				229	def _usable(self, encoding, tried):
				230	if encoding is not None:
				231	encoding = encoding.lower()
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	232	if encoding in self.exclude_encodings:
				233	return False
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	234	if encoding not in tried:
				235	tried.add(encoding)
				236	return True
				237	return False
				238
				239	@property
				240	def encodings(self):
				241	"""Yield a number of encodings that might work for this markup."""
				242	tried = set()
				243	for e in self.override_encodings:
				244	if self._usable(e, tried):
				245	yield e
				246
				247	# Did the document originally start with a byte-order mark
				248	# that indicated its encoding?
				249	if self._usable(self.sniffed_encoding, tried):
				250	yield self.sniffed_encoding
				251
				252	# Look within the document for an XML or HTML encoding
				253	# declaration.
				254	if self.declared_encoding is None:
				255	self.declared_encoding = self.find_declared_encoding(
				256	self.markup, self.is_html)
				257	if self._usable(self.declared_encoding, tried):
				258	yield self.declared_encoding
				259
				260	# Use third-party character set detection to guess at the
				261	# encoding.
				262	if self.chardet_encoding is None:
				263	self.chardet_encoding = chardet_dammit(self.markup)
				264	if self._usable(self.chardet_encoding, tried):
				265	yield self.chardet_encoding
				266
				267	# As a last-ditch effort, try utf-8 and windows-1252.
				268	for e in ('utf-8', 'windows-1252'):
				269	if self._usable(e, tried):
				270	yield e
				271
				272	@classmethod
				273	def strip_byte_order_mark(cls, data):
				274	"""If a byte-order mark is present, strip it and return the encoding it implies."""
				275	encoding = None
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	276	if isinstance(data, str):
				277	# Unicode data cannot have a byte-order mark.
				278	return data, encoding
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	279	if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
				280	and (data[2:4] != '\x00\x00'):
				281	encoding = 'utf-16be'
				282	data = data[2:]
				283	elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
				284	and (data[2:4] != '\x00\x00'):
				285	encoding = 'utf-16le'
				286	data = data[2:]
				287	elif data[:3] == b'\xef\xbb\xbf':
				288	encoding = 'utf-8'
				289	data = data[3:]
				290	elif data[:4] == b'\x00\x00\xfe\xff':
				291	encoding = 'utf-32be'
				292	data = data[4:]
				293	elif data[:4] == b'\xff\xfe\x00\x00':
				294	encoding = 'utf-32le'
				295	data = data[4:]
				296	return data, encoding
				297
				298	@classmethod
				299	def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
				300	"""Given a document, tries to find its declared encoding.
				301
				302	An XML encoding is declared at the beginning of the document.
				303
				304	An HTML encoding is declared in a <meta> tag, hopefully near the
				305	beginning of the document.
				306	"""
				307	if search_entire_document:
				308	xml_endpos = html_endpos = len(markup)
				309	else:
				310	xml_endpos = 1024
				311	html_endpos = max(2048, int(len(markup) * 0.05))
				312
				313	declared_encoding = None
				314	declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
				315	if not declared_encoding_match and is_html:
				316	declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
				317	if declared_encoding_match is not None:
				318	declared_encoding = declared_encoding_match.groups()[0].decode(
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	319	'ascii', 'replace')
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	320	if declared_encoding:
				321	return declared_encoding.lower()
				322	return None
				323
				324	class UnicodeDammit:
				325	"""A class for detecting the encoding of a *ML document and
				326	converting it to a Unicode string. If the source encoding is
				327	windows-1252, can replace MS smart quotes with their HTML or XML
				328	equivalents."""
				329
				330	# This dictionary maps commonly seen values for "charset" in HTML
				331	# meta tags to the corresponding Python codec names. It only covers
				332	# values that aren't in Python's aliases and can't be determined
				333	# by the heuristics in find_codec.
				334	CHARSET_ALIASES = {"macintosh": "mac-roman",
				335	"x-sjis": "shift-jis"}
				336
				337	ENCODINGS_WITH_SMART_QUOTES = [
				338	"windows-1252",
				339	"iso-8859-1",
				340	"iso-8859-2",
				341	]
				342
				343	def __init__(self, markup, override_encodings=[],
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	344	smart_quotes_to=None, is_html=False, exclude_encodings=[]):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	345	self.smart_quotes_to = smart_quotes_to
				346	self.tried_encodings = []
				347	self.contains_replacement_characters = False
				348	self.is_html = is_html
				349
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	350	self.detector = EncodingDetector(
				351	markup, override_encodings, is_html, exclude_encodings)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	352
				353	# Short-circuit if the data is in Unicode to begin with.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	354	if isinstance(markup, str) or markup == '':
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	355	self.markup = markup
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	356	self.unicode_markup = str(markup)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	357	self.original_encoding = None
				358	return
				359
				360	# The encoding detector may have stripped a byte-order mark.
				361	# Use the stripped markup from this point on.
				362	self.markup = self.detector.markup
				363
				364	u = None
				365	for encoding in self.detector.encodings:
				366	markup = self.detector.markup
				367	u = self._convert_from(encoding)
				368	if u is not None:
				369	break
				370
				371	if not u:
				372	# None of the encodings worked. As an absolute last resort,
				373	# try them again with character replacement.
				374
				375	for encoding in self.detector.encodings:
				376	if encoding != "ascii":
				377	u = self._convert_from(encoding, "replace")
				378	if u is not None:
				379	logging.warning(
				380	"Some characters could not be decoded, and were "
				381	"replaced with REPLACEMENT CHARACTER.")
				382	self.contains_replacement_characters = True
				383	break
				384
				385	# If none of that worked, we could at this point force it to
				386	# ASCII, but that would destroy so much data that I think
				387	# giving up is better.
				388	self.unicode_markup = u
				389	if not u:
				390	self.original_encoding = None
				391
				392	def _sub_ms_char(self, match):
				393	"""Changes a MS smart quote character to an XML or HTML
				394	entity, or an ASCII character."""
				395	orig = match.group(1)
				396	if self.smart_quotes_to == 'ascii':
				397	sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
				398	else:
				399	sub = self.MS_CHARS.get(orig)
				400	if type(sub) == tuple:
				401	if self.smart_quotes_to == 'xml':
				402	sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
				403	else:
				404	sub = '&'.encode() + sub[0].encode() + ';'.encode()
				405	else:
				406	sub = sub.encode()
				407	return sub
				408
				409	def _convert_from(self, proposed, errors="strict"):
				410	proposed = self.find_codec(proposed)
				411	if not proposed or (proposed, errors) in self.tried_encodings:
				412	return None
				413	self.tried_encodings.append((proposed, errors))
				414	markup = self.markup
				415	# Convert smart quotes to HTML if coming from an encoding
				416	# that might have them.
				417	if (self.smart_quotes_to is not None
				418	and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
				419	smart_quotes_re = b"([\x80-\x9f])"
				420	smart_quotes_compiled = re.compile(smart_quotes_re)
				421	markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
				422
				423	try:
				424	#print "Trying to convert document to %s (errors=%s)" % (
				425	# proposed, errors)
				426	u = self._to_unicode(markup, proposed, errors)
				427	self.markup = u
				428	self.original_encoding = proposed
				429	except Exception as e:
				430	#print "That didn't work!"
				431	#print e
				432	return None
				433	#print "Correct encoding: %s" % proposed
				434	return self.markup
				435
				436	def _to_unicode(self, data, encoding, errors="strict"):
				437	'''Given a string and its encoding, decodes the string into Unicode.
				438	%encoding is a string recognized by encodings.aliases'''
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	439	return str(data, encoding, errors)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	440
				441	@property
				442	def declared_html_encoding(self):
				443	if not self.is_html:
				444	return None
				445	return self.detector.declared_encoding
				446
				447	def find_codec(self, charset):
				448	value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
				449	or (charset and self._codec(charset.replace("-", "")))
				450	or (charset and self._codec(charset.replace("-", "_")))
				451	or (charset and charset.lower())
				452	or charset
				453	)
				454	if value:
				455	return value.lower()
				456	return None
				457
				458	def _codec(self, charset):
				459	if not charset:
				460	return charset
				461	codec = None
				462	try:
				463	codecs.lookup(charset)
				464	codec = charset
				465	except (LookupError, ValueError):
				466	pass
				467	return codec
				468
				469
				470	# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
				471	MS_CHARS = {b'\x80': ('euro', '20AC'),
				472	b'\x81': ' ',
				473	b'\x82': ('sbquo', '201A'),
				474	b'\x83': ('fnof', '192'),
				475	b'\x84': ('bdquo', '201E'),
				476	b'\x85': ('hellip', '2026'),
				477	b'\x86': ('dagger', '2020'),
				478	b'\x87': ('Dagger', '2021'),
				479	b'\x88': ('circ', '2C6'),
				480	b'\x89': ('permil', '2030'),
				481	b'\x8A': ('Scaron', '160'),
				482	b'\x8B': ('lsaquo', '2039'),
				483	b'\x8C': ('OElig', '152'),
				484	b'\x8D': '?',
				485	b'\x8E': ('#x17D', '17D'),
				486	b'\x8F': '?',
				487	b'\x90': '?',
				488	b'\x91': ('lsquo', '2018'),
				489	b'\x92': ('rsquo', '2019'),
				490	b'\x93': ('ldquo', '201C'),
				491	b'\x94': ('rdquo', '201D'),
				492	b'\x95': ('bull', '2022'),
				493	b'\x96': ('ndash', '2013'),
				494	b'\x97': ('mdash', '2014'),
				495	b'\x98': ('tilde', '2DC'),
				496	b'\x99': ('trade', '2122'),
				497	b'\x9a': ('scaron', '161'),
				498	b'\x9b': ('rsaquo', '203A'),
				499	b'\x9c': ('oelig', '153'),
				500	b'\x9d': '?',
				501	b'\x9e': ('#x17E', '17E'),
				502	b'\x9f': ('Yuml', ''),}
				503
				504	# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
				505	# horrors like stripping diacritical marks to turn á into a, but also
				506	# contains non-horrors like turning “ into ".
				507	MS_CHARS_TO_ASCII = {
				508	b'\x80' : 'EUR',
				509	b'\x81' : ' ',
				510	b'\x82' : ',',
				511	b'\x83' : 'f',
				512	b'\x84' : ',,',
				513	b'\x85' : '...',
				514	b'\x86' : '+',
				515	b'\x87' : '++',
				516	b'\x88' : '^',
				517	b'\x89' : '%',
				518	b'\x8a' : 'S',
				519	b'\x8b' : '<',
				520	b'\x8c' : 'OE',
				521	b'\x8d' : '?',
				522	b'\x8e' : 'Z',
				523	b'\x8f' : '?',
				524	b'\x90' : '?',
				525	b'\x91' : "'",
				526	b'\x92' : "'",
				527	b'\x93' : '"',
				528	b'\x94' : '"',
				529	b'\x95' : '*',
				530	b'\x96' : '-',
				531	b'\x97' : '--',
				532	b'\x98' : '~',
				533	b'\x99' : '(TM)',
				534	b'\x9a' : 's',
				535	b'\x9b' : '>',
				536	b'\x9c' : 'oe',
				537	b'\x9d' : '?',
				538	b'\x9e' : 'z',
				539	b'\x9f' : 'Y',
				540	b'\xa0' : ' ',
				541	b'\xa1' : '!',
				542	b'\xa2' : 'c',
				543	b'\xa3' : 'GBP',
				544	b'\xa4' : '$', #This approximation is especially parochial--this is the
				545	#generic currency symbol.
				546	b'\xa5' : 'YEN',
				547	b'\xa6' : '\|',
				548	b'\xa7' : 'S',
				549	b'\xa8' : '..',
				550	b'\xa9' : '',
				551	b'\xaa' : '(th)',
				552	b'\xab' : '<<',
				553	b'\xac' : '!',
				554	b'\xad' : ' ',
				555	b'\xae' : '(R)',
				556	b'\xaf' : '-',
				557	b'\xb0' : 'o',
				558	b'\xb1' : '+-',
				559	b'\xb2' : '2',
				560	b'\xb3' : '3',
				561	b'\xb4' : ("'", 'acute'),
				562	b'\xb5' : 'u',
				563	b'\xb6' : 'P',
				564	b'\xb7' : '*',
				565	b'\xb8' : ',',
				566	b'\xb9' : '1',
				567	b'\xba' : '(th)',
				568	b'\xbb' : '>>',
				569	b'\xbc' : '1/4',
				570	b'\xbd' : '1/2',
				571	b'\xbe' : '3/4',
				572	b'\xbf' : '?',
				573	b'\xc0' : 'A',
				574	b'\xc1' : 'A',
				575	b'\xc2' : 'A',
				576	b'\xc3' : 'A',
				577	b'\xc4' : 'A',
				578	b'\xc5' : 'A',
				579	b'\xc6' : 'AE',
				580	b'\xc7' : 'C',
				581	b'\xc8' : 'E',
				582	b'\xc9' : 'E',
				583	b'\xca' : 'E',
				584	b'\xcb' : 'E',
				585	b'\xcc' : 'I',
				586	b'\xcd' : 'I',
				587	b'\xce' : 'I',
				588	b'\xcf' : 'I',
				589	b'\xd0' : 'D',
				590	b'\xd1' : 'N',
				591	b'\xd2' : 'O',
				592	b'\xd3' : 'O',
				593	b'\xd4' : 'O',
				594	b'\xd5' : 'O',
				595	b'\xd6' : 'O',
				596	b'\xd7' : '*',
				597	b'\xd8' : 'O',
				598	b'\xd9' : 'U',
				599	b'\xda' : 'U',
				600	b'\xdb' : 'U',
				601	b'\xdc' : 'U',
				602	b'\xdd' : 'Y',
				603	b'\xde' : 'b',
				604	b'\xdf' : 'B',
				605	b'\xe0' : 'a',
				606	b'\xe1' : 'a',
				607	b'\xe2' : 'a',
				608	b'\xe3' : 'a',
				609	b'\xe4' : 'a',
				610	b'\xe5' : 'a',
				611	b'\xe6' : 'ae',
				612	b'\xe7' : 'c',
				613	b'\xe8' : 'e',
				614	b'\xe9' : 'e',
				615	b'\xea' : 'e',
				616	b'\xeb' : 'e',
				617	b'\xec' : 'i',
				618	b'\xed' : 'i',
				619	b'\xee' : 'i',
				620	b'\xef' : 'i',
				621	b'\xf0' : 'o',
				622	b'\xf1' : 'n',
				623	b'\xf2' : 'o',
				624	b'\xf3' : 'o',
				625	b'\xf4' : 'o',
				626	b'\xf5' : 'o',
				627	b'\xf6' : 'o',
				628	b'\xf7' : '/',
				629	b'\xf8' : 'o',
				630	b'\xf9' : 'u',
				631	b'\xfa' : 'u',
				632	b'\xfb' : 'u',
				633	b'\xfc' : 'u',
				634	b'\xfd' : 'y',
				635	b'\xfe' : 'b',
				636	b'\xff' : 'y',
				637	}
				638
				639	# A map used when removing rogue Windows-1252/ISO-8859-1
				640	# characters in otherwise UTF-8 documents.
				641	#
				642	# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
				643	# Windows-1252.
				644	WINDOWS_1252_TO_UTF8 = {
				645	0x80 : b'\xe2\x82\xac', # €
				646	0x82 : b'\xe2\x80\x9a', # ‚
				647	0x83 : b'\xc6\x92', # ƒ
				648	0x84 : b'\xe2\x80\x9e', # „
				649	0x85 : b'\xe2\x80\xa6', # …
				650	0x86 : b'\xe2\x80\xa0', # †
				651	0x87 : b'\xe2\x80\xa1', # ‡
				652	0x88 : b'\xcb\x86', # ˆ
				653	0x89 : b'\xe2\x80\xb0', # ‰
				654	0x8a : b'\xc5\xa0', # Š
				655	0x8b : b'\xe2\x80\xb9', # ‹
				656	0x8c : b'\xc5\x92', # Œ
				657	0x8e : b'\xc5\xbd', # Ž
				658	0x91 : b'\xe2\x80\x98', # ‘
				659	0x92 : b'\xe2\x80\x99', # ’
				660	0x93 : b'\xe2\x80\x9c', # “
				661	0x94 : b'\xe2\x80\x9d', # ”
				662	0x95 : b'\xe2\x80\xa2', # •
				663	0x96 : b'\xe2\x80\x93', # –
				664	0x97 : b'\xe2\x80\x94', # —
				665	0x98 : b'\xcb\x9c', # ˜
				666	0x99 : b'\xe2\x84\xa2', # ™
				667	0x9a : b'\xc5\xa1', # š
				668	0x9b : b'\xe2\x80\xba', # ›
				669	0x9c : b'\xc5\x93', # œ
				670	0x9e : b'\xc5\xbe', # ž
				671	0x9f : b'\xc5\xb8', # Ÿ
				672	0xa0 : b'\xc2\xa0', #
				673	0xa1 : b'\xc2\xa1', # ¡
				674	0xa2 : b'\xc2\xa2', # ¢
				675	0xa3 : b'\xc2\xa3', # £
				676	0xa4 : b'\xc2\xa4', # ¤
				677	0xa5 : b'\xc2\xa5', # ¥
				678	0xa6 : b'\xc2\xa6', # ¦
				679	0xa7 : b'\xc2\xa7', # §
				680	0xa8 : b'\xc2\xa8', # ¨
				681	0xa9 : b'\xc2\xa9', # ©
				682	0xaa : b'\xc2\xaa', # ª
				683	0xab : b'\xc2\xab', # «
				684	0xac : b'\xc2\xac', # ¬
				685	0xad : b'\xc2\xad', #
				686	0xae : b'\xc2\xae', # ®
				687	0xaf : b'\xc2\xaf', # ¯
				688	0xb0 : b'\xc2\xb0', # °
				689	0xb1 : b'\xc2\xb1', # ±
				690	0xb2 : b'\xc2\xb2', # ²
				691	0xb3 : b'\xc2\xb3', # ³
				692	0xb4 : b'\xc2\xb4', # ´
				693	0xb5 : b'\xc2\xb5', # µ
				694	0xb6 : b'\xc2\xb6', # ¶
				695	0xb7 : b'\xc2\xb7', # ·
				696	0xb8 : b'\xc2\xb8', # ¸
				697	0xb9 : b'\xc2\xb9', # ¹
				698	0xba : b'\xc2\xba', # º
				699	0xbb : b'\xc2\xbb', # »
				700	0xbc : b'\xc2\xbc', # ¼
				701	0xbd : b'\xc2\xbd', # ½
				702	0xbe : b'\xc2\xbe', # ¾
				703	0xbf : b'\xc2\xbf', # ¿
				704	0xc0 : b'\xc3\x80', # À
				705	0xc1 : b'\xc3\x81', # Á
				706	0xc2 : b'\xc3\x82', # Â
				707	0xc3 : b'\xc3\x83', # Ã
				708	0xc4 : b'\xc3\x84', # Ä
				709	0xc5 : b'\xc3\x85', # Å
				710	0xc6 : b'\xc3\x86', # Æ
				711	0xc7 : b'\xc3\x87', # Ç
				712	0xc8 : b'\xc3\x88', # È
				713	0xc9 : b'\xc3\x89', # É
				714	0xca : b'\xc3\x8a', # Ê
				715	0xcb : b'\xc3\x8b', # Ë
				716	0xcc : b'\xc3\x8c', # Ì
				717	0xcd : b'\xc3\x8d', # Í
				718	0xce : b'\xc3\x8e', # Î
				719	0xcf : b'\xc3\x8f', # Ï
				720	0xd0 : b'\xc3\x90', # Ð
				721	0xd1 : b'\xc3\x91', # Ñ
				722	0xd2 : b'\xc3\x92', # Ò
				723	0xd3 : b'\xc3\x93', # Ó
				724	0xd4 : b'\xc3\x94', # Ô
				725	0xd5 : b'\xc3\x95', # Õ
				726	0xd6 : b'\xc3\x96', # Ö
				727	0xd7 : b'\xc3\x97', # ×
				728	0xd8 : b'\xc3\x98', # Ø
				729	0xd9 : b'\xc3\x99', # Ù
				730	0xda : b'\xc3\x9a', # Ú
				731	0xdb : b'\xc3\x9b', # Û
				732	0xdc : b'\xc3\x9c', # Ü
				733	0xdd : b'\xc3\x9d', # Ý
				734	0xde : b'\xc3\x9e', # Þ
				735	0xdf : b'\xc3\x9f', # ß
				736	0xe0 : b'\xc3\xa0', # à
				737	0xe1 : b'\xa1', # á
				738	0xe2 : b'\xc3\xa2', # â
				739	0xe3 : b'\xc3\xa3', # ã
				740	0xe4 : b'\xc3\xa4', # ä
				741	0xe5 : b'\xc3\xa5', # å
				742	0xe6 : b'\xc3\xa6', # æ
				743	0xe7 : b'\xc3\xa7', # ç
				744	0xe8 : b'\xc3\xa8', # è
				745	0xe9 : b'\xc3\xa9', # é
				746	0xea : b'\xc3\xaa', # ê
				747	0xeb : b'\xc3\xab', # ë
				748	0xec : b'\xc3\xac', # ì
				749	0xed : b'\xc3\xad', # í
				750	0xee : b'\xc3\xae', # î
				751	0xef : b'\xc3\xaf', # ï
				752	0xf0 : b'\xc3\xb0', # ð
				753	0xf1 : b'\xc3\xb1', # ñ
				754	0xf2 : b'\xc3\xb2', # ò
				755	0xf3 : b'\xc3\xb3', # ó
				756	0xf4 : b'\xc3\xb4', # ô
				757	0xf5 : b'\xc3\xb5', # õ
				758	0xf6 : b'\xc3\xb6', # ö
				759	0xf7 : b'\xc3\xb7', # ÷
				760	0xf8 : b'\xc3\xb8', # ø
				761	0xf9 : b'\xc3\xb9', # ù
				762	0xfa : b'\xc3\xba', # ú
				763	0xfb : b'\xc3\xbb', # û
				764	0xfc : b'\xc3\xbc', # ü
				765	0xfd : b'\xc3\xbd', # ý
				766	0xfe : b'\xc3\xbe', # þ
				767	}
				768
				769	MULTIBYTE_MARKERS_AND_SIZES = [
				770	(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
				771	(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
				772	(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
				773	]
				774
				775	FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
				776	LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
				777
				778	@classmethod
				779	def detwingle(cls, in_bytes, main_encoding="utf8",
				780	embedded_encoding="windows-1252"):
				781	"""Fix characters from one encoding embedded in some other encoding.
				782
				783	Currently the only situation supported is Windows-1252 (or its
				784	subset ISO-8859-1), embedded in UTF-8.
				785
				786	The input must be a bytestring. If you've already converted
				787	the document to Unicode, you're too late.
				788
				789	The output is a bytestring in which `embedded_encoding`
				790	characters have been converted to their `main_encoding`
				791	equivalents.
				792	"""
				793	if embedded_encoding.replace('_', '-').lower() not in (
				794	'windows-1252', 'windows_1252'):
				795	raise NotImplementedError(
				796	"Windows-1252 and ISO-8859-1 are the only currently supported "
				797	"embedded encodings.")
				798
				799	if main_encoding.lower() not in ('utf8', 'utf-8'):
				800	raise NotImplementedError(
				801	"UTF-8 is the only currently supported main encoding.")
				802
				803	byte_chunks = []
				804
				805	chunk_start = 0
				806	pos = 0
				807	while pos < len(in_bytes):
				808	byte = in_bytes[pos]
				809	if not isinstance(byte, int):
				810	# Python 2.x
				811	byte = ord(byte)
				812	if (byte >= cls.FIRST_MULTIBYTE_MARKER
				813	and byte <= cls.LAST_MULTIBYTE_MARKER):
				814	# This is the start of a UTF-8 multibyte character. Skip
				815	# to the end.
				816	for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
				817	if byte >= start and byte <= end:
				818	pos += size
				819	break
				820	elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
				821	# We found a Windows-1252 character!
				822	# Save the string up to this point as a chunk.
				823	byte_chunks.append(in_bytes[chunk_start:pos])
				824
				825	# Now translate the Windows-1252 character into UTF-8
				826	# and add it as another, one-byte chunk.
				827	byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
				828	pos += 1
				829	chunk_start = pos
				830	else:
				831	# Go on to the next character.
				832	pos += 1
				833	if chunk_start == 0:
				834	# The string is unchanged.
				835	return in_bytes
				836	else:
				837	# Store the final chunk.
				838	byte_chunks.append(in_bytes[chunk_start:])
				839	return b''.join(byte_chunks)
				840