Blame - poky/bitbake/lib/bs4/dammit.py - mdmillerii/openbmc

blob: 7ad9e0dd1eb4a93d36be069052fe2ac0a7bf6edb [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	# -- coding: utf-8 --
				2	"""Beautiful Soup bonus library: Unicode, Dammit
				3
				4	This library converts a bytestream to Unicode through any means
				5	necessary. It is heavily based on code from Mark Pilgrim's Universal
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	6	Feed Parser. It works best on XML and HTML, but it does not rewrite the
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	7	XML or HTML to reflect a new encoding; that's the tree builder's job.
				8	"""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	9	__license__ = "MIT"
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	10
				11	import codecs
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	12	from html.entities import codepoint2name
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	13	import re
				14	import logging
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	15
				16	# Import a library to autodetect character encodings.
				17	chardet_type = None
				18	try:
				19	# First try the fast C implementation.
				20	# PyPI package: cchardet
				21	import cchardet
				22	def chardet_dammit(s):
				23	return cchardet.detect(s)['encoding']
				24	except ImportError:
				25	try:
				26	# Fall back to the pure Python implementation
				27	# Debian package: python-chardet
				28	# PyPI package: chardet
				29	import chardet
				30	def chardet_dammit(s):
				31	return chardet.detect(s)['encoding']
				32	#import chardet.constants
				33	#chardet.constants._debug = 1
				34	except ImportError:
				35	# No chardet available.
				36	def chardet_dammit(s):
				37	return None
				38
# Matches an XML declaration at the very start of a bytestring and
# captures the value of its encoding pseudo-attribute, e.g.
#   <?xml version="1.0" encoding="utf-8"?>  ->  b"utf-8"
# The `.*` / `.*?` quantifiers are required: without them the pattern
# only accepts a single character before "encoding=" and a
# zero-or-one character encoding name.
xml_encoding_re = re.compile(
    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)

# Matches a <meta> tag that carries a charset declaration and captures
# the charset name, e.g.
#   <meta charset="euc-jp">  ->  b"euc-jp"
# Whitespace around "meta" and "=" is optional (`\s*`), and the name is
# matched lazily up to the first terminator character.
html_meta_re = re.compile(
    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	43
				44	class EntitySubstitution(object):
				45
				46	"""Substitute XML or HTML entities for the corresponding characters."""
				47
				48	def _populate_class_variables():
				49	lookup = {}
				50	reverse_lookup = {}
				51	characters_for_re = []
				52	for codepoint, name in list(codepoint2name.items()):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	53	character = chr(codepoint)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	54	if codepoint != 34:
				55	# There's no point in turning the quotation mark into
				56	# ", unless it happens within an attribute value, which
				57	# is handled elsewhere.
				58	characters_for_re.append(character)
				59	lookup[character] = name
				60	# But we do want to turn " into the quotation mark.
				61	reverse_lookup[name] = character
				62	re_definition = "[%s]" % "".join(characters_for_re)
				63	return lookup, reverse_lookup, re.compile(re_definition)
				64	(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
				65	CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
				66
				67	CHARACTER_TO_XML_ENTITY = {
				68	"'": "apos",
				69	'"': "quot",
				70	"&": "amp",
				71	"<": "lt",
				72	">": "gt",
				73	}
				74
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	75	BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]\|"
				76	r"&(?!#\d+;\|#x[0-9a-fA-F]+;\|\w+;)"
				77	r")")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	78
Brad Bishop	1932369	2019-04-05 15:28:33 -0400	[diff] [blame]	79	AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	80
				81	@classmethod
				82	def _substitute_html_entity(cls, matchobj):
				83	entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
				84	return "&%s;" % entity
				85
				86	@classmethod
				87	def _substitute_xml_entity(cls, matchobj):
				88	"""Used with a regular expression to substitute the
				89	appropriate XML entity for an XML special character."""
				90	entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
				91	return "&%s;" % entity
				92
				93	@classmethod
				94	def quoted_attribute_value(self, value):
				95	"""Make a value into a quoted XML attribute, possibly escaping it.
				96
				97	Most strings will be quoted using double quotes.
				98
				99	Bob's Bar -> "Bob's Bar"
				100
				101	If a string contains double quotes, it will be quoted using
				102	single quotes.
				103
				104	Welcome to "my bar" -> 'Welcome to "my bar"'
				105
				106	If a string contains both single and double quotes, the
				107	double quotes will be escaped, and the string will be quoted
				108	using double quotes.
				109
				110	Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
				111	"""
				112	quote_with = '"'
				113	if '"' in value:
				114	if "'" in value:
				115	# The string contains both single and double
				116	# quotes. Turn the double quotes into
				117	# entities. We quote the double quotes rather than
				118	# the single quotes because the entity name is
				119	# """ whether this is HTML or XML. If we
				120	# quoted the single quotes, we'd have to decide
				121	# between ' and &squot;.
				122	replace_with = """
				123	value = value.replace('"', replace_with)
				124	else:
				125	# There are double quotes but no single quotes.
				126	# We can use single quotes to quote the attribute.
				127	quote_with = "'"
				128	return quote_with + value + quote_with
				129
				130	@classmethod
				131	def substitute_xml(cls, value, make_quoted_attribute=False):
				132	"""Substitute XML entities for special XML characters.
				133
				134	:param value: A string to be substituted. The less-than sign
				135	will become <, the greater-than sign will become >,
				136	and any ampersands will become &. If you want ampersands
				137	that appear to be part of an entity definition to be left
				138	alone, use substitute_xml_containing_entities() instead.
				139
				140	:param make_quoted_attribute: If True, then the string will be
				141	quoted, as befits an attribute value.
				142	"""
				143	# Escape angle brackets and ampersands.
				144	value = cls.AMPERSAND_OR_BRACKET.sub(
				145	cls._substitute_xml_entity, value)
				146
				147	if make_quoted_attribute:
				148	value = cls.quoted_attribute_value(value)
				149	return value
				150
				151	@classmethod
				152	def substitute_xml_containing_entities(
				153	cls, value, make_quoted_attribute=False):
				154	"""Substitute XML entities for special XML characters.
				155
				156	:param value: A string to be substituted. The less-than sign will
				157	become <, the greater-than sign will become >, and any
				158	ampersands that are not part of an entity defition will
				159	become &.
				160
				161	:param make_quoted_attribute: If True, then the string will be
				162	quoted, as befits an attribute value.
				163	"""
				164	# Escape angle brackets, and ampersands that aren't part of
				165	# entities.
				166	value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
				167	cls._substitute_xml_entity, value)
				168
				169	if make_quoted_attribute:
				170	value = cls.quoted_attribute_value(value)
				171	return value
				172
				173	@classmethod
				174	def substitute_html(cls, s):
				175	"""Replace certain Unicode characters with named HTML entities.
				176
				177	This differs from data.encode(encoding, 'xmlcharrefreplace')
				178	in that the goal is to make the result more readable (to those
				179	with ASCII displays) rather than to recover from
				180	errors. There's absolutely nothing wrong with a UTF-8 string
				181	containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
				182	character with "é" will make it more readable to some
				183	people.
				184	"""
				185	return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
				186	cls._substitute_html_entity, s)
				187
				188
				189	class EncodingDetector:
				190	"""Suggests a number of possible encodings for a bytestring.
				191
				192	Order of precedence:
				193
				194	1. Encodings you specifically tell EncodingDetector to try first
				195	(the override_encodings argument to the constructor).
				196
				197	2. An encoding declared within the bytestring itself, either in an
				198	XML declaration (if the bytestring is to be interpreted as an XML
				199	document), or in a <meta> tag (if the bytestring is to be
				200	interpreted as an HTML document.)
				201
				202	3. An encoding detected through textual analysis by chardet,
				203	cchardet, or a similar external library.
				204
				205	4. UTF-8.
				206
				207	5. Windows-1252.
				208	"""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	209	def __init__(self, markup, override_encodings=None, is_html=False,
				210	exclude_encodings=None):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	211	self.override_encodings = override_encodings or []
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	212	exclude_encodings = exclude_encodings or []
				213	self.exclude_encodings = set([x.lower() for x in exclude_encodings])
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	214	self.chardet_encoding = None
				215	self.is_html = is_html
				216	self.declared_encoding = None
				217
				218	# First order of business: strip a byte-order mark.
				219	self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
				220
				221	def _usable(self, encoding, tried):
				222	if encoding is not None:
				223	encoding = encoding.lower()
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	224	if encoding in self.exclude_encodings:
				225	return False
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	226	if encoding not in tried:
				227	tried.add(encoding)
				228	return True
				229	return False
				230
				231	@property
				232	def encodings(self):
				233	"""Yield a number of encodings that might work for this markup."""
				234	tried = set()
				235	for e in self.override_encodings:
				236	if self._usable(e, tried):
				237	yield e
				238
				239	# Did the document originally start with a byte-order mark
				240	# that indicated its encoding?
				241	if self._usable(self.sniffed_encoding, tried):
				242	yield self.sniffed_encoding
				243
				244	# Look within the document for an XML or HTML encoding
				245	# declaration.
				246	if self.declared_encoding is None:
				247	self.declared_encoding = self.find_declared_encoding(
				248	self.markup, self.is_html)
				249	if self._usable(self.declared_encoding, tried):
				250	yield self.declared_encoding
				251
				252	# Use third-party character set detection to guess at the
				253	# encoding.
				254	if self.chardet_encoding is None:
				255	self.chardet_encoding = chardet_dammit(self.markup)
				256	if self._usable(self.chardet_encoding, tried):
				257	yield self.chardet_encoding
				258
				259	# As a last-ditch effort, try utf-8 and windows-1252.
				260	for e in ('utf-8', 'windows-1252'):
				261	if self._usable(e, tried):
				262	yield e
				263
				264	@classmethod
				265	def strip_byte_order_mark(cls, data):
				266	"""If a byte-order mark is present, strip it and return the encoding it implies."""
				267	encoding = None
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	268	if isinstance(data, str):
				269	# Unicode data cannot have a byte-order mark.
				270	return data, encoding
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	271	if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
				272	and (data[2:4] != '\x00\x00'):
				273	encoding = 'utf-16be'
				274	data = data[2:]
				275	elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
				276	and (data[2:4] != '\x00\x00'):
				277	encoding = 'utf-16le'
				278	data = data[2:]
				279	elif data[:3] == b'\xef\xbb\xbf':
				280	encoding = 'utf-8'
				281	data = data[3:]
				282	elif data[:4] == b'\x00\x00\xfe\xff':
				283	encoding = 'utf-32be'
				284	data = data[4:]
				285	elif data[:4] == b'\xff\xfe\x00\x00':
				286	encoding = 'utf-32le'
				287	data = data[4:]
				288	return data, encoding
				289
				290	@classmethod
				291	def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
				292	"""Given a document, tries to find its declared encoding.
				293
				294	An XML encoding is declared at the beginning of the document.
				295
				296	An HTML encoding is declared in a <meta> tag, hopefully near the
				297	beginning of the document.
				298	"""
				299	if search_entire_document:
				300	xml_endpos = html_endpos = len(markup)
				301	else:
				302	xml_endpos = 1024
				303	html_endpos = max(2048, int(len(markup) * 0.05))
				304
				305	declared_encoding = None
				306	declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
				307	if not declared_encoding_match and is_html:
				308	declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
				309	if declared_encoding_match is not None:
				310	declared_encoding = declared_encoding_match.groups()[0].decode(
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	311	'ascii', 'replace')
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	312	if declared_encoding:
				313	return declared_encoding.lower()
				314	return None
				315
				316	class UnicodeDammit:
				317	"""A class for detecting the encoding of a *ML document and
				318	converting it to a Unicode string. If the source encoding is
				319	windows-1252, can replace MS smart quotes with their HTML or XML
				320	equivalents."""
				321
				322	# This dictionary maps commonly seen values for "charset" in HTML
				323	# meta tags to the corresponding Python codec names. It only covers
				324	# values that aren't in Python's aliases and can't be determined
				325	# by the heuristics in find_codec.
				326	CHARSET_ALIASES = {"macintosh": "mac-roman",
				327	"x-sjis": "shift-jis"}
				328
				329	ENCODINGS_WITH_SMART_QUOTES = [
				330	"windows-1252",
				331	"iso-8859-1",
				332	"iso-8859-2",
				333	]
				334
				335	def __init__(self, markup, override_encodings=[],
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	336	smart_quotes_to=None, is_html=False, exclude_encodings=[]):
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	337	self.smart_quotes_to = smart_quotes_to
				338	self.tried_encodings = []
				339	self.contains_replacement_characters = False
				340	self.is_html = is_html
				341
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	342	self.detector = EncodingDetector(
				343	markup, override_encodings, is_html, exclude_encodings)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	344
				345	# Short-circuit if the data is in Unicode to begin with.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	346	if isinstance(markup, str) or markup == '':
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	347	self.markup = markup
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	348	self.unicode_markup = str(markup)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	349	self.original_encoding = None
				350	return
				351
				352	# The encoding detector may have stripped a byte-order mark.
				353	# Use the stripped markup from this point on.
				354	self.markup = self.detector.markup
				355
				356	u = None
				357	for encoding in self.detector.encodings:
				358	markup = self.detector.markup
				359	u = self._convert_from(encoding)
				360	if u is not None:
				361	break
				362
				363	if not u:
				364	# None of the encodings worked. As an absolute last resort,
				365	# try them again with character replacement.
				366
				367	for encoding in self.detector.encodings:
				368	if encoding != "ascii":
				369	u = self._convert_from(encoding, "replace")
				370	if u is not None:
				371	logging.warning(
				372	"Some characters could not be decoded, and were "
				373	"replaced with REPLACEMENT CHARACTER.")
				374	self.contains_replacement_characters = True
				375	break
				376
				377	# If none of that worked, we could at this point force it to
				378	# ASCII, but that would destroy so much data that I think
				379	# giving up is better.
				380	self.unicode_markup = u
				381	if not u:
				382	self.original_encoding = None
				383
				384	def _sub_ms_char(self, match):
				385	"""Changes a MS smart quote character to an XML or HTML
				386	entity, or an ASCII character."""
				387	orig = match.group(1)
				388	if self.smart_quotes_to == 'ascii':
				389	sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
				390	else:
				391	sub = self.MS_CHARS.get(orig)
				392	if type(sub) == tuple:
				393	if self.smart_quotes_to == 'xml':
				394	sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
				395	else:
				396	sub = '&'.encode() + sub[0].encode() + ';'.encode()
				397	else:
				398	sub = sub.encode()
				399	return sub
				400
				401	def _convert_from(self, proposed, errors="strict"):
				402	proposed = self.find_codec(proposed)
				403	if not proposed or (proposed, errors) in self.tried_encodings:
				404	return None
				405	self.tried_encodings.append((proposed, errors))
				406	markup = self.markup
				407	# Convert smart quotes to HTML if coming from an encoding
				408	# that might have them.
				409	if (self.smart_quotes_to is not None
				410	and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
				411	smart_quotes_re = b"([\x80-\x9f])"
				412	smart_quotes_compiled = re.compile(smart_quotes_re)
				413	markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
				414
				415	try:
				416	#print "Trying to convert document to %s (errors=%s)" % (
				417	# proposed, errors)
				418	u = self._to_unicode(markup, proposed, errors)
				419	self.markup = u
				420	self.original_encoding = proposed
				421	except Exception as e:
				422	#print "That didn't work!"
				423	#print e
				424	return None
				425	#print "Correct encoding: %s" % proposed
				426	return self.markup
				427
				428	def _to_unicode(self, data, encoding, errors="strict"):
				429	'''Given a string and its encoding, decodes the string into Unicode.
				430	%encoding is a string recognized by encodings.aliases'''
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	431	return str(data, encoding, errors)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	432
				433	@property
				434	def declared_html_encoding(self):
				435	if not self.is_html:
				436	return None
				437	return self.detector.declared_encoding
				438
				439	def find_codec(self, charset):
				440	value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
				441	or (charset and self._codec(charset.replace("-", "")))
				442	or (charset and self._codec(charset.replace("-", "_")))
				443	or (charset and charset.lower())
				444	or charset
				445	)
				446	if value:
				447	return value.lower()
				448	return None
				449
				450	def _codec(self, charset):
				451	if not charset:
				452	return charset
				453	codec = None
				454	try:
				455	codecs.lookup(charset)
				456	codec = charset
				457	except (LookupError, ValueError):
				458	pass
				459	return codec
				460
				461
				462	# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
				463	MS_CHARS = {b'\x80': ('euro', '20AC'),
				464	b'\x81': ' ',
				465	b'\x82': ('sbquo', '201A'),
				466	b'\x83': ('fnof', '192'),
				467	b'\x84': ('bdquo', '201E'),
				468	b'\x85': ('hellip', '2026'),
				469	b'\x86': ('dagger', '2020'),
				470	b'\x87': ('Dagger', '2021'),
				471	b'\x88': ('circ', '2C6'),
				472	b'\x89': ('permil', '2030'),
				473	b'\x8A': ('Scaron', '160'),
				474	b'\x8B': ('lsaquo', '2039'),
				475	b'\x8C': ('OElig', '152'),
				476	b'\x8D': '?',
				477	b'\x8E': ('#x17D', '17D'),
				478	b'\x8F': '?',
				479	b'\x90': '?',
				480	b'\x91': ('lsquo', '2018'),
				481	b'\x92': ('rsquo', '2019'),
				482	b'\x93': ('ldquo', '201C'),
				483	b'\x94': ('rdquo', '201D'),
				484	b'\x95': ('bull', '2022'),
				485	b'\x96': ('ndash', '2013'),
				486	b'\x97': ('mdash', '2014'),
				487	b'\x98': ('tilde', '2DC'),
				488	b'\x99': ('trade', '2122'),
				489	b'\x9a': ('scaron', '161'),
				490	b'\x9b': ('rsaquo', '203A'),
				491	b'\x9c': ('oelig', '153'),
				492	b'\x9d': '?',
				493	b'\x9e': ('#x17E', '17E'),
				494	b'\x9f': ('Yuml', ''),}
				495
				496	# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
				497	# horrors like stripping diacritical marks to turn á into a, but also
				498	# contains non-horrors like turning “ into ".
				499	MS_CHARS_TO_ASCII = {
				500	b'\x80' : 'EUR',
				501	b'\x81' : ' ',
				502	b'\x82' : ',',
				503	b'\x83' : 'f',
				504	b'\x84' : ',,',
				505	b'\x85' : '...',
				506	b'\x86' : '+',
				507	b'\x87' : '++',
				508	b'\x88' : '^',
				509	b'\x89' : '%',
				510	b'\x8a' : 'S',
				511	b'\x8b' : '<',
				512	b'\x8c' : 'OE',
				513	b'\x8d' : '?',
				514	b'\x8e' : 'Z',
				515	b'\x8f' : '?',
				516	b'\x90' : '?',
				517	b'\x91' : "'",
				518	b'\x92' : "'",
				519	b'\x93' : '"',
				520	b'\x94' : '"',
				521	b'\x95' : '*',
				522	b'\x96' : '-',
				523	b'\x97' : '--',
				524	b'\x98' : '~',
				525	b'\x99' : '(TM)',
				526	b'\x9a' : 's',
				527	b'\x9b' : '>',
				528	b'\x9c' : 'oe',
				529	b'\x9d' : '?',
				530	b'\x9e' : 'z',
				531	b'\x9f' : 'Y',
				532	b'\xa0' : ' ',
				533	b'\xa1' : '!',
				534	b'\xa2' : 'c',
				535	b'\xa3' : 'GBP',
				536	b'\xa4' : '$', #This approximation is especially parochial--this is the
				537	#generic currency symbol.
				538	b'\xa5' : 'YEN',
				539	b'\xa6' : '\|',
				540	b'\xa7' : 'S',
				541	b'\xa8' : '..',
				542	b'\xa9' : '',
				543	b'\xaa' : '(th)',
				544	b'\xab' : '<<',
				545	b'\xac' : '!',
				546	b'\xad' : ' ',
				547	b'\xae' : '(R)',
				548	b'\xaf' : '-',
				549	b'\xb0' : 'o',
				550	b'\xb1' : '+-',
				551	b'\xb2' : '2',
				552	b'\xb3' : '3',
				553	b'\xb4' : ("'", 'acute'),
				554	b'\xb5' : 'u',
				555	b'\xb6' : 'P',
				556	b'\xb7' : '*',
				557	b'\xb8' : ',',
				558	b'\xb9' : '1',
				559	b'\xba' : '(th)',
				560	b'\xbb' : '>>',
				561	b'\xbc' : '1/4',
				562	b'\xbd' : '1/2',
				563	b'\xbe' : '3/4',
				564	b'\xbf' : '?',
				565	b'\xc0' : 'A',
				566	b'\xc1' : 'A',
				567	b'\xc2' : 'A',
				568	b'\xc3' : 'A',
				569	b'\xc4' : 'A',
				570	b'\xc5' : 'A',
				571	b'\xc6' : 'AE',
				572	b'\xc7' : 'C',
				573	b'\xc8' : 'E',
				574	b'\xc9' : 'E',
				575	b'\xca' : 'E',
				576	b'\xcb' : 'E',
				577	b'\xcc' : 'I',
				578	b'\xcd' : 'I',
				579	b'\xce' : 'I',
				580	b'\xcf' : 'I',
				581	b'\xd0' : 'D',
				582	b'\xd1' : 'N',
				583	b'\xd2' : 'O',
				584	b'\xd3' : 'O',
				585	b'\xd4' : 'O',
				586	b'\xd5' : 'O',
				587	b'\xd6' : 'O',
				588	b'\xd7' : '*',
				589	b'\xd8' : 'O',
				590	b'\xd9' : 'U',
				591	b'\xda' : 'U',
				592	b'\xdb' : 'U',
				593	b'\xdc' : 'U',
				594	b'\xdd' : 'Y',
				595	b'\xde' : 'b',
				596	b'\xdf' : 'B',
				597	b'\xe0' : 'a',
				598	b'\xe1' : 'a',
				599	b'\xe2' : 'a',
				600	b'\xe3' : 'a',
				601	b'\xe4' : 'a',
				602	b'\xe5' : 'a',
				603	b'\xe6' : 'ae',
				604	b'\xe7' : 'c',
				605	b'\xe8' : 'e',
				606	b'\xe9' : 'e',
				607	b'\xea' : 'e',
				608	b'\xeb' : 'e',
				609	b'\xec' : 'i',
				610	b'\xed' : 'i',
				611	b'\xee' : 'i',
				612	b'\xef' : 'i',
				613	b'\xf0' : 'o',
				614	b'\xf1' : 'n',
				615	b'\xf2' : 'o',
				616	b'\xf3' : 'o',
				617	b'\xf4' : 'o',
				618	b'\xf5' : 'o',
				619	b'\xf6' : 'o',
				620	b'\xf7' : '/',
				621	b'\xf8' : 'o',
				622	b'\xf9' : 'u',
				623	b'\xfa' : 'u',
				624	b'\xfb' : 'u',
				625	b'\xfc' : 'u',
				626	b'\xfd' : 'y',
				627	b'\xfe' : 'b',
				628	b'\xff' : 'y',
				629	}
				630
				631	# A map used when removing rogue Windows-1252/ISO-8859-1
				632	# characters in otherwise UTF-8 documents.
				633	#
				634	# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
				635	# Windows-1252.
				636	WINDOWS_1252_TO_UTF8 = {
				637	0x80 : b'\xe2\x82\xac', # €
				638	0x82 : b'\xe2\x80\x9a', # ‚
				639	0x83 : b'\xc6\x92', # ƒ
				640	0x84 : b'\xe2\x80\x9e', # „
				641	0x85 : b'\xe2\x80\xa6', # …
				642	0x86 : b'\xe2\x80\xa0', # †
				643	0x87 : b'\xe2\x80\xa1', # ‡
				644	0x88 : b'\xcb\x86', # ˆ
				645	0x89 : b'\xe2\x80\xb0', # ‰
				646	0x8a : b'\xc5\xa0', # Š
				647	0x8b : b'\xe2\x80\xb9', # ‹
				648	0x8c : b'\xc5\x92', # Œ
				649	0x8e : b'\xc5\xbd', # Ž
				650	0x91 : b'\xe2\x80\x98', # ‘
				651	0x92 : b'\xe2\x80\x99', # ’
				652	0x93 : b'\xe2\x80\x9c', # “
				653	0x94 : b'\xe2\x80\x9d', # ”
				654	0x95 : b'\xe2\x80\xa2', # •
				655	0x96 : b'\xe2\x80\x93', # –
				656	0x97 : b'\xe2\x80\x94', # —
				657	0x98 : b'\xcb\x9c', # ˜
				658	0x99 : b'\xe2\x84\xa2', # ™
				659	0x9a : b'\xc5\xa1', # š
				660	0x9b : b'\xe2\x80\xba', # ›
				661	0x9c : b'\xc5\x93', # œ
				662	0x9e : b'\xc5\xbe', # ž
				663	0x9f : b'\xc5\xb8', # Ÿ
				664	0xa0 : b'\xc2\xa0', #
				665	0xa1 : b'\xc2\xa1', # ¡
				666	0xa2 : b'\xc2\xa2', # ¢
				667	0xa3 : b'\xc2\xa3', # £
				668	0xa4 : b'\xc2\xa4', # ¤
				669	0xa5 : b'\xc2\xa5', # ¥
				670	0xa6 : b'\xc2\xa6', # ¦
				671	0xa7 : b'\xc2\xa7', # §
				672	0xa8 : b'\xc2\xa8', # ¨
				673	0xa9 : b'\xc2\xa9', # ©
				674	0xaa : b'\xc2\xaa', # ª
				675	0xab : b'\xc2\xab', # «
				676	0xac : b'\xc2\xac', # ¬
				677	0xad : b'\xc2\xad', #
				678	0xae : b'\xc2\xae', # ®
				679	0xaf : b'\xc2\xaf', # ¯
				680	0xb0 : b'\xc2\xb0', # °
				681	0xb1 : b'\xc2\xb1', # ±
				682	0xb2 : b'\xc2\xb2', # ²
				683	0xb3 : b'\xc2\xb3', # ³
				684	0xb4 : b'\xc2\xb4', # ´
				685	0xb5 : b'\xc2\xb5', # µ
				686	0xb6 : b'\xc2\xb6', # ¶
				687	0xb7 : b'\xc2\xb7', # ·
				688	0xb8 : b'\xc2\xb8', # ¸
				689	0xb9 : b'\xc2\xb9', # ¹
				690	0xba : b'\xc2\xba', # º
				691	0xbb : b'\xc2\xbb', # »
				692	0xbc : b'\xc2\xbc', # ¼
				693	0xbd : b'\xc2\xbd', # ½
				694	0xbe : b'\xc2\xbe', # ¾
				695	0xbf : b'\xc2\xbf', # ¿
				696	0xc0 : b'\xc3\x80', # À
				697	0xc1 : b'\xc3\x81', # Á
				698	0xc2 : b'\xc3\x82', # Â
				699	0xc3 : b'\xc3\x83', # Ã
				700	0xc4 : b'\xc3\x84', # Ä
				701	0xc5 : b'\xc3\x85', # Å
				702	0xc6 : b'\xc3\x86', # Æ
				703	0xc7 : b'\xc3\x87', # Ç
				704	0xc8 : b'\xc3\x88', # È
				705	0xc9 : b'\xc3\x89', # É
				706	0xca : b'\xc3\x8a', # Ê
				707	0xcb : b'\xc3\x8b', # Ë
				708	0xcc : b'\xc3\x8c', # Ì
				709	0xcd : b'\xc3\x8d', # Í
				710	0xce : b'\xc3\x8e', # Î
				711	0xcf : b'\xc3\x8f', # Ï
				712	0xd0 : b'\xc3\x90', # Ð
				713	0xd1 : b'\xc3\x91', # Ñ
				714	0xd2 : b'\xc3\x92', # Ò
				715	0xd3 : b'\xc3\x93', # Ó
				716	0xd4 : b'\xc3\x94', # Ô
				717	0xd5 : b'\xc3\x95', # Õ
				718	0xd6 : b'\xc3\x96', # Ö
				719	0xd7 : b'\xc3\x97', # ×
				720	0xd8 : b'\xc3\x98', # Ø
				721	0xd9 : b'\xc3\x99', # Ù
				722	0xda : b'\xc3\x9a', # Ú
				723	0xdb : b'\xc3\x9b', # Û
				724	0xdc : b'\xc3\x9c', # Ü
				725	0xdd : b'\xc3\x9d', # Ý
				726	0xde : b'\xc3\x9e', # Þ
				727	0xdf : b'\xc3\x9f', # ß
				728	0xe0 : b'\xc3\xa0', # à
				729	0xe1 : b'\xa1', # á
				730	0xe2 : b'\xc3\xa2', # â
				731	0xe3 : b'\xc3\xa3', # ã
				732	0xe4 : b'\xc3\xa4', # ä
				733	0xe5 : b'\xc3\xa5', # å
				734	0xe6 : b'\xc3\xa6', # æ
				735	0xe7 : b'\xc3\xa7', # ç
				736	0xe8 : b'\xc3\xa8', # è
				737	0xe9 : b'\xc3\xa9', # é
				738	0xea : b'\xc3\xaa', # ê
				739	0xeb : b'\xc3\xab', # ë
				740	0xec : b'\xc3\xac', # ì
				741	0xed : b'\xc3\xad', # í
				742	0xee : b'\xc3\xae', # î
				743	0xef : b'\xc3\xaf', # ï
				744	0xf0 : b'\xc3\xb0', # ð
				745	0xf1 : b'\xc3\xb1', # ñ
				746	0xf2 : b'\xc3\xb2', # ò
				747	0xf3 : b'\xc3\xb3', # ó
				748	0xf4 : b'\xc3\xb4', # ô
				749	0xf5 : b'\xc3\xb5', # õ
				750	0xf6 : b'\xc3\xb6', # ö
				751	0xf7 : b'\xc3\xb7', # ÷
				752	0xf8 : b'\xc3\xb8', # ø
				753	0xf9 : b'\xc3\xb9', # ù
				754	0xfa : b'\xc3\xba', # ú
				755	0xfb : b'\xc3\xbb', # û
				756	0xfc : b'\xc3\xbc', # ü
				757	0xfd : b'\xc3\xbd', # ý
				758	0xfe : b'\xc3\xbe', # þ
				759	}
				760
				761	MULTIBYTE_MARKERS_AND_SIZES = [
				762	(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
				763	(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
				764	(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
				765	]
				766
				767	FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
				768	LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
				769
				770	@classmethod
				771	def detwingle(cls, in_bytes, main_encoding="utf8",
				772	embedded_encoding="windows-1252"):
				773	"""Fix characters from one encoding embedded in some other encoding.
				774
				775	Currently the only situation supported is Windows-1252 (or its
				776	subset ISO-8859-1), embedded in UTF-8.
				777
				778	The input must be a bytestring. If you've already converted
				779	the document to Unicode, you're too late.
				780
				781	The output is a bytestring in which `embedded_encoding`
				782	characters have been converted to their `main_encoding`
				783	equivalents.
				784	"""
				785	if embedded_encoding.replace('_', '-').lower() not in (
				786	'windows-1252', 'windows_1252'):
				787	raise NotImplementedError(
				788	"Windows-1252 and ISO-8859-1 are the only currently supported "
				789	"embedded encodings.")
				790
				791	if main_encoding.lower() not in ('utf8', 'utf-8'):
				792	raise NotImplementedError(
				793	"UTF-8 is the only currently supported main encoding.")
				794
				795	byte_chunks = []
				796
				797	chunk_start = 0
				798	pos = 0
				799	while pos < len(in_bytes):
				800	byte = in_bytes[pos]
				801	if not isinstance(byte, int):
				802	# Python 2.x
				803	byte = ord(byte)
				804	if (byte >= cls.FIRST_MULTIBYTE_MARKER
				805	and byte <= cls.LAST_MULTIBYTE_MARKER):
				806	# This is the start of a UTF-8 multibyte character. Skip
				807	# to the end.
				808	for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
				809	if byte >= start and byte <= end:
				810	pos += size
				811	break
				812	elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
				813	# We found a Windows-1252 character!
				814	# Save the string up to this point as a chunk.
				815	byte_chunks.append(in_bytes[chunk_start:pos])
				816
				817	# Now translate the Windows-1252 character into UTF-8
				818	# and add it as another, one-byte chunk.
				819	byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
				820	pos += 1
				821	chunk_start = pos
				822	else:
				823	# Go on to the next character.
				824	pos += 1
				825	if chunk_start == 0:
				826	# The string is unchanged.
				827	return in_bytes
				828	else:
				829	# Store the final chunk.
				830	byte_chunks.append(in_bytes[chunk_start:])
				831	return b''.join(byte_chunks)
				832