Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | """Tests of Beautiful Soup as a whole.""" |
| 3 | |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 4 | from pdb import set_trace |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 5 | import logging |
| 6 | import unittest |
| 7 | import sys |
| 8 | import tempfile |
| 9 | |
| 10 | from bs4 import ( |
| 11 | BeautifulSoup, |
| 12 | BeautifulStoneSoup, |
| 13 | ) |
| 14 | from bs4.element import ( |
| 15 | CharsetMetaAttributeValue, |
| 16 | ContentMetaAttributeValue, |
| 17 | SoupStrainer, |
| 18 | NamespacedAttribute, |
| 19 | ) |
| 20 | import bs4.dammit |
| 21 | from bs4.dammit import ( |
| 22 | EntitySubstitution, |
| 23 | UnicodeDammit, |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 24 | EncodingDetector, |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 25 | ) |
| 26 | from bs4.testing import ( |
| 27 | SoupTest, |
| 28 | skipIf, |
| 29 | ) |
| 30 | import warnings |
| 31 | |
# lxml is an optional dependency; record whether it is importable so
# individual tests can be skipped when it is absent.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError as e:
    LXML_PRESENT = False

# Interpreter-version flags used to skip tests that depend on HTMLParser
# fixes missing from Python 2 before 2.7 and Python 3 before 3.2.
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
| 40 | |
class TestConstructor(SoupTest):
    """Tests of the BeautifulSoup constructor itself."""

    def test_short_unicode_input(self):
        # A short Unicode document survives parsing unchanged.
        markup = "<h1>éé</h1>"
        parsed = self.soup(markup)
        self.assertEqual("éé", parsed.h1.string)

    def test_embedded_null(self):
        # An embedded NUL character is preserved in the parsed string.
        markup = "<h1>foo\0bar</h1>"
        parsed = self.soup(markup)
        self.assertEqual("foo\0bar", parsed.h1.string)

    def test_exclude_encodings(self):
        # Ruling out the real encoding (UTF-8) forces the detector to
        # fall back to its next-best guess, Windows-1252.
        encoded = "Räksmörgås".encode("utf-8")
        parsed = self.soup(encoded, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", parsed.original_encoding)
| 58 | |
class TestWarnings(SoupTest):
    """Tests of warnings issued when BeautifulSoup is constructed."""

    def _assert_no_parser_specified(self, s, is_there=True):
        """Assert that *s* is the "no parser specified" warning message.

        Renamed from ``_no_parser_specified``: the tests below call
        ``self._assert_no_parser_specified``, so the old name made them
        fail with AttributeError instead of exercising the warning.
        (The ``is_there`` parameter is currently unused; kept for
        backward compatibility.)
        """
        # Comparing the first 80 characters of the (long) canonical
        # message is enough to identify it.
        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
        self.assertTrue(v)

    def test_warning_if_no_parser_specified(self):
        # Omitting the parser argument triggers the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_warning_if_parser_specified_too_vague(self):
        # "html" names a markup type, not a specific parser, so the
        # warning is still issued.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_no_warning_if_explicit_parser_specified(self):
        # Naming a concrete parser silences the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # The old BS3 argument name still works but warns, and the
        # value is honored as parse_only.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        # The old BS3 argument name still works but warns, and the
        # value is honored as from_encoding.
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        # A keyword argument that was never valid raises TypeError.
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)
| 102 | |
class TestFilenameOrURLWarnings(SoupTest):
    """Tests of warnings issued when the "markup" passed to
    BeautifulSoup looks like a filename or a URL rather than markup.

    Renamed from ``TestWarnings``: a second class definition with that
    name shadowed the earlier ``TestWarnings`` class, so the earlier
    class's tests were silently never collected or run.
    """

    def test_disk_file_warning(self):
        # Passing the name of a file that exists on disk triggers a
        # "looks like a filename" warning.
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            msg = str(w[0].message)
            self.assertTrue("looks like a filename" in msg)
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual(0, len(w))

    def test_url_warning(self):
        # A bare URL triggers a "looks like a URL" warning...
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("http://www.crummy.com/")
        msg = str(w[0].message)
        self.assertTrue("looks like a URL" in msg)

        # ...but a URL embedded in a longer string does not.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(w))
| 130 | |
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document via a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags (and their contents) should survive.
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        only_b_tags = SoupStrainer("b")
        parsed = self.soup(markup, parse_only=only_b_tags)
        self.assertEqual(parsed.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
| 138 | |
| 139 | |
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class.

    NOTE(review): several expected values in this class look as if they
    were HTML-entity-decoded by the code viewer this copy came from
    (e.g. substitution tests whose expected output equals the input).
    Verify each literal against the upstream bs4 source before editing.
    """
    def setUp(self):
        # EntitySubstitution is used through class methods only; no
        # instance is needed.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo∀\N{SNOWMAN}õbar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "‘’foo“”")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        # With make_quoted_attribute=False the value is returned bare.
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # NOTE(review): the expected literal below looks entity-decoded
        # in this copy — confirm against upstream.
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to "Bob\'s Bar""')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        # NOTE(review): as written, output equals input — presumably the
        # real expected value contains &lt;/&gt;; verify upstream.
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo<bar>")

    def test_xml_quoting_handles_ampersands(self):
        # NOTE(review): as written, output equals input — presumably the
        # real expected value contains &amp;; verify upstream.
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes ampersands even when they begin an
        # existing entity reference.
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&Aacute;T&T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # The _containing_entities variant leaves entity references alone.
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
| 207 | |
| 208 | |
class TestEncodingConversion(SoupTest):
    """Tests of decoding documents from, and encoding them back to,
    various encodings."""

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Sanity check: this is what the UTF-8 form looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is decoded to Unicode, and original_encoding is
        # reported as 'utf-8', a superset of ASCII.
        saved_chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def detect_nothing(data):
                return None
            # Neutralize chardet, which would otherwise realize that
            # the ASCII input is ASCII.
            bs4.dammit.chardet_dammit = detect_nothing
            ascii_bytes = b"<foo>a</foo>"
            parsed = self.soup(ascii_bytes)
            decoded = parsed.decode()
            self.assertTrue(isinstance(decoded, str))
            self.assertEqual(decoded, self.document_for(ascii_bytes.decode()))
            self.assertEqual(parsed.original_encoding.lower(), "utf-8")
        finally:
            # Restore logging and the real chardet hook.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = saved_chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input passes through untouched, and
        # original_encoding stays unset.
        parsed = self.soup(self.unicode_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(parsed.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is decoded to Unicode, and original_encoding is
        # recorded.
        parsed = self.soup(self.utf8_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The parse tree can be re-encoded as UTF-8.
        parsed = self.soup(self.unicode_data)
        self.assertEqual(parsed.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        # A non-ASCII attribute name round-trips through the parser.
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
| 268 | |
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit.

    NOTE(review): some expected strings below may have been
    entity-decoded by the code viewer this copy came from (see the
    smart-quote entity tests); verify literals against upstream bs4
    before editing them.
    """

    def test_unicode_input(self):
        # Unicode input needs no conversion and is passed through.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, Windows-1252 smart quotes become the
        # corresponding Unicode characters.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        # NOTE(review): expected value looks entity-decoded in this
        # copy (it matches the "html" variant below); verify upstream.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>‘’“”</foo>")

    def test_smart_quotes_to_html_entities(self):
        # NOTE(review): see the note on the "xml" variant above.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>‘’“”</foo>")

    def test_smart_quotes_to_ascii(self):
        # Smart quotes can be downgraded to plain ASCII quotes.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        # Valid UTF-8 bytes are detected as UTF-8 and decoded.
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')


    def test_convert_hebrew(self):
        # An explicitly suggested encoding is tried and used.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        # Bytes that merely resemble smart quotes inside multibyte
        # UTF-8 sequences must not be rewritten.
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested encoding that can't decode the data is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Suggested names that aren't codecs at all are skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        # A declared encoding containing undecodable bytes is reported
        # with U+FFFD in place of the junk.
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        # NOTE(review): bare assert is stripped under `python -O`;
        # self.assertIn would be the conventional unittest form.
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):

        # All of the HTML5 <meta charset> spellings should be honored.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            # NOTE(review): the parameter shadows the builtin `str`;
            # harmless here but worth renaming.
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Restore logging and the real chardet hook.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            # NOTE(review): the literal above is a str of four
            # Latin-1 characters, not a single 4-byte character; its
            # UTF-8 encoding still ends in \x93, so the test passes —
            # confirm the original intent.
            ):
            # NOTE(review): `input` shadows the builtin of the same name.
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)
| 442 | |
class TestNamedspacedAttribute(SoupTest):
    """Tests of NamespacedAttribute. (The "Namedspaced" spelling of the
    class name is a historical typo, kept for continuity.)"""

    def test_name_may_be_none(self):
        # With no name, the attribute is just its prefix.
        attr = NamespacedAttribute("xmlns", None)
        self.assertEqual(attr, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attr = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attr)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        first = NamespacedAttribute("a", "b", "c")
        second = NamespacedAttribute("a", "b", "c")
        self.assertEqual(first, second)

        # The namespace itself plays no part in equality...
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(first, no_namespace)

        # ...but a different name breaks it...
        other_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(first, other_name)

        # ...and so does a different prefix.
        other_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(first, other_prefix)
| 468 | |
| 469 | |
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of the attribute-value classes that rewrite a <meta> tag's
    declared charset when a document is re-encoded."""

    def test_charset_meta_attribute_value(self):
        # Renamed from test_content_meta_attribute_value: both methods
        # originally shared that name, so this one was shadowed by the
        # later definition and never ran.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # Encoding the value substitutes the target encoding's name.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        # A Content-Type style value keeps its structure; only the
        # charset portion is rewritten on encode.
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))