Blame - poky/bitbake/lib/bs4/testing.py - mdmillerii/openbmc

blob: 3a2f260e24ee6ac4fcf6e6ab372eb1e7ced5d994 [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	"""Helper classes for tests."""
				2
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	3	__license__ = "MIT"
				4
				5	import pickle
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	6	import copy
				7	import functools
				8	import unittest
				9	from unittest import TestCase
				10	from bs4 import BeautifulSoup
				11	from bs4.element import (
				12	CharsetMetaAttributeValue,
				13	Comment,
				14	ContentMetaAttributeValue,
				15	Doctype,
				16	SoupStrainer,
				17	)
				18
				19	from bs4.builder import HTMLParserTreeBuilder
				20	default_builder = HTMLParserTreeBuilder
				21
				22
				23	class SoupTest(unittest.TestCase):
				24
				25	@property
				26	def default_builder(self):
				27	return default_builder()
				28
				29	def soup(self, markup, **kwargs):
				30	"""Build a Beautiful Soup object from markup."""
				31	builder = kwargs.pop('builder', self.default_builder)
				32	return BeautifulSoup(markup, builder=builder, **kwargs)
				33
				34	def document_for(self, markup):
				35	"""Turn an HTML fragment into a document.
				36
				37	The details depend on the builder.
				38	"""
				39	return self.default_builder.test_fragment_to_document(markup)
				40
				41	def assertSoupEquals(self, to_parse, compare_parsed_to=None):
				42	builder = self.default_builder
				43	obj = BeautifulSoup(to_parse, builder=builder)
				44	if compare_parsed_to is None:
				45	compare_parsed_to = to_parse
				46
				47	self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
				48
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	49	def assertConnectedness(self, element):
				50	"""Ensure that next_element and previous_element are properly
				51	set for all descendants of the given element.
				52	"""
				53	earlier = None
				54	for e in element.descendants:
				55	if earlier:
				56	self.assertEqual(e, earlier.next_element)
				57	self.assertEqual(earlier, e.previous_element)
				58	earlier = e
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	59
				60	class HTMLTreeBuilderSmokeTest(object):
				61
				62	"""A basic test of a treebuilder's competence.
				63
				64	Any HTML treebuilder, present or future, should be able to pass
				65	these tests. With invalid markup, there's room for interpretation,
				66	and different parsers can handle it differently. But with the
				67	markup in these tests, there's not much room for interpretation.
				68	"""
				69
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	70	def test_pickle_and_unpickle_identity(self):
				71	# Pickling a tree, then unpickling it, yields a tree identical
				72	# to the original.
				73	tree = self.soup("<a><b>foo</a>")
				74	dumped = pickle.dumps(tree, 2)
				75	loaded = pickle.loads(dumped)
				76	self.assertEqual(loaded.__class__, BeautifulSoup)
				77	self.assertEqual(loaded.decode(), tree.decode())
				78
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	79	def assertDoctypeHandled(self, doctype_fragment):
				80	"""Assert that a given doctype string is handled correctly."""
				81	doctype_str, soup = self._document_with_doctype(doctype_fragment)
				82
				83	# Make sure a Doctype object was created.
				84	doctype = soup.contents[0]
				85	self.assertEqual(doctype.__class__, Doctype)
				86	self.assertEqual(doctype, doctype_fragment)
				87	self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
				88
				89	# Make sure that the doctype was correctly associated with the
				90	# parse tree and that the rest of the document parsed.
				91	self.assertEqual(soup.p.contents[0], 'foo')
				92
				93	def _document_with_doctype(self, doctype_fragment):
				94	"""Generate and parse a document with the given doctype."""
				95	doctype = '<!DOCTYPE %s>' % doctype_fragment
				96	markup = doctype + '\n<p>foo</p>'
				97	soup = self.soup(markup)
				98	return doctype, soup
				99
				100	def test_normal_doctypes(self):
				101	"""Make sure normal, everyday HTML doctypes are handled correctly."""
				102	self.assertDoctypeHandled("html")
				103	self.assertDoctypeHandled(
				104	'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
				105
				106	def test_empty_doctype(self):
				107	soup = self.soup("<!DOCTYPE>")
				108	doctype = soup.contents[0]
				109	self.assertEqual("", doctype.strip())
				110
				111	def test_public_doctype_with_url(self):
				112	doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
				113	self.assertDoctypeHandled(doctype)
				114
				115	def test_system_doctype(self):
				116	self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
				117
				118	def test_namespaced_system_doctype(self):
				119	# We can handle a namespaced doctype with a system ID.
				120	self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
				121
				122	def test_namespaced_public_doctype(self):
				123	# Test a namespaced doctype with a public id.
				124	self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
				125
				126	def test_real_xhtml_document(self):
				127	"""A real XHTML document should come out more or less the same as it went in."""
				128	markup = b"""<?xml version="1.0" encoding="utf-8"?>
				129	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
				130	<html xmlns="http://www.w3.org/1999/xhtml">
				131	<head><title>Hello.</title></head>
				132	<body>Goodbye.</body>
				133	</html>"""
				134	soup = self.soup(markup)
				135	self.assertEqual(
				136	soup.encode("utf-8").replace(b"\n", b""),
				137	markup.replace(b"\n", b""))
				138
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	139	def test_processing_instruction(self):
				140	markup = b"""<?PITarget PIContent?>"""
				141	soup = self.soup(markup)
				142	self.assertEqual(markup, soup.encode("utf8"))
				143
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	144	def test_deepcopy(self):
				145	"""Make sure you can copy the tree builder.
				146
				147	This is important because the builder is part of a
				148	BeautifulSoup object, and we want to be able to copy that.
				149	"""
				150	copy.deepcopy(self.default_builder)
				151
				152	def test_p_tag_is_never_empty_element(self):
				153	"""A <p> tag is never designated as an empty-element tag.
				154
				155	Even if the markup shows it as an empty-element tag, it
				156	shouldn't be presented that way.
				157	"""
				158	soup = self.soup("<p/>")
				159	self.assertFalse(soup.p.is_empty_element)
				160	self.assertEqual(str(soup.p), "<p></p>")
				161
				162	def test_unclosed_tags_get_closed(self):
				163	"""A tag that's not closed by the end of the document should be closed.
				164
				165	This applies to all tags except empty-element tags.
				166	"""
				167	self.assertSoupEquals("<p>", "<p></p>")
				168	self.assertSoupEquals("<b>", "<b></b>")
				169
				170	self.assertSoupEquals("<br>", "<br/>")
				171
				172	def test_br_is_always_empty_element_tag(self):
				173	"""A <br> tag is designated as an empty-element tag.
				174
				175	Some parsers treat <br></br> as one <br/> tag, some parsers as
				176	two tags, but it should always be an empty-element tag.
				177	"""
				178	soup = self.soup("<br></br>")
				179	self.assertTrue(soup.br.is_empty_element)
				180	self.assertEqual(str(soup.br), "<br/>")
				181
				182	def test_nested_formatting_elements(self):
				183	self.assertSoupEquals("<em><em></em></em>")
				184
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	185	def test_double_head(self):
				186	html = '''<!DOCTYPE html>
				187	<html>
				188	<head>
				189	<title>Ordinary HEAD element test</title>
				190	</head>
				191	<script type="text/javascript">
				192	alert("Help!");
				193	</script>
				194	<body>
				195	Hello, world!
				196	</body>
				197	</html>
				198	'''
				199	soup = self.soup(html)
				200	self.assertEqual("text/javascript", soup.find('script')['type'])
				201
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	202	def test_comment(self):
				203	# Comments are represented as Comment objects.
				204	markup = "<p>foo<!--foobar-->baz</p>"
				205	self.assertSoupEquals(markup)
				206
				207	soup = self.soup(markup)
				208	comment = soup.find(text="foobar")
				209	self.assertEqual(comment.__class__, Comment)
				210
				211	# The comment is properly integrated into the tree.
				212	foo = soup.find(text="foo")
				213	self.assertEqual(comment, foo.next_element)
				214	baz = soup.find(text="baz")
				215	self.assertEqual(comment, baz.previous_element)
				216
				217	def test_preserved_whitespace_in_pre_and_textarea(self):
				218	"""Whitespace must be preserved in <pre> and <textarea> tags."""
				219	self.assertSoupEquals("<pre> </pre>")
				220	self.assertSoupEquals("<textarea> woo </textarea>")
				221
				222	def test_nested_inline_elements(self):
				223	"""Inline elements can be nested indefinitely."""
				224	b_tag = "<b>Inside a B tag</b>"
				225	self.assertSoupEquals(b_tag)
				226
				227	nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
				228	self.assertSoupEquals(nested_b_tag)
				229
				230	double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
				231	self.assertSoupEquals(nested_b_tag)
				232
				233	def test_nested_block_level_elements(self):
				234	"""Block elements can be nested."""
				235	soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
				236	blockquote = soup.blockquote
				237	self.assertEqual(blockquote.p.b.string, 'Foo')
				238	self.assertEqual(blockquote.b.string, 'Foo')
				239
				240	def test_correctly_nested_tables(self):
				241	"""One table can go inside another one."""
				242	markup = ('<table id="1">'
				243	'<tr>'
				244	"<td>Here's another table:"
				245	'<table id="2">'
				246	'<tr><td>foo</td></tr>'
				247	'</table></td>')
				248
				249	self.assertSoupEquals(
				250	markup,
				251	'<table id="1"><tr><td>Here\'s another table:'
				252	'<table id="2"><tr><td>foo</td></tr></table>'
				253	'</td></tr></table>')
				254
				255	self.assertSoupEquals(
				256	"<table><thead><tr><td>Foo</td></tr></thead>"
				257	"<tbody><tr><td>Bar</td></tr></tbody>"
				258	"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
				259
				260	def test_deeply_nested_multivalued_attribute(self):
				261	# html5lib can set the attributes of the same tag many times
				262	# as it rearranges the tree. This has caused problems with
				263	# multivalued attributes.
				264	markup = '<table><div><div class="css"></div></div></table>'
				265	soup = self.soup(markup)
				266	self.assertEqual(["css"], soup.div.div['class'])
				267
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	268	def test_multivalued_attribute_on_html(self):
				269	# html5lib uses a different API to set the attributes ot the
				270	# <html> tag. This has caused problems with multivalued
				271	# attributes.
				272	markup = '<html class="a b"></html>'
				273	soup = self.soup(markup)
				274	self.assertEqual(["a", "b"], soup.html['class'])
				275
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	276	def test_angle_brackets_in_attribute_values_are_escaped(self):
				277	self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
				278
				279	def test_entities_in_attributes_converted_to_unicode(self):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	280	expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	281	self.assertSoupEquals('<p id="piñata"></p>', expect)
				282	self.assertSoupEquals('<p id="piñata"></p>', expect)
				283	self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
				284	self.assertSoupEquals('<p id="piñata"></p>', expect)
				285
				286	def test_entities_in_text_converted_to_unicode(self):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	287	expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	288	self.assertSoupEquals("<p>piñata</p>", expect)
				289	self.assertSoupEquals("<p>piñata</p>", expect)
				290	self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
				291	self.assertSoupEquals("<p>piñata</p>", expect)
				292
				293	def test_quot_entity_converted_to_quotation_mark(self):
				294	self.assertSoupEquals("<p>I said "good day!"</p>",
				295	'<p>I said "good day!"</p>')
				296
				297	def test_out_of_range_entity(self):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	298	expect = "\N{REPLACEMENT CHARACTER}"
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	299	self.assertSoupEquals("&#10000000000000;", expect)
				300	self.assertSoupEquals("&#x10000000000000;", expect)
				301	self.assertSoupEquals("&#1000000000;", expect)
				302
				303	def test_multipart_strings(self):
				304	"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
				305	soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
				306	self.assertEqual("p", soup.h2.string.next_element.name)
				307	self.assertEqual("p", soup.p.name)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	308	self.assertConnectedness(soup)
				309
				310	def test_head_tag_between_head_and_body(self):
				311	"Prevent recurrence of a bug in the html5lib treebuilder."
				312	content = """<html><head></head>
				313	<link></link>
				314	<body>foo</body>
				315	</html>
				316	"""
				317	soup = self.soup(content)
				318	self.assertNotEqual(None, soup.html.body)
				319	self.assertConnectedness(soup)
				320
				321	def test_multiple_copies_of_a_tag(self):
				322	"Prevent recurrence of a bug in the html5lib treebuilder."
				323	content = """<!DOCTYPE html>
				324	<html>
				325	<body>
				326	<article id="a" >
				327	<div><a href="1"></div>
				328	<footer>
				329	<a href="2"></a>
				330	</footer>
				331	</article>
				332	</body>
				333	</html>
				334	"""
				335	soup = self.soup(content)
				336	self.assertConnectedness(soup.article)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	337
				338	def test_basic_namespaces(self):
				339	"""Parsers don't need to understand namespaces, but at the
				340	very least they should not choke on namespaces or lose
				341	data."""
				342
				343	markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
				344	soup = self.soup(markup)
				345	self.assertEqual(markup, soup.encode())
				346	html = soup.html
				347	self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
				348	self.assertEqual(
				349	'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
				350	self.assertEqual(
				351	'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
				352
				353	def test_multivalued_attribute_value_becomes_list(self):
				354	markup = b'<a class="foo bar">'
				355	soup = self.soup(markup)
				356	self.assertEqual(['foo', 'bar'], soup.a['class'])
				357
				358	#
				359	# Generally speaking, tests below this point are more tests of
				360	# Beautiful Soup than tests of the tree builders. But parsers are
				361	# weird, so we run these tests separately for every tree builder
				362	# to detect any differences between them.
				363	#
				364
				365	def test_can_parse_unicode_document(self):
				366	# A seemingly innocuous document... but it's in Unicode! And
				367	# it contains characters that can't be represented in the
				368	# encoding found in the declaration! The horror!
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	369	markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	370	soup = self.soup(markup)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	371	self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	372
				373	def test_soupstrainer(self):
				374	"""Parsers should be able to work with SoupStrainers."""
				375	strainer = SoupStrainer("b")
				376	soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
				377	parse_only=strainer)
				378	self.assertEqual(soup.decode(), "<b>bold</b>")
				379
				380	def test_single_quote_attribute_values_become_double_quotes(self):
				381	self.assertSoupEquals("<foo attr='bar'></foo>",
				382	'<foo attr="bar"></foo>')
				383
				384	def test_attribute_values_with_nested_quotes_are_left_alone(self):
				385	text = """<foo attr='bar "brawls" happen'>a</foo>"""
				386	self.assertSoupEquals(text)
				387
				388	def test_attribute_values_with_double_nested_quotes_get_quoted(self):
				389	text = """<foo attr='bar "brawls" happen'>a</foo>"""
				390	soup = self.soup(text)
				391	soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
				392	self.assertSoupEquals(
				393	soup.foo.decode(),
				394	"""<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""")
				395
				396	def test_ampersand_in_attribute_value_gets_escaped(self):
				397	self.assertSoupEquals('<this is="really messed up & stuff"></this>',
				398	'<this is="really messed up & stuff"></this>')
				399
				400	self.assertSoupEquals(
				401	'<a href="http://example.org?a=1&b=2;3">foo</a>',
				402	'<a href="http://example.org?a=1&b=2;3">foo</a>')
				403
				404	def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
				405	self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>')
				406
				407	def test_entities_in_strings_converted_during_parsing(self):
				408	# Both XML and HTML entities are converted to Unicode characters
				409	# during parsing.
				410	text = "<p><<sacré bleu!>></p>"
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	411	expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	412	self.assertSoupEquals(text, expected)
				413
				414	def test_smart_quotes_converted_on_the_way_in(self):
				415	# Microsoft smart quotes are converted to Unicode characters during
				416	# parsing.
				417	quote = b"<p>\x91Foo\x92</p>"
				418	soup = self.soup(quote)
				419	self.assertEqual(
				420	soup.p.string,
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	421	"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	422
				423	def test_non_breaking_spaces_converted_on_the_way_in(self):
				424	soup = self.soup("<a>  </a>")
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	425	self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	426
				427	def test_entities_converted_on_the_way_out(self):
				428	text = "<p><<sacré bleu!>></p>"
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	429	expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	430	soup = self.soup(text)
				431	self.assertEqual(soup.p.encode("utf-8"), expected)
				432
				433	def test_real_iso_latin_document(self):
				434	# Smoke test of interrelated functionality, using an
				435	# easy-to-understand document.
				436
				437	# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	438	unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	439
				440	# That's because we're going to encode it into ISO-Latin-1, and use
				441	# that to test.
				442	iso_latin_html = unicode_html.encode("iso-8859-1")
				443
				444	# Parse the ISO-Latin-1 HTML.
				445	soup = self.soup(iso_latin_html)
				446	# Encode it to UTF-8.
				447	result = soup.encode("utf-8")
				448
				449	# What do we expect the result to look like? Well, it would
				450	# look like unicode_html, except that the META tag would say
				451	# UTF-8 instead of ISO-Latin-1.
				452	expected = unicode_html.replace("ISO-Latin-1", "utf-8")
				453
				454	# And, of course, it would be in UTF-8, not Unicode.
				455	expected = expected.encode("utf-8")
				456
				457	# Ta-da!
				458	self.assertEqual(result, expected)
				459
				460	def test_real_shift_jis_document(self):
				461	# Smoke test to make sure the parser can handle a document in
				462	# Shift-JIS encoding, without choking.
				463	shift_jis_html = (
				464	b'<html><head></head><body><pre>'
				465	b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
				466	b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
				467	b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
				468	b'</pre></body></html>')
				469	unicode_html = shift_jis_html.decode("shift-jis")
				470	soup = self.soup(unicode_html)
				471
				472	# Make sure the parse tree is correctly encoded to various
				473	# encodings.
				474	self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
				475	self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
				476
				477	def test_real_hebrew_document(self):
				478	# A real-world test to make sure we can convert ISO-8859-9 (a
				479	# Hebrew encoding) to UTF-8.
				480	hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
				481	soup = self.soup(
				482	hebrew_document, from_encoding="iso8859-8")
				483	self.assertEqual(soup.original_encoding, 'iso8859-8')
				484	self.assertEqual(
				485	soup.encode('utf-8'),
				486	hebrew_document.decode("iso8859-8").encode("utf-8"))
				487
				488	def test_meta_tag_reflects_current_encoding(self):
				489	# Here's the <meta> tag saying that a document is
				490	# encoded in Shift-JIS.
				491	meta_tag = ('<meta content="text/html; charset=x-sjis" '
				492	'http-equiv="Content-type"/>')
				493
				494	# Here's a document incorporating that meta tag.
				495	shift_jis_html = (
				496	'<html><head>\n%s\n'
				497	'<meta http-equiv="Content-language" content="ja"/>'
				498	'</head><body>Shift-JIS markup goes here.') % meta_tag
				499	soup = self.soup(shift_jis_html)
				500
				501	# Parse the document, and the charset is seemingly unaffected.
				502	parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
				503	content = parsed_meta['content']
				504	self.assertEqual('text/html; charset=x-sjis', content)
				505
				506	# But that value is actually a ContentMetaAttributeValue object.
				507	self.assertTrue(isinstance(content, ContentMetaAttributeValue))
				508
				509	# And it will take on a value that reflects its current
				510	# encoding.
				511	self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
				512
				513	# For the rest of the story, see TestSubstitutions in
				514	# test_tree.py.
				515
				516	def test_html5_style_meta_tag_reflects_current_encoding(self):
				517	# Here's the <meta> tag saying that a document is
				518	# encoded in Shift-JIS.
				519	meta_tag = ('<meta id="encoding" charset="x-sjis" />')
				520
				521	# Here's a document incorporating that meta tag.
				522	shift_jis_html = (
				523	'<html><head>\n%s\n'
				524	'<meta http-equiv="Content-language" content="ja"/>'
				525	'</head><body>Shift-JIS markup goes here.') % meta_tag
				526	soup = self.soup(shift_jis_html)
				527
				528	# Parse the document, and the charset is seemingly unaffected.
				529	parsed_meta = soup.find('meta', id="encoding")
				530	charset = parsed_meta['charset']
				531	self.assertEqual('x-sjis', charset)
				532
				533	# But that value is actually a CharsetMetaAttributeValue object.
				534	self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
				535
				536	# And it will take on a value that reflects its current
				537	# encoding.
				538	self.assertEqual('utf8', charset.encode("utf8"))
				539
				540	def test_tag_with_no_attributes_can_have_attributes_added(self):
				541	data = self.soup("<a>text</a>")
				542	data.a['foo'] = 'bar'
				543	self.assertEqual('<a foo="bar">text</a>', data.a.decode())
				544
				545	class XMLTreeBuilderSmokeTest(object):
				546
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	547	def test_pickle_and_unpickle_identity(self):
				548	# Pickling a tree, then unpickling it, yields a tree identical
				549	# to the original.
				550	tree = self.soup("<a><b>foo</a>")
				551	dumped = pickle.dumps(tree, 2)
				552	loaded = pickle.loads(dumped)
				553	self.assertEqual(loaded.__class__, BeautifulSoup)
				554	self.assertEqual(loaded.decode(), tree.decode())
				555
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	556	def test_docstring_generated(self):
				557	soup = self.soup("<root/>")
				558	self.assertEqual(
				559	soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
				560
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	561	def test_xml_declaration(self):
				562	markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
				563	soup = self.soup(markup)
				564	self.assertEqual(markup, soup.encode("utf8"))
				565
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	566	def test_real_xhtml_document(self):
				567	"""A real XHTML document should come out exactly the same as it went in."""
				568	markup = b"""<?xml version="1.0" encoding="utf-8"?>
				569	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
				570	<html xmlns="http://www.w3.org/1999/xhtml">
				571	<head><title>Hello.</title></head>
				572	<body>Goodbye.</body>
				573	</html>"""
				574	soup = self.soup(markup)
				575	self.assertEqual(
				576	soup.encode("utf-8"), markup)
				577
				578	def test_formatter_processes_script_tag_for_xml_documents(self):
				579	doc = """
				580	<script type="text/javascript">
				581	</script>
				582	"""
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	583	soup = BeautifulSoup(doc, "lxml-xml")
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	584	# lxml would have stripped this while parsing, but we can add
				585	# it later.
				586	soup.script.string = 'console.log("< < hey > > ");'
				587	encoded = soup.encode()
				588	self.assertTrue(b"< < hey > >" in encoded)
				589
				590	def test_can_parse_unicode_document(self):
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	591	markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	592	soup = self.soup(markup)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	593	self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	594
				595	def test_popping_namespaced_tag(self):
				596	markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
				597	soup = self.soup(markup)
				598	self.assertEqual(
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	599	str(soup.rss), markup)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	600
				601	def test_docstring_includes_correct_encoding(self):
				602	soup = self.soup("<root/>")
				603	self.assertEqual(
				604	soup.encode("latin1"),
				605	b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
				606
				607	def test_large_xml_document(self):
				608	"""A large XML document should come out the same as it went in."""
				609	markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
				610	+ b'0' * (2**12)
				611	+ b'</root>')
				612	soup = self.soup(markup)
				613	self.assertEqual(soup.encode("utf-8"), markup)
				614
				615
				616	def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
				617	self.assertSoupEquals("<p>", "<p/>")
				618	self.assertSoupEquals("<p>foo</p>")
				619
				620	def test_namespaces_are_preserved(self):
				621	markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
				622	soup = self.soup(markup)
				623	root = soup.root
				624	self.assertEqual("http://example.com/", root['xmlns:a'])
				625	self.assertEqual("http://example.net/", root['xmlns:b'])
				626
				627	def test_closing_namespaced_tag(self):
				628	markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
				629	soup = self.soup(markup)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	630	self.assertEqual(str(soup.p), markup)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	631
				632	def test_namespaced_attributes(self):
				633	markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
				634	soup = self.soup(markup)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	635	self.assertEqual(str(soup.foo), markup)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	636
				637	def test_namespaced_attributes_xml_namespace(self):
				638	markup = '<foo xml:lang="fr">bar</foo>'
				639	soup = self.soup(markup)
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	640	self.assertEqual(str(soup.foo), markup)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	641
				642	class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
				643	"""Smoke test for a tree builder that supports HTML5."""
				644
				645	def test_real_xhtml_document(self):
				646	# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
				647	# XHTML documents in any particular way.
				648	pass
				649
				650	def test_html_tags_have_namespace(self):
				651	markup = "<a>"
				652	soup = self.soup(markup)
				653	self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
				654
				655	def test_svg_tags_have_namespace(self):
				656	markup = '<svg><circle/></svg>'
				657	soup = self.soup(markup)
				658	namespace = "http://www.w3.org/2000/svg"
				659	self.assertEqual(namespace, soup.svg.namespace)
				660	self.assertEqual(namespace, soup.circle.namespace)
				661
				662
				663	def test_mathml_tags_have_namespace(self):
				664	markup = '<math><msqrt>5</msqrt></math>'
				665	soup = self.soup(markup)
				666	namespace = 'http://www.w3.org/1998/Math/MathML'
				667	self.assertEqual(namespace, soup.math.namespace)
				668	self.assertEqual(namespace, soup.msqrt.namespace)
				669
				670	def test_xml_declaration_becomes_comment(self):
				671	markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
				672	soup = self.soup(markup)
				673	self.assertTrue(isinstance(soup.contents[0], Comment))
				674	self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
				675	self.assertEqual("html", soup.contents[0].next_element.name)
				676
				677	def skipIf(condition, reason):
				678	def nothing(test, args, *kwargs):
				679	return None
				680
				681	def decorator(test_item):
				682	if condition:
				683	return nothing
				684	else:
				685	return test_item
				686
				687	return decorator