Blame - poky/bitbake/lib/bs4/testing.py - mdmillerii/openbmc

blob: 6584ecf303e94085c8a864308e8c9b59fb891d70 [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	"""Helper classes for tests."""
				2
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	3	__license__ = "MIT"
				4
				5	import pickle
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	6	import copy
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	7	import unittest
				8	from unittest import TestCase
				9	from bs4 import BeautifulSoup
				10	from bs4.element import (
				11	CharsetMetaAttributeValue,
				12	Comment,
				13	ContentMetaAttributeValue,
				14	Doctype,
				15	SoupStrainer,
				16	)
				17
Andrew Geissler	4b740dc	2020-05-05 08:54:39 -0500	[diff] [blame]	18	from bs4.builder._htmlparser import HTMLParserTreeBuilder
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	19	default_builder = HTMLParserTreeBuilder
				20
				21
				22	class SoupTest(unittest.TestCase):
				23
				24	@property
				25	def default_builder(self):
				26	return default_builder()
				27
				28	def soup(self, markup, **kwargs):
				29	"""Build a Beautiful Soup object from markup."""
				30	builder = kwargs.pop('builder', self.default_builder)
				31	return BeautifulSoup(markup, builder=builder, **kwargs)
				32
				33	def document_for(self, markup):
				34	"""Turn an HTML fragment into a document.
				35
				36	The details depend on the builder.
				37	"""
				38	return self.default_builder.test_fragment_to_document(markup)
				39
				40	def assertSoupEquals(self, to_parse, compare_parsed_to=None):
				41	builder = self.default_builder
				42	obj = BeautifulSoup(to_parse, builder=builder)
				43	if compare_parsed_to is None:
				44	compare_parsed_to = to_parse
				45
				46	self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
				47
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	48	def assertConnectedness(self, element):
				49	"""Ensure that next_element and previous_element are properly
				50	set for all descendants of the given element.
				51	"""
				52	earlier = None
				53	for e in element.descendants:
				54	if earlier:
				55	self.assertEqual(e, earlier.next_element)
				56	self.assertEqual(earlier, e.previous_element)
				57	earlier = e
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	58
class HTMLTreeBuilderSmokeTest(SoupTest):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype.

        Returns a ``(doctype_string, soup)`` tuple.
        """
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Newlines are stripped from both sides before comparing.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_processing_instruction(self):
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_double_head(self):
        html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
        soup = self.soup(html)
        self.assertEqual("text/javascript", soup.find('script')['type'])

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre>   </pre>")
        self.assertSoupEquals("<textarea> woo  </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        # Check the doubly nested markup itself; re-checking
        # nested_b_tag here would leave this case untested.
        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_multivalued_attribute_on_html(self):
        # html5lib uses a different API to set the attributes of the
        # <html> tag. This has caused problems with multivalued
        # attributes.
        markup = '<html class="a b"></html>'
        soup = self.soup(markup)
        self.assertEqual(["a", "b"], soup.html['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        # Raw angle brackets go in; escaped ones must come out.
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        # Decimal, hexadecimal (lower- and upper-case "x") and named
        # references must all decode to the same character.
        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        # Same set of reference styles as the attribute test, but in
        # text content.
        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        expect = "\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
        self.assertConnectedness(soup)

    def test_head_tag_between_head_and_body(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<html><head></head>
<link></link>
<body>foo</body>
</html>
"""
        soup = self.soup(content)
        self.assertNotEqual(None, soup.html.body)
        self.assertConnectedness(soup)

    def test_multiple_copies_of_a_tag(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<!DOCTYPE html>
<html>
<body>
<article id="a" >
<div><a href="1"></div>
<footer>
<a href="2"></a>
</footer>
</article>
</body>
</html>
"""
        soup = self.soup(content)
        self.assertConnectedness(soup.article)

    def test_basic_namespaces(self):
        """Parsers don't need to understand namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        # The new value contains both quote styles, so the double
        # quotes are expected to come out as &quot; references.
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
				543
class XMLTreeBuilderSmokeTest(SoupTest):
    """Smoke tests for a tree builder that parses XML."""

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def test_docstring_generated(self):
        # Encoding a document parsed without a declaration still emits
        # an XML declaration.
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        self.assertEqual(
            soup.encode("utf-8"), markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
<script type="text/javascript">
</script>
"""
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        encoded = soup.encode()
        # The angle brackets must come out as entity references:
        # checking for the raw characters would not show that the
        # formatter processed the script content.
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)

    def test_can_parse_unicode_document(self):
        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.root.string)

    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
        self.assertEqual(
            str(soup.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode("latin1"),
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + b'0' * (2**12)
                  + b'</root>')
        soup = self.soup(markup)
        self.assertEqual(soup.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        soup = self.soup(markup)
        root = soup.root
        self.assertEqual("http://example.com/", root['xmlns:a'])
        self.assertEqual("http://example.net/", root['xmlns:b'])

    def test_closing_namespaced_tag(self):
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.p), markup)

    def test_namespaced_attributes(self):
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        markup = '<foo xml:lang="fr">bar</foo>'
        soup = self.soup(markup)
        self.assertEqual(str(soup.foo), markup)
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	640
				641	class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
				642	"""Smoke test for a tree builder that supports HTML5."""
				643
				644	def test_real_xhtml_document(self):
				645	# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
				646	# XHTML documents in any particular way.
				647	pass
				648
				649	def test_html_tags_have_namespace(self):
				650	markup = "<a>"
				651	soup = self.soup(markup)
				652	self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
				653
				654	def test_svg_tags_have_namespace(self):
				655	markup = '<svg><circle/></svg>'
				656	soup = self.soup(markup)
				657	namespace = "http://www.w3.org/2000/svg"
				658	self.assertEqual(namespace, soup.svg.namespace)
				659	self.assertEqual(namespace, soup.circle.namespace)
				660
				661
				662	def test_mathml_tags_have_namespace(self):
				663	markup = '<math><msqrt>5</msqrt></math>'
				664	soup = self.soup(markup)
				665	namespace = 'http://www.w3.org/1998/Math/MathML'
				666	self.assertEqual(namespace, soup.math.namespace)
				667	self.assertEqual(namespace, soup.msqrt.namespace)
				668
				669	def test_xml_declaration_becomes_comment(self):
				670	markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
				671	soup = self.soup(markup)
				672	self.assertTrue(isinstance(soup.contents[0], Comment))
				673	self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
				674	self.assertEqual("html", soup.contents[0].next_element.name)
				675
				676	def skipIf(condition, reason):
				677	def nothing(test, args, *kwargs):
				678	return None
				679
				680	def decorator(test_item):
				681	if condition:
				682	return nothing
				683	else:
				684	return test_item
				685
				686	return decorator