blob: 6584ecf303e94085c8a864308e8c9b59fb891d70 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001"""Helper classes for tests."""
2
Patrick Williamsc0f7c042017-02-23 20:41:17 -06003__license__ = "MIT"
4
5import pickle
Patrick Williamsc124f4f2015-09-15 14:41:29 -05006import copy
Patrick Williamsc124f4f2015-09-15 14:41:29 -05007import unittest
8from unittest import TestCase
9from bs4 import BeautifulSoup
10from bs4.element import (
11 CharsetMetaAttributeValue,
12 Comment,
13 ContentMetaAttributeValue,
14 Doctype,
15 SoupStrainer,
16)
17
Andrew Geissler4b740dc2020-05-05 08:54:39 -050018from bs4.builder._htmlparser import HTMLParserTreeBuilder
# Tree builder class that SoupTest.default_builder instantiates on each access.
default_builder = HTMLParserTreeBuilder
20
21
class SoupTest(unittest.TestCase):
    """Base test case with helpers for parsing markup and checking trees."""

    @property
    def default_builder(self):
        """Return a new instance of the module-level ``default_builder`` class."""
        return default_builder()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup.

        A ``builder`` keyword argument replaces ``self.default_builder``;
        every other keyword argument is forwarded to ``BeautifulSoup``.
        """
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that parsing ``to_parse`` yields the expected document.

        The markup is parsed with ``self.default_builder`` and its decoded
        form is compared against ``document_for(compare_parsed_to)``. When
        ``compare_parsed_to`` is None, ``to_parse`` itself is the expectation.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
        """
        earlier = None
        for e in element.descendants:
            # Compare against None explicitly: a descendant that evaluates
            # as false must not cause the linkage check to be skipped.
            if earlier is not None:
                self.assertEqual(e, earlier.next_element)
                self.assertEqual(earlier, e.previous_element)
            earlier = e
Patrick Williamsc124f4f2015-09-15 14:41:29 -050058
class HTMLTreeBuilderSmokeTest(SoupTest):
    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.
    """

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype.

        Returns a ``(doctype_declaration, soup)`` tuple.
        """
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        # An empty doctype declaration still yields a first node, and
        # that node is empty once stripped.
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        # A PUBLIC doctype that also carries a DTD URL.
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        # A doctype with a SYSTEM identifier.
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Newlines are removed from both sides before comparing.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_processing_instruction(self):
        # A processing instruction survives a parse/encode round trip.
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        # Nested <em> tags come out exactly as they went in.
        self.assertSoupEquals("<em><em></em></em>")

    def test_double_head(self):
        # A <script> placed between </head> and <body> must still be
        # findable with its attributes intact.
        html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
        soup = self.soup(html)
        self.assertEqual("text/javascript", soup.find('script')['type'])

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        # Check the doubly-nested markup itself, not the singly-nested
        # markup a second time.
        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_multivalued_attribute_on_html(self):
        # html5lib uses a different API to set the attributes of the
        # <html> tag. This has caused problems with multivalued
        # attributes.
        markup = '<html class="a b"></html>'
        soup = self.soup(markup)
        self.assertEqual(["a", "b"], soup.html['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        # Literal < and > inside an attribute value come out as entities.
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        # Decimal, hexadecimal (lower- and upper-case x) and named
        # entities in an attribute all become the same character.
        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        # The same four entity spellings, this time in text content.
        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        # &quot; in text is output as a literal double quote.
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        # Numeric entities too large to be a character become U+FFFD.
        expect = "\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
        self.assertConnectedness(soup)

    def test_head_tag_between_head_and_body(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<html><head></head>
 <link></link>
 <body>foo</body>
</html>
"""
        soup = self.soup(content)
        self.assertIsNotNone(soup.html.body)
        self.assertConnectedness(soup)

    def test_multiple_copies_of_a_tag(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<!DOCTYPE html>
<html>
 <body>
 <article id="a" >
 <div><a href="1"></div>
 <footer>
 <a href="2"></a>
 </footer>
 </article>
 </body>
</html>
"""
        soup = self.soup(content)
        self.assertConnectedness(soup.article)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        # A space-separated class attribute is exposed as a list.
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        # Single-quoted attribute values are output with double quotes.
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        # A single-quoted value containing double quotes is unchanged.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        # A value holding both quote styles is output in double quotes
        # with the inner double quotes turned into &quot;.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        # A bare ampersand in an attribute value is output as &amp;.
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        # An ampersand that is already escaped is not escaped again.
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        # &nbsp; entities become NO-BREAK SPACE characters in the tree.
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        # Encoded to UTF-8, the paragraph carries the accented letter as
        # a literal character while the angle brackets stay escaped.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertIsInstance(content, ContentMetaAttributeValue)

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertIsInstance(charset, CharsetMetaAttributeValue)

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        # Assigning an attribute to a tag parsed without any attributes
        # shows up in the tag's output.
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
543
class XMLTreeBuilderSmokeTest(SoupTest):
    """Smoke tests exercising a tree builder with XML markup."""

    def test_pickle_and_unpickle_identity(self):
        # A pickle round trip must give back a BeautifulSoup object that
        # serializes the same way as the tree it came from.
        original = self.soup("<a><b>foo</a>")
        restored = pickle.loads(pickle.dumps(original, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), original.decode())

    def test_docstring_generated(self):
        # Encoding a bare document prepends an XML declaration.
        encoded = self.soup("<root/>").encode()
        self.assertEqual(
            encoded, b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        # An existing XML declaration is kept as written.
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        encoded = self.soup(markup).encode("utf8")
        self.assertEqual(markup, encoded)

    def test_real_xhtml_document(self):
        """An XHTML document must be byte-for-byte unchanged after a round trip."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        round_tripped = self.soup(markup).encode("utf-8")
        self.assertEqual(round_tripped, markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        doc = """
 <script type="text/javascript">
 </script>
"""
        # NOTE: this test names the "lxml-xml" parser directly instead of
        # going through self.soup().
        parsed = BeautifulSoup(doc, "lxml-xml")
        # The script body is assigned after parsing (lxml would have
        # stripped it during the parse).
        parsed.script.string = 'console.log("< < hey > > ");'
        output = parsed.encode()
        # The angle brackets in the script text must come out escaped.
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in output)

    def test_can_parse_unicode_document(self):
        # A Unicode string is parsed as-is even though its declaration
        # names a different encoding.
        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        root_text = self.soup(markup).root.string
        self.assertEqual('Sacr\xe9 bleu!', root_text)

    def test_popping_namespaced_tag(self):
        # Several namespaced siblings followed by a plain tag must
        # serialize back to the input.
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        parsed = self.soup(markup)
        self.assertEqual(str(parsed.rss), markup)

    def test_docstring_includes_correct_encoding(self):
        # The generated declaration names the encoding that was asked for.
        encoded = self.soup("<root/>").encode("latin1")
        self.assertEqual(
            encoded,
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A document with a long text payload round-trips unchanged."""
        payload = b'0' * (2**12)
        markup = b'<?xml version="1.0" encoding="utf-8"?>\n<root>' + payload + b'</root>'
        parsed = self.soup(markup)
        self.assertEqual(parsed.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        # A tag with no content is output as an empty-element tag...
        self.assertSoupEquals("<p>", "<p/>")
        # ...and a tag with content is output unchanged.
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        # Namespace declarations on the root stay readable as attributes.
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        root_tag = self.soup(markup).root
        self.assertEqual("http://example.com/", root_tag['xmlns:a'])
        self.assertEqual("http://example.net/", root_tag['xmlns:b'])

    def test_closing_namespaced_tag(self):
        # A namespaced child tag is closed with its prefixed name.
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        parsed = self.soup(markup)
        self.assertEqual(str(parsed.p), markup)

    def test_namespaced_attributes(self):
        # A prefixed attribute keeps its prefix on output.
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        parsed = self.soup(markup)
        self.assertEqual(str(parsed.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        # The xml: prefix works without a namespace declaration.
        markup = '<foo xml:lang="fr">bar</foo>'
        parsed = self.soup(markup)
        self.assertEqual(str(parsed.foo), markup)
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500640
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # XHTML is not HTML5, so the inherited round-trip test is
        # replaced with a no-op for HTML5 parsers.
        return None

    def test_html_tags_have_namespace(self):
        # An ordinary HTML tag reports the XHTML namespace.
        parsed = self.soup("<a>")
        self.assertEqual("http://www.w3.org/1999/xhtml", parsed.a.namespace)

    def test_svg_tags_have_namespace(self):
        # <svg> and the element nested in it both report the SVG namespace.
        svg_ns = "http://www.w3.org/2000/svg"
        parsed = self.soup('<svg><circle/></svg>')
        self.assertEqual(svg_ns, parsed.svg.namespace)
        self.assertEqual(svg_ns, parsed.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        # <math> and the element nested in it both report the MathML namespace.
        mathml_ns = 'http://www.w3.org/1998/Math/MathML'
        parsed = self.soup('<math><msqrt>5</msqrt></math>')
        self.assertEqual(mathml_ns, parsed.math.namespace)
        self.assertEqual(mathml_ns, parsed.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        # A leading XML declaration is stored as a Comment node, and the
        # <html> tag follows it in the tree.
        parsed = self.soup(
            '<?xml version="1.0" encoding="utf-8"?><html></html>')
        self.assertTrue(isinstance(parsed.contents[0], Comment))
        self.assertEqual(
            parsed.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", parsed.contents[0].next_element.name)
675
def skipIf(condition, reason):
    """Return a decorator that disables a test when ``condition`` is true.

    When ``condition`` is false the decorated item is returned untouched.
    When it is true the item is replaced by a stub that accepts the test
    instance plus any further arguments and returns None. The stub keeps
    the original item's name and docstring.

    ``reason`` is accepted so that calls have the same shape as
    ``unittest.skipIf``, but this implementation does not use it.
    """
    import functools

    def decorator(test_item):
        if not condition:
            return test_item

        # A fresh stub per decorated item, so each one can carry the
        # metadata (__name__, __doc__, ...) of the test it replaces.
        @functools.wraps(test_item)
        def nothing(test, *args, **kwargs):
            return None

        return nothing

    return decorator