blob: 3a2f260e24ee6ac4fcf6e6ab372eb1e7ced5d994 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001"""Helper classes for tests."""
2
Patrick Williamsc0f7c042017-02-23 20:41:17 -06003__license__ = "MIT"
4
5import pickle
Patrick Williamsc124f4f2015-09-15 14:41:29 -05006import copy
7import functools
8import unittest
9from unittest import TestCase
10from bs4 import BeautifulSoup
11from bs4.element import (
12 CharsetMetaAttributeValue,
13 Comment,
14 ContentMetaAttributeValue,
15 Doctype,
16 SoupStrainer,
17)
18
19from bs4.builder import HTMLParserTreeBuilder
# Builder class that SoupTest instantiates whenever a test does not pass
# its own `builder` argument.
default_builder = HTMLParserTreeBuilder
21
22
class SoupTest(unittest.TestCase):
    """Base TestCase offering helpers to parse markup and inspect the tree."""

    @property
    def default_builder(self):
        """Return a new instance of the module-level default builder class."""
        return default_builder()

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup.

        A `builder` keyword argument overrides the default builder; every
        other keyword argument is passed through to BeautifulSoup.
        """
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup):
        """Turn an HTML fragment into a document.

        The details depend on the builder.
        """
        return self.default_builder.test_fragment_to_document(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Parse `to_parse` and assert that it decodes to the expected document.

        When `compare_parsed_to` is None the markup is expected to survive
        the round trip unchanged.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
        """
        earlier = None
        for e in element.descendants:
            # Compare against None explicitly: a falsy descendant (such as
            # an empty string node) must still have its links verified.
            if earlier is not None:
                self.assertEqual(e, earlier.next_element)
                self.assertEqual(earlier, e.previous_element)
            earlier = e
Patrick Williamsc124f4f2015-09-15 14:41:29 -050059
class HTMLTreeBuilderSmokeTest(object):

    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.

    This class is a mixin: its methods call self.soup(),
    self.assertSoupEquals(), self.assertConnectedness() and the
    unittest assertion methods, so it must be combined with a class
    that provides them.
    """

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        tree = self.soup("<a><b>foo</a>")
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.__class__, BeautifulSoup)
        self.assertEqual(loaded.decode(), tree.decode())

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype.

        Returns a (doctype declaration string, parsed soup) tuple.
        """
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Newlines are stripped from both sides, so only the non-newline
        # content has to match.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_processing_instruction(self):
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode("utf8"))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

    def test_double_head(self):
        html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
        soup = self.soup(html)
        self.assertEqual("text/javascript", soup.find('script')['type'])

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        # Check the doubly-nested markup itself rather than repeating
        # the singly-nested check.
        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_multivalued_attribute_on_html(self):
        # html5lib uses a different API to set the attributes of the
        # <html> tag. This has caused problems with multivalued
        # attributes.
        markup = '<html class="a b"></html>'
        soup = self.soup(markup)
        self.assertEqual(["a", "b"], soup.html['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        expect = "\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)
        self.assertConnectedness(soup)

    def test_head_tag_between_head_and_body(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<html><head></head>
 <link></link>
 <body>foo</body>
</html>
"""
        soup = self.soup(content)
        self.assertIsNotNone(soup.html.body)
        self.assertConnectedness(soup)

    def test_multiple_copies_of_a_tag(self):
        "Prevent recurrence of a bug in the html5lib treebuilder."
        content = """<!DOCTYPE html>
<html>
 <body>
 <article id="a" >
 <div><a href="1"></div>
 <footer>
 <a href="2"></a>
 </footer>
 </article>
 </body>
</html>
"""
        soup = self.soup(content)
        self.assertConnectedness(soup.article)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertIsInstance(content, ContentMetaAttributeValue)

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertIsInstance(charset, CharsetMetaAttributeValue)

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
544
class XMLTreeBuilderSmokeTest(object):
    """Smoke tests that any XML tree builder is expected to pass.

    Written as a mixin: the methods rely on self.soup(),
    self.assertSoupEquals() and the unittest assertion methods being
    supplied by the class it is combined with.
    """

    def test_pickle_and_unpickle_identity(self):
        """A tree survives a pickle round trip unchanged."""
        original = self.soup("<a><b>foo</a>")
        restored = pickle.loads(pickle.dumps(original, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), original.decode())

    def test_docstring_generated(self):
        """Encoding a document emits an XML declaration."""
        encoded = self.soup("<root/>").encode()
        self.assertEqual(
            encoded, b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_xml_declaration(self):
        """An existing XML declaration is reproduced on output."""
        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
        parsed = self.soup(markup)
        self.assertEqual(markup, parsed.encode("utf8"))

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        round_tripped = self.soup(markup).encode("utf-8")
        self.assertEqual(round_tripped, markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        """Script contents are entity-escaped when the document is XML."""
        doc = """
 <script type="text/javascript">
 </script>
"""
        soup = BeautifulSoup(doc, "lxml-xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        soup.script.string = 'console.log("< < hey > > ");'
        self.assertIn(b"&lt; &lt; hey &gt; &gt;", soup.encode())

    def test_can_parse_unicode_document(self):
        """A Unicode document parses despite its declared encoding."""
        markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        parsed = self.soup(markup)
        self.assertEqual('Sacr\xe9 bleu!', parsed.root.string)

    def test_popping_namespaced_tag(self):
        """Namespaced child tags are closed without losing siblings."""
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        rss = self.soup(markup).rss
        self.assertEqual(str(rss), markup)

    def test_docstring_includes_correct_encoding(self):
        """The generated declaration names the encoding that was requested."""
        encoded = self.soup("<root/>").encode("latin1")
        self.assertEqual(
            encoded,
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        filler = b'0' * (2**12)
        markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
                  + filler
                  + b'</root>')
        self.assertEqual(self.soup(markup).encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        """Empty tags become empty-element tags; tags with content do not."""
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        """Namespace declarations stay available as attributes of the root."""
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        root_tag = self.soup(markup).root
        self.assertEqual("http://example.com/", root_tag['xmlns:a'])
        self.assertEqual("http://example.net/", root_tag['xmlns:b'])

    def test_closing_namespaced_tag(self):
        """A namespaced tag is closed properly."""
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        paragraph = self.soup(markup).p
        self.assertEqual(str(paragraph), markup)

    def test_namespaced_attributes(self):
        """Namespaced attributes are reproduced on output."""
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        foo_tag = self.soup(markup).foo
        self.assertEqual(str(foo_tag), markup)

    def test_namespaced_attributes_xml_namespace(self):
        """Attributes in the built-in xml: namespace are reproduced on output."""
        markup = '<foo xml:lang="fr">bar</foo>'
        foo_tag = self.soup(markup).foo
        self.assertEqual(str(foo_tag), markup)
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500641
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        # Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        # XHTML documents in any particular way, so the inherited test is
        # overridden with a no-op.
        return None

    def test_html_tags_have_namespace(self):
        """Plain HTML tags are placed in the XHTML namespace."""
        parsed = self.soup("<a>")
        self.assertEqual("http://www.w3.org/1999/xhtml", parsed.a.namespace)

    def test_svg_tags_have_namespace(self):
        """<svg> and its children are placed in the SVG namespace."""
        svg_ns = "http://www.w3.org/2000/svg"
        parsed = self.soup('<svg><circle/></svg>')
        self.assertEqual(svg_ns, parsed.svg.namespace)
        self.assertEqual(svg_ns, parsed.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        """<math> and its children are placed in the MathML namespace."""
        mathml_ns = 'http://www.w3.org/1998/Math/MathML'
        parsed = self.soup('<math><msqrt>5</msqrt></math>')
        self.assertEqual(mathml_ns, parsed.math.namespace)
        self.assertEqual(mathml_ns, parsed.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        """An XML declaration in front of HTML is kept as a Comment node."""
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        parsed = self.soup(markup)
        self.assertTrue(isinstance(parsed.contents[0], Comment))
        self.assertEqual(
            parsed.contents[0], '?xml version="1.0" encoding="utf-8"?')
        self.assertEqual("html", parsed.contents[0].next_element.name)
676
def skipIf(condition, reason):
    """Return a decorator that disables a test when `condition` is true.

    If `condition` is true the decorated item is replaced by a stub that
    accepts the test instance plus any extra arguments and returns None;
    otherwise the item is returned untouched. `reason` is accepted but
    not used by this implementation.
    """
    def nothing(test, *args, **kwargs):
        # Stand-in that runs in place of a skipped test.
        return None

    def decorator(test_item):
        return nothing if condition else test_item

    return decorator