"""Helper classes for tests."""
2
3import copy
4import functools
5import unittest
6from unittest import TestCase
7from bs4 import BeautifulSoup
8from bs4.element import (
9 CharsetMetaAttributeValue,
10 Comment,
11 ContentMetaAttributeValue,
12 Doctype,
13 SoupStrainer,
14)
15
16from bs4.builder import HTMLParserTreeBuilder
17default_builder = HTMLParserTreeBuilder
18
19
class SoupTest(unittest.TestCase):
    """TestCase base class with helpers for parsing markup in tests."""

    @property
    def default_builder(self):
        """Instantiate and return the module-level ``default_builder``.

        A new builder object is created on every access.
        """
        return default_builder()

    def soup(self, markup, **kwargs):
        """Parse ``markup`` into a BeautifulSoup object.

        A ``builder`` keyword argument overrides the default builder;
        every other keyword argument is forwarded to ``BeautifulSoup``.
        """
        # The default is looked up before the pop, exactly as a
        # ``dict.pop(key, default)`` call would evaluate it.
        fallback = self.default_builder
        chosen = kwargs.pop('builder', fallback)
        return BeautifulSoup(markup, builder=chosen, **kwargs)

    def document_for(self, markup):
        """Wrap an HTML fragment the way the default builder would.

        The work is delegated to the builder's
        ``test_fragment_to_document`` method, so the result depends on
        the builder.
        """
        fragment_wrapper = self.default_builder.test_fragment_to_document
        return fragment_wrapper(markup)

    def assertSoupEquals(self, to_parse, compare_parsed_to=None):
        """Assert that ``to_parse`` parses and re-serializes as expected.

        ``to_parse`` is parsed with the default builder and decoded; the
        result must equal ``document_for(compare_parsed_to)``. When
        ``compare_parsed_to`` is None, the markup is expected to come
        back out unchanged.
        """
        parsed = BeautifulSoup(to_parse, builder=self.default_builder)
        expected_markup = (
            to_parse if compare_parsed_to is None else compare_parsed_to)
        self.assertEqual(parsed.decode(), self.document_for(expected_markup))
45
46
class HTMLTreeBuilderSmokeTest(object):
    """A basic test of a treebuilder's competence.

    Any HTML treebuilder, present or future, should be able to pass
    these tests. With invalid markup, there's room for interpretation,
    and different parsers can handle it differently. But with the
    markup in these tests, there's not much room for interpretation.

    This class is a mixin: its methods call ``self.soup``,
    ``self.assertSoupEquals``, ``self.default_builder`` and the usual
    ``unittest`` assertion methods, none of which are defined here, so
    it must be combined with a class that provides them.
    """

    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)

        # Make sure a Doctype object was created.
        doctype = soup.contents[0]
        self.assertEqual(doctype.__class__, Doctype)
        self.assertEqual(doctype, doctype_fragment)
        self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        self.assertEqual(soup.p.contents[0], 'foo')

    def _document_with_doctype(self, doctype_fragment):
        """Generate and parse a document with the given doctype.

        Returns a ``(doctype declaration string, parsed soup)`` tuple.
        """
        doctype = '<!DOCTYPE %s>' % doctype_fragment
        markup = doctype + '\n<p>foo</p>'
        soup = self.soup(markup)
        return doctype, soup

    def test_normal_doctypes(self):
        """Make sure normal, everyday HTML doctypes are handled correctly."""
        self.assertDoctypeHandled("html")
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

    def test_empty_doctype(self):
        """An empty doctype declaration yields an empty doctype string."""
        soup = self.soup("<!DOCTYPE>")
        doctype = soup.contents[0]
        self.assertEqual("", doctype.strip())

    def test_public_doctype_with_url(self):
        """A PUBLIC doctype that also carries a URL is handled."""
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)

    def test_system_doctype(self):
        """A SYSTEM doctype is handled."""
        self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')

    def test_namespaced_system_doctype(self):
        # We can handle a namespaced doctype with a system ID.
        self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')

    def test_namespaced_public_doctype(self):
        # Test a namespaced doctype with a public id.
        self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out more or less the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        soup = self.soup(markup)
        # Newlines are stripped from both sides so that only the
        # non-newline content has to match.
        self.assertEqual(
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

        This is important because the builder is part of a
        BeautifulSoup object, and we want to be able to copy that.
        """
        copy.deepcopy(self.default_builder)

    def test_p_tag_is_never_empty_element(self):
        """A <p> tag is never designated as an empty-element tag.

        Even if the markup shows it as an empty-element tag, it
        shouldn't be presented that way.
        """
        soup = self.soup("<p/>")
        self.assertFalse(soup.p.is_empty_element)
        self.assertEqual(str(soup.p), "<p></p>")

    def test_unclosed_tags_get_closed(self):
        """A tag that's not closed by the end of the document should be closed.

        This applies to all tags except empty-element tags.
        """
        self.assertSoupEquals("<p>", "<p></p>")
        self.assertSoupEquals("<b>", "<b></b>")

        self.assertSoupEquals("<br>", "<br/>")

    def test_br_is_always_empty_element_tag(self):
        """A <br> tag is designated as an empty-element tag.

        Some parsers treat <br></br> as one <br/> tag, some parsers as
        two tags, but it should always be an empty-element tag.
        """
        soup = self.soup("<br></br>")
        self.assertTrue(soup.br.is_empty_element)
        self.assertEqual(str(soup.br), "<br/>")

    def test_nested_formatting_elements(self):
        """A formatting element nested in itself round-trips unchanged."""
        self.assertSoupEquals("<em><em></em></em>")

    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
        self.assertSoupEquals(markup)

        soup = self.soup(markup)
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

        # The comment is properly integrated into the tree.
        foo = soup.find(text="foo")
        self.assertEqual(comment, foo.next_element)
        baz = soup.find(text="baz")
        self.assertEqual(comment, baz.previous_element)

    def test_preserved_whitespace_in_pre_and_textarea(self):
        """Whitespace must be preserved in <pre> and <textarea> tags."""
        self.assertSoupEquals("<pre> </pre>")
        self.assertSoupEquals("<textarea> woo </textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
        b_tag = "<b>Inside a B tag</b>"
        self.assertSoupEquals(b_tag)

        nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
        self.assertSoupEquals(nested_b_tag)

        # Check the doubly nested markup itself; re-checking
        # nested_b_tag here would leave this case untested.
        double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)

    def test_nested_block_level_elements(self):
        """Block elements can be nested."""
        soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
        blockquote = soup.blockquote
        self.assertEqual(blockquote.p.b.string, 'Foo')
        self.assertEqual(blockquote.b.string, 'Foo')

    def test_correctly_nested_tables(self):
        """One table can go inside another one."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tr><td>Here\'s another table:'
            '<table id="2"><tr><td>foo</td></tr></table>'
            '</td></tr></table>')

        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_deeply_nested_multivalued_attribute(self):
        # html5lib can set the attributes of the same tag many times
        # as it rearranges the tree. This has caused problems with
        # multivalued attributes.
        markup = '<table><div><div class="css"></div></div></table>'
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

    def test_angle_brackets_in_attribute_values_are_escaped(self):
        """Angle brackets inside an attribute value are escaped on output."""
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

    def test_entities_in_attributes_converted_to_unicode(self):
        """Numeric and named entities in attribute values become Unicode."""
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        """Numeric and named entities in text become Unicode."""
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
        """The &quot; entity in text comes out as a literal quotation mark."""
        self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
                              '<p>I said "good day!"</p>')

    def test_out_of_range_entity(self):
        """Out-of-range numeric entities become the replacement character."""
        expect = u"\N{REPLACEMENT CHARACTER}"
        self.assertSoupEquals("&#10000000000000;", expect)
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

    def test_multipart_strings(self):
        """Mostly to prevent a recurrence of a bug in the html5lib treebuilder."""
        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
        self.assertEqual("p", soup.h2.string.next_element.name)
        self.assertEqual("p", soup.p.name)

    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
        data."""

        markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
        soup = self.soup(markup)
        self.assertEqual(markup, soup.encode())
        html = soup.html
        self.assertEqual('http://www.w3.org/1999/xhtml', html['xmlns'])
        self.assertEqual(
            'http://www.w3.org/1998/Math/MathML', html['xmlns:mathml'])
        self.assertEqual(
            'http://www.w3.org/2000/svg', html['xmlns:svg'])

    def test_multivalued_attribute_value_becomes_list(self):
        """A space-separated class attribute is exposed as a list."""
        markup = b'<a class="foo bar">'
        soup = self.soup(markup)
        self.assertEqual(['foo', 'bar'], soup.a['class'])

    #
    # Generally speaking, tests below this point are more tests of
    # Beautiful Soup than tests of the tree builders. But parsers are
    # weird, so we run these tests separately for every tree builder
    # to detect any differences between them.
    #

    def test_can_parse_unicode_document(self):
        # A seemingly innocuous document... but it's in Unicode! And
        # it contains characters that can't be represented in the
        # encoding found in the declaration! The horror!
        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
        soup = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)

    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
        soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
                         parse_only=strainer)
        self.assertEqual(soup.decode(), "<b>bold</b>")

    def test_single_quote_attribute_values_become_double_quotes(self):
        """Single-quoted attribute values are output with double quotes."""
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

    def test_attribute_values_with_nested_quotes_are_left_alone(self):
        """A value containing double quotes keeps its single-quote wrapper."""
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        self.assertSoupEquals(text)

    def test_attribute_values_with_double_nested_quotes_get_quoted(self):
        """A value holding both quote kinds has its double quotes escaped."""
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = self.soup(text)
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        self.assertSoupEquals(
            soup.foo.decode(),
            """<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")

    def test_ampersand_in_attribute_value_gets_escaped(self):
        """A bare ampersand in an attribute value is escaped on output."""
        self.assertSoupEquals('<this is="really messed up & stuff"></this>',
                              '<this is="really messed up &amp; stuff"></this>')

        self.assertSoupEquals(
            '<a href="http://example.org?a=1&b=2;3">foo</a>',
            '<a href="http://example.org?a=1&amp;b=2;3">foo</a>')

    def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
        """An already-escaped ampersand is not escaped a second time."""
        self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')

    def test_entities_in_strings_converted_during_parsing(self):
        # Both XML and HTML entities are converted to Unicode characters
        # during parsing.
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
        self.assertSoupEquals(text, expected)

    def test_smart_quotes_converted_on_the_way_in(self):
        # Microsoft smart quotes are converted to Unicode characters during
        # parsing.
        quote = b"<p>\x91Foo\x92</p>"
        soup = self.soup(quote)
        self.assertEqual(
            soup.p.string,
            u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")

    def test_non_breaking_spaces_converted_on_the_way_in(self):
        """&nbsp; entities are parsed into NO-BREAK SPACE characters."""
        soup = self.soup("<a>&nbsp;&nbsp;</a>")
        self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)

    def test_entities_converted_on_the_way_out(self):
        """Encoding a tag keeps &lt;/&gt; escaped and emits other text as UTF-8."""
        text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
        expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
        soup = self.soup(text)
        self.assertEqual(soup.p.encode("utf-8"), expected)

    def test_real_iso_latin_document(self):
        # Smoke test of interrelated functionality, using an
        # easy-to-understand document.

        # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
        unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

        # That's because we're going to encode it into ISO-Latin-1, and use
        # that to test.
        iso_latin_html = unicode_html.encode("iso-8859-1")

        # Parse the ISO-Latin-1 HTML.
        soup = self.soup(iso_latin_html)
        # Encode it to UTF-8.
        result = soup.encode("utf-8")

        # What do we expect the result to look like? Well, it would
        # look like unicode_html, except that the META tag would say
        # UTF-8 instead of ISO-Latin-1.
        expected = unicode_html.replace("ISO-Latin-1", "utf-8")

        # And, of course, it would be in UTF-8, not Unicode.
        expected = expected.encode("utf-8")

        # Ta-da!
        self.assertEqual(result, expected)

    def test_real_shift_jis_document(self):
        # Smoke test to make sure the parser can handle a document in
        # Shift-JIS encoding, without choking.
        shift_jis_html = (
            b'<html><head></head><body><pre>'
            b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
            b'</pre></body></html>')
        unicode_html = shift_jis_html.decode("shift-jis")
        soup = self.soup(unicode_html)

        # Make sure the parse tree is correctly encoded to various
        # encodings.
        self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
        self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))

    def test_real_hebrew_document(self):
        # A real-world test to make sure we can convert ISO-8859-8 (a
        # Hebrew encoding) to UTF-8.
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
        self.assertEqual(soup.original_encoding, 'iso8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))

    def test_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
        content = parsed_meta['content']
        self.assertEqual('text/html; charset=x-sjis', content)

        # But that value is actually a ContentMetaAttributeValue object.
        self.assertTrue(isinstance(content, ContentMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('text/html; charset=utf8', content.encode("utf8"))

        # For the rest of the story, see TestSubstitutions in
        # test_tree.py.

    def test_html5_style_meta_tag_reflects_current_encoding(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta id="encoding" charset="x-sjis" />')

        # Here's a document incorporating that meta tag.
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja"/>'
            '</head><body>Shift-JIS markup goes here.') % meta_tag
        soup = self.soup(shift_jis_html)

        # Parse the document, and the charset is seemingly unaffected.
        parsed_meta = soup.find('meta', id="encoding")
        charset = parsed_meta['charset']
        self.assertEqual('x-sjis', charset)

        # But that value is actually a CharsetMetaAttributeValue object.
        self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))

        # And it will take on a value that reflects its current
        # encoding.
        self.assertEqual('utf8', charset.encode("utf8"))

    def test_tag_with_no_attributes_can_have_attributes_added(self):
        """An attribute can be set on a tag that was parsed without any."""
        data = self.soup("<a>text</a>")
        data.a['foo'] = 'bar'
        self.assertEqual('<a foo="bar">text</a>', data.a.decode())
463
class XMLTreeBuilderSmokeTest(object):
    """Smoke tests for an XML tree builder.

    Mixin: the methods call ``self.soup``, ``self.assertSoupEquals``
    and ``unittest`` assertion methods that are not defined here.
    """

    def test_docstring_generated(self):
        """Encoding a document prepends an XML declaration."""
        parsed = self.soup("<root/>")
        self.assertEqual(
            parsed.encode(),
            b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
        round_tripped = self.soup(markup).encode("utf-8")
        self.assertEqual(round_tripped, markup)

    def test_formatter_processes_script_tag_for_xml_documents(self):
        """The content of a <script> tag is entity-escaped in XML output."""
        doc = """
 <script type="text/javascript">
 </script>
"""
        parsed = BeautifulSoup(doc, "xml")
        # lxml would have stripped this while parsing, but we can add
        # it later.
        parsed.script.string = 'console.log("< < hey > > ");'
        output = parsed.encode()
        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in output)

    def test_can_parse_unicode_document(self):
        """A Unicode document parses even if it declares another encoding."""
        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
        parsed = self.soup(markup)
        self.assertEqual(u'Sacr\xe9 bleu!', parsed.root.string)

    def test_popping_namespaced_tag(self):
        """A run of namespaced sibling tags serializes back unchanged."""
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        rss = self.soup(markup).rss
        self.assertEqual(unicode(rss), markup)

    def test_docstring_includes_correct_encoding(self):
        """The XML declaration names the encoding passed to ``encode``."""
        parsed = self.soup("<root/>")
        latin1_output = parsed.encode("latin1")
        self.assertEqual(
            latin1_output,
            b'<?xml version="1.0" encoding="latin1"?>\n<root/>')

    def test_large_xml_document(self):
        """A large XML document should come out the same as it went in."""
        pieces = [
            b'<?xml version="1.0" encoding="utf-8"?>\n<root>',
            b'0' * (2 ** 12),
            b'</root>',
        ]
        markup = b''.join(pieces)
        parsed = self.soup(markup)
        self.assertEqual(parsed.encode("utf-8"), markup)

    def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
        """A tag is written in empty-element form exactly when it is empty."""
        self.assertSoupEquals("<p>", "<p/>")
        self.assertSoupEquals("<p>foo</p>")

    def test_namespaces_are_preserved(self):
        """Namespace declarations remain readable as attributes of the root."""
        markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
        root_tag = self.soup(markup).root
        self.assertEqual("http://example.com/", root_tag['xmlns:a'])
        self.assertEqual("http://example.net/", root_tag['xmlns:b'])

    def test_closing_namespaced_tag(self):
        """A namespaced child tag is closed properly on output."""
        markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
        parsed = self.soup(markup)
        self.assertEqual(unicode(parsed.p), markup)

    def test_namespaced_attributes(self):
        """A namespaced attribute serializes back unchanged."""
        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
        parsed = self.soup(markup)
        self.assertEqual(unicode(parsed.foo), markup)

    def test_namespaced_attributes_xml_namespace(self):
        """An attribute using the ``xml:`` prefix serializes back unchanged."""
        markup = '<foo xml:lang="fr">bar</foo>'
        parsed = self.soup(markup)
        self.assertEqual(unicode(parsed.foo), markup)
546
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

    def test_real_xhtml_document(self):
        """Override the inherited XHTML round-trip test with a no-op.

        Since XHTML is not HTML5, HTML5 parsers are not tested to handle
        XHTML documents in any particular way.
        """

    def test_html_tags_have_namespace(self):
        """A plain HTML tag is placed in the XHTML namespace."""
        parsed = self.soup("<a>")
        self.assertEqual("http://www.w3.org/1999/xhtml", parsed.a.namespace)

    def test_svg_tags_have_namespace(self):
        """<svg> and the tags inside it are placed in the SVG namespace."""
        svg_namespace = "http://www.w3.org/2000/svg"
        parsed = self.soup('<svg><circle/></svg>')
        self.assertEqual(svg_namespace, parsed.svg.namespace)
        self.assertEqual(svg_namespace, parsed.circle.namespace)

    def test_mathml_tags_have_namespace(self):
        """<math> and the tags inside it are placed in the MathML namespace."""
        mathml_namespace = 'http://www.w3.org/1998/Math/MathML'
        parsed = self.soup('<math><msqrt>5</msqrt></math>')
        self.assertEqual(mathml_namespace, parsed.math.namespace)
        self.assertEqual(mathml_namespace, parsed.msqrt.namespace)

    def test_xml_declaration_becomes_comment(self):
        """A leading XML declaration is parsed into a Comment node."""
        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
        parsed = self.soup(markup)
        self.assertTrue(isinstance(parsed.contents[0], Comment))
        self.assertEqual(
            parsed.contents[0], '?xml version="1.0" encoding="utf-8"?')
        # The <html> tag comes immediately after the comment.
        self.assertEqual("html", parsed.contents[0].next_element.name)
581
def skipIf(condition, reason):
    """Build a decorator that disables a test item when ``condition`` is true.

    When ``condition`` is truthy, the decorated item is replaced by a
    stub that accepts one required positional argument plus any further
    arguments and returns None. Otherwise the item is returned
    untouched. ``reason`` is accepted but not used.
    """
    def nothing(test, *args, **kwargs):
        # Stand-in for a disabled test: swallow the arguments, do nothing.
        return None

    def decorator(test_item):
        # One stub is shared by everything this decorator disables.
        return nothing if condition else test_item

    return decorator