Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | """Tests to ensure that the html5lib tree builder generates good trees.""" |
| 2 | |
| 3 | import warnings |
| 4 | |
# html5lib may not be installed. Record whether its tree builder could be
# imported so the html5lib-specific tests can be skipped when it is absent.
try:
    from bs4.builder import HTML5TreeBuilder
    HTML5LIB_PRESENT = True
except ImportError:
    # The exception object itself is not needed; only the outcome matters.
    HTML5LIB_PRESENT = False
| 10 | from bs4.element import SoupStrainer |
| 11 | from bs4.testing import ( |
| 12 | HTML5TreeBuilderSmokeTest, |
| 13 | SoupTest, |
| 14 | skipIf, |
| 15 | ) |
| 16 | |
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``.

    The tests defined directly on this class cover behavior that is
    specific to the html5lib tree builder: its handling of ``parse_only``,
    implicit ``<tbody>`` insertion, and repair of misnested markup.
    """

    @property
    def default_builder(self):
        """Return a fresh ``HTML5TreeBuilder`` for the tests to parse with."""
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        """A SoupStrainer is ignored, with a warning, by this builder."""
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as caught:
            # Record every warning regardless of the ambient filter
            # configuration; catch_warnings() restores the filters on exit.
            warnings.simplefilter("always")
            soup = self.soup(markup, parse_only=strainer)

        # The whole document is parsed, not just the <b> tag.
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        # Look through everything that was recorded rather than assuming the
        # parse_only warning is the first (or only) one issued.
        expected = "the html5lib tree builder doesn't support parse_only"
        messages = [str(warning.message) for warning in caught]
        self.assertTrue(
            any(expected in message for message in messages),
            "expected a parse_only warning, got: %r" % (messages,))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        # A table that already spells out thead/tbody/tfoot is passed with no
        # separate expected value.
        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        """An XML declaration before the doctype leaves the tree connected."""
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())

    def test_reparented_markup(self):
        """An <em> misnested across two <p> tags is repaired and searchable."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
        self.assertEqual(
            "<body><p><em>foo</em></p><em>\n</em>"
            "<p><em>bar<a></a></em></p></body>",
            soup.body.decode())
        # Both paragraphs must still be reachable by searching the tree.
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_ends_with_whitespace(self):
        """Same as ``test_reparented_markup``, with a trailing newline."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
        self.assertEqual(
            "<body><p><em>foo</em></p><em>\n</em>"
            "<p><em>bar<a></a></em></p>\n</body>",
            soup.body.decode())
        # Both paragraphs must still be reachable by searching the tree.
        self.assertEqual(2, len(soup.find_all('p')))

    def test_processing_instruction(self):
        """Processing instructions become comments."""
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        # Use a TestCase assertion: a bare ``assert`` is removed under -O.
        self.assertTrue(
            str(soup).startswith("<!--?PITarget PIContent?-->"))

    def test_cloned_multivalue_node(self):
        """Two <a> tags from one source tag are equal but distinct objects."""
        markup = b"""<a class="my_class"><p></a>"""
        soup = self.soup(markup)
        # The unpacking itself checks that exactly two <a> tags were produced.
        a1, a2 = soup.find_all('a')
        self.assertEqual(a1, a2)
        # Use a TestCase assertion: a bare ``assert`` is removed under -O.
        self.assertTrue(a1 is not a2)