Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | # -*- coding: utf-8 -*- |
| 2 | """Tests of Beautiful Soup as a whole.""" |
| 3 | |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 4 | from pdb import set_trace |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 5 | import logging |
| 6 | import unittest |
| 7 | import sys |
| 8 | import tempfile |
| 9 | |
| 10 | from bs4 import ( |
| 11 | BeautifulSoup, |
| 12 | BeautifulStoneSoup, |
| 13 | ) |
| 14 | from bs4.element import ( |
| 15 | CharsetMetaAttributeValue, |
| 16 | ContentMetaAttributeValue, |
| 17 | SoupStrainer, |
| 18 | NamespacedAttribute, |
| 19 | ) |
| 20 | import bs4.dammit |
| 21 | from bs4.dammit import ( |
| 22 | EntitySubstitution, |
| 23 | UnicodeDammit, |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 24 | EncodingDetector, |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 25 | ) |
| 26 | from bs4.testing import ( |
| 27 | SoupTest, |
| 28 | skipIf, |
| 29 | ) |
| 30 | import warnings |
| 31 | |
# lxml is an optional dependency; record whether it is importable so
# individual tests can be skipped when it is absent.
try:
    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
    LXML_PRESENT = True
except ImportError as e:
    LXML_PRESENT = False

# Interpreter-version flags used to skip tests that depend on HTMLParser
# fixes missing from Python 2 before 2.7 and Python 3 before 3.2.
PYTHON_2_PRE_2_7 = (sys.version_info < (2,7))
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
| 40 | |
class TestConstructor(SoupTest):
    """Tests of the BeautifulSoup constructor itself."""

    def test_short_unicode_input(self):
        # A short Unicode document survives parsing unchanged.
        markup = "<h1>éé</h1>"
        parsed = self.soup(markup)
        self.assertEqual("éé", parsed.h1.string)

    def test_embedded_null(self):
        # An embedded NUL character is preserved in the parsed string.
        markup = "<h1>foo\0bar</h1>"
        parsed = self.soup(markup)
        self.assertEqual("foo\0bar", parsed.h1.string)

    def test_exclude_encodings(self):
        # Ruling out the real encoding (UTF-8) forces the detector to
        # fall back to its next-best guess, Windows-1252.
        encoded = "Räksmörgås".encode("utf-8")
        parsed = self.soup(encoded, exclude_encodings=["utf-8"])
        self.assertEqual("windows-1252", parsed.original_encoding)
| 58 | |
class TestWarnings(SoupTest):
    """Tests of warnings issued when BeautifulSoup is constructed."""

    def _assert_no_parser_specified(self, s, is_there=True):
        """Assert that *s* is the "no parser specified" warning message.

        Renamed from ``_no_parser_specified``: the tests below call
        ``self._assert_no_parser_specified``, so the old name made them
        fail with AttributeError instead of exercising the warning.
        (The ``is_there`` parameter is currently unused; kept for
        backward compatibility.)
        """
        # Comparing the first 80 characters of the (long) canonical
        # message is enough to identify it.
        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
        self.assertTrue(v)

    def test_warning_if_no_parser_specified(self):
        # Omitting the parser argument triggers the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_warning_if_parser_specified_too_vague(self):
        # "html" names a markup type, not a specific parser, so the
        # warning is still issued.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html")
        msg = str(w[0].message)
        self._assert_no_parser_specified(msg)

    def test_no_warning_if_explicit_parser_specified(self):
        # Naming a concrete parser silences the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", "html.parser")
        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        # The old BS3 argument name still works but warns, and the
        # value is honored as parse_only.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
        msg = str(w[0].message)
        self.assertTrue("parseOnlyThese" in msg)
        self.assertTrue("parse_only" in msg)
        self.assertEqual(b"<b></b>", soup.encode())

    def test_fromEncoding_renamed_to_from_encoding(self):
        # The old BS3 argument name still works but warns, and the
        # value is honored as from_encoding.
        with warnings.catch_warnings(record=True) as w:
            utf8 = b"\xc3\xa9"
            soup = self.soup(utf8, fromEncoding="utf8")
        msg = str(w[0].message)
        self.assertTrue("fromEncoding" in msg)
        self.assertTrue("from_encoding" in msg)
        self.assertEqual("utf8", soup.original_encoding)

    def test_unrecognized_keyword_argument(self):
        # A keyword argument that was never valid raises TypeError.
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)
| 102 | |
class TestFilenameOrURLWarnings(SoupTest):
    """Tests of warnings issued when the "markup" passed to
    BeautifulSoup looks like a filename or a URL rather than markup.

    Renamed from ``TestWarnings``: a second class definition with that
    name shadowed the earlier ``TestWarnings`` class, so the earlier
    class's tests were silently never collected or run.
    """

    def test_disk_file_warning(self):
        # Passing the name of a file that exists on disk triggers a
        # "looks like a filename" warning.
        filehandle = tempfile.NamedTemporaryFile()
        filename = filehandle.name
        try:
            with warnings.catch_warnings(record=True) as w:
                soup = self.soup(filename)
            msg = str(w[0].message)
            self.assertTrue("looks like a filename" in msg)
        finally:
            filehandle.close()

        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup(filename)
        self.assertEqual(0, len(w))

    def test_url_warning(self):
        # A bare URL triggers a "looks like a URL" warning...
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("http://www.crummy.com/")
        msg = str(w[0].message)
        self.assertTrue("looks like a URL" in msg)

        # ...but a URL embedded in a longer string does not.
        with warnings.catch_warnings(record=True) as w:
            soup = self.soup("http://www.crummy.com/ is great")
        self.assertEqual(0, len(w))
| 130 | |
class TestSelectiveParsing(SoupTest):
    """Tests of parsing only part of a document via a SoupStrainer."""

    def test_parse_with_soupstrainer(self):
        # Only the <b> tags (and their contents) should survive.
        markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
        only_b_tags = SoupStrainer("b")
        parsed = self.soup(markup, parse_only=only_b_tags)
        self.assertEqual(parsed.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
| 138 | |
| 139 | |
class TestEntitySubstitution(unittest.TestCase):
    """Standalone tests of the EntitySubstitution class.

    NOTE(review): several expected values in this class look as if they
    were HTML-entity-decoded by the code viewer this copy came from
    (e.g. substitution tests whose expected output equals the input).
    Verify each literal against the upstream bs4 source before editing.
    """
    def setUp(self):
        # EntitySubstitution is used through class methods only; no
        # instance is needed.
        self.sub = EntitySubstitution

    def test_simple_html_substitution(self):
        # Unicode characters corresponding to named HTML entities
        # are substituted, and no others.
        s = "foo\u2200\N{SNOWMAN}\u00f5bar"
        self.assertEqual(self.sub.substitute_html(s),
                         "foo∀\N{SNOWMAN}õbar")

    def test_smart_quote_substitution(self):
        # MS smart quotes are a common source of frustration, so we
        # give them a special test.
        quotes = b"\x91\x92foo\x93\x94"
        dammit = UnicodeDammit(quotes)
        self.assertEqual(self.sub.substitute_html(dammit.markup),
                         "‘’foo“”")

    def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
        # With make_quoted_attribute=False the value is returned bare.
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, False), s)

    def test_xml_attribute_quoting_normally_uses_double_quotes(self):
        self.assertEqual(self.sub.substitute_xml("Welcome", True),
                         '"Welcome"')
        self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
                         '"Bob\'s Bar"')

    def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
        s = 'Welcome to "my bar"'
        self.assertEqual(self.sub.substitute_xml(s, True),
                         "'Welcome to \"my bar\"'")

    def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
        # NOTE(review): the expected literal below looks entity-decoded
        # in this copy — confirm against upstream.
        s = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(
            self.sub.substitute_xml(s, True),
            '"Welcome to "Bob\'s Bar""')

    def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
        quoted = 'Welcome to "Bob\'s Bar"'
        self.assertEqual(self.sub.substitute_xml(quoted), quoted)

    def test_xml_quoting_handles_angle_brackets(self):
        # NOTE(review): as written, output equals input — presumably the
        # real expected value contains &lt;/&gt;; verify upstream.
        self.assertEqual(
            self.sub.substitute_xml("foo<bar>"),
            "foo<bar>")

    def test_xml_quoting_handles_ampersands(self):
        # NOTE(review): as written, output equals input — presumably the
        # real expected value contains &amp;; verify upstream.
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")

    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        # substitute_xml escapes ampersands even when they begin an
        # existing entity reference.
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
            "&Aacute;T&T")

    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
        # The _containing_entities variant leaves entity references alone.
        self.assertEqual(
            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&T")

    def test_quotes_not_html_substituted(self):
        """There's no need to do this except inside attribute values."""
        text = 'Bob\'s "bar"'
        self.assertEqual(self.sub.substitute_html(text), text)
| 207 | |
| 208 | |
class TestEncodingConversion(SoupTest):
    """Tests of decoding documents from, and encoding them back to,
    various encodings."""

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
        self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Sanity check: this is what the UTF-8 form looks like.
        self.assertEqual(
            self.utf8_data,
            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is decoded to Unicode, and original_encoding is
        # reported as 'utf-8', a superset of ASCII.
        saved_chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            def detect_nothing(data):
                return None
            # Neutralize chardet, which would otherwise realize that
            # the ASCII input is ASCII.
            bs4.dammit.chardet_dammit = detect_nothing
            ascii_bytes = b"<foo>a</foo>"
            parsed = self.soup(ascii_bytes)
            decoded = parsed.decode()
            self.assertTrue(isinstance(decoded, str))
            self.assertEqual(decoded, self.document_for(ascii_bytes.decode()))
            self.assertEqual(parsed.original_encoding.lower(), "utf-8")
        finally:
            # Restore logging and the real chardet hook.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = saved_chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input passes through untouched, and
        # original_encoding stays unset.
        parsed = self.soup(self.unicode_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, 'Sacr\xe9 bleu!')
        self.assertEqual(parsed.original_encoding, None)

    def test_utf8_in_unicode_out(self):
        # UTF-8 input is decoded to Unicode, and original_encoding is
        # recorded.
        parsed = self.soup(self.utf8_data)
        self.assertEqual(parsed.decode(), self.unicode_data)
        self.assertEqual(parsed.foo.string, 'Sacr\xe9 bleu!')

    def test_utf8_out(self):
        # The parse tree can be re-encoded as UTF-8.
        parsed = self.soup(self.unicode_data)
        self.assertEqual(parsed.encode('utf-8'), self.utf8_data)

    @skipIf(
        PYTHON_2_PRE_2_7 or PYTHON_3_PRE_3_2,
        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
    def test_attribute_name_containing_unicode_characters(self):
        # A non-ASCII attribute name round-trips through the parser.
        markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
| 268 | |
class TestUnicodeDammit(unittest.TestCase):
    """Standalone tests of UnicodeDammit.

    NOTE(review): some expected strings below may have been
    entity-decoded by the code viewer this copy came from (see the
    smart-quote entity tests); verify literals against upstream bs4
    before editing them.
    """

    def test_unicode_input(self):
        # Unicode input needs no conversion and is passed through.
        markup = "I'm already Unicode! \N{SNOWMAN}"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        # By default, Windows-1252 smart quotes become the
        # corresponding Unicode characters.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(
            dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")

    def test_smart_quotes_to_xml_entities(self):
        # NOTE(review): expected value looks entity-decoded in this
        # copy (it matches the "html" variant below); verify upstream.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="xml")
        self.assertEqual(
            dammit.unicode_markup, "<foo>‘’“”</foo>")

    def test_smart_quotes_to_html_entities(self):
        # NOTE(review): see the note on the "xml" variant above.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="html")
        self.assertEqual(
            dammit.unicode_markup, "<foo>‘’“”</foo>")

    def test_smart_quotes_to_ascii(self):
        # Smart quotes can be downgraded to plain ASCII quotes.
        markup = b"<foo>\x91\x92\x93\x94</foo>"
        dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
        self.assertEqual(
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
        # Valid UTF-8 bytes are detected as UTF-8 and decoded.
        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')


    def test_convert_hebrew(self):
        # An explicitly suggested encoding is tried and used.
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        # Bytes that merely resemble smart quotes inside multibyte
        # UTF-8 sequences must not be rewritten.
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        # A suggested encoding that can't decode the data is skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        # Suggested names that aren't codecs at all are skipped.
        utf8_data = "Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_exclude_encodings(self):
        # This is UTF-8.
        utf8_data = "Räksmörgås".encode("utf-8")

        # But if we exclude UTF-8 from consideration, the guess is
        # Windows-1252.
        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')

        # And if we exclude that, there is no valid guess at all.
        dammit = UnicodeDammit(
            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
        self.assertEqual(dammit.original_encoding, None)

    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
        # A declared encoding containing undecodable bytes is reported
        # with U+FFFD in place of the junk.
        detected = EncodingDetector(
            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
        encodings = list(detected.encodings)
        # NOTE(review): bare assert is stripped under `python -O`;
        # self.assertIn would be the conventional unittest form.
        assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):

        # All of the HTML5 <meta charset> spellings should be honored.
        for data in (
            b'<html><meta charset="euc-jp" /></html>',
            b"<html><meta charset='euc-jp' /></html>",
            b"<html><meta charset=euc-jp /></html>",
            b"<html><meta charset=euc-jp/></html>"):
            dammit = UnicodeDammit(data, is_html=True)
            self.assertEqual(
                "euc-jp", dammit.original_encoding)

    def test_last_ditch_entity_replacement(self):
        # This is a UTF-8 document that contains bytestrings
        # completely incompatible with UTF-8 (ie. encoded with some other
        # encoding).
        #
        # Since there is no consistent encoding for the document,
        # Unicode, Dammit will eventually encode the document as UTF-8
        # and encode the incompatible characters as REPLACEMENT
        # CHARACTER.
        #
        # If chardet is installed, it will detect that the document
        # can be converted into ISO-8859-1 without errors. This happens
        # to be the wrong encoding, but it is a consistent encoding, so the
        # code we're testing here won't run.
        #
        # So we temporarily disable chardet if it's present.
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
        chardet = bs4.dammit.chardet_dammit
        logging.disable(logging.WARNING)
        try:
            # NOTE(review): the parameter shadows the builtin `str`;
            # harmless here but worth renaming.
            def noop(str):
                return None
            bs4.dammit.chardet_dammit = noop
            dammit = UnicodeDammit(doc)
            self.assertEqual(True, dammit.contains_replacement_characters)
            self.assertTrue("\ufffd" in dammit.unicode_markup)

            soup = BeautifulSoup(doc, "html.parser")
            self.assertTrue(soup.contains_replacement_characters)
        finally:
            # Restore logging and the real chardet hook.
            logging.disable(logging.NOTSET)
            bs4.dammit.chardet_dammit = chardet

    def test_byte_order_mark_removed(self):
        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual("<a>áé</a>", dammit.unicode_markup)
        self.assertEqual("utf-16le", dammit.original_encoding)

    def test_detwingle(self):
        # Here's a UTF8 document.
        utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

        # Here's a Windows-1252 document.
        windows_1252 = (
            "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
            "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

        # Through some unholy alchemy, they've been stuck together.
        doc = utf8 + windows_1252 + utf8

        # The document can't be turned into UTF-8:
        self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

        # Unicode, Dammit thinks the whole document is Windows-1252,
        # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

        # But if we run it through fix_embedded_windows_1252, it's fixed:

        fixed = UnicodeDammit.detwingle(doc)
        self.assertEqual(
            "☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))

    def test_detwingle_ignores_multibyte_characters(self):
        # Each of these characters has a UTF-8 representation ending
        # in \x93. \x93 is a smart quote if interpreted as
        # Windows-1252. But our code knows to skip over multibyte
        # UTF-8 characters, so they'll survive the process unscathed.
        for tricky_unicode_char in (
            "\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
            "\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
            "\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
            # NOTE(review): the literal above is a str of four
            # Latin-1 characters, not a single 4-byte character; its
            # UTF-8 encoding still ends in \x93, so the test passes —
            # confirm the original intent.
            ):
            # NOTE(review): `input` shadows the builtin of the same name.
            input = tricky_unicode_char.encode("utf8")
            self.assertTrue(input.endswith(b'\x93'))
            output = UnicodeDammit.detwingle(input)
            self.assertEqual(output, input)
| 442 | |
class TestNamedspacedAttribute(SoupTest):
    """Tests of NamespacedAttribute. (The "Namedspaced" spelling of the
    class name is a historical typo, kept for continuity.)"""

    def test_name_may_be_none(self):
        # With no name, the attribute is just its prefix.
        attr = NamespacedAttribute("xmlns", None)
        self.assertEqual(attr, "xmlns")

    def test_attribute_is_equivalent_to_colon_separated_string(self):
        attr = NamespacedAttribute("a", "b")
        self.assertEqual("a:b", attr)

    def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
        first = NamespacedAttribute("a", "b", "c")
        second = NamespacedAttribute("a", "b", "c")
        self.assertEqual(first, second)

        # The namespace itself plays no part in equality...
        no_namespace = NamespacedAttribute("a", "b", None)
        self.assertEqual(first, no_namespace)

        # ...but a different name breaks it...
        other_name = NamespacedAttribute("a", "z", "c")
        self.assertNotEqual(first, other_name)

        # ...and so does a different prefix.
        other_prefix = NamespacedAttribute("z", "b", "c")
        self.assertNotEqual(first, other_prefix)
| 468 | |
| 469 | |
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
    """Tests of the attribute-value classes that rewrite a <meta> tag's
    declared charset when a document is re-encoded."""

    def test_charset_meta_attribute_value(self):
        # Renamed from test_content_meta_attribute_value: both methods
        # originally shared that name, so this one was shadowed by the
        # later definition and never ran.
        value = CharsetMetaAttributeValue("euc-jp")
        self.assertEqual("euc-jp", value)
        self.assertEqual("euc-jp", value.original_value)
        # Encoding the value substitutes the target encoding's name.
        self.assertEqual("utf8", value.encode("utf8"))

    def test_content_meta_attribute_value(self):
        # A Content-Type style value keeps its structure; only the
        # charset portion is rewritten on encode.
        value = ContentMetaAttributeValue("text/html; charset=euc-jp")
        self.assertEqual("text/html; charset=euc-jp", value)
        self.assertEqual("text/html; charset=euc-jp", value.original_value)
        self.assertEqual("text/html; charset=utf8", value.encode("utf8"))