# -*- coding: utf-8 -*-
"""Tests for Beautiful Soup's tree traversal methods.

The tree traversal methods are the main advantage of using Beautiful
Soup over just using a parser.

Different parsers will build different Beautiful Soup trees given the
same markup, but all Beautiful Soup trees can be traversed with the
methods tested here.
"""

import copy
import pickle
import re
import warnings
from bs4 import BeautifulSoup
from bs4.builder import (
    builder_registry,
    HTMLParserTreeBuilder,
)
from bs4.element import (
    CData,
    Comment,
    Doctype,
    NavigableString,
    SoupStrainer,
    Tag,
)
from bs4.testing import (
    SoupTest,
    skipIf,
)

XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)

class TreeTest(SoupTest):

    def assertSelects(self, tags, should_match):
        """Make sure that the given tags have the correct text.

        This is used in tests that define a bunch of tags, each
        containing a single string, and then select certain strings by
        some mechanism.
        """
        self.assertEqual([tag.string for tag in tags], should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Make sure that the given tags have the correct IDs.

        This is used in tests that define a bunch of tags, each
        containing a single string, and then select certain strings by
        some mechanism.
        """
        self.assertEqual([tag['id'] for tag in tags], should_match)


class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it's not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        self.assertEqual(soup.find("b").string, "2")

    def test_unicode_text_find(self):
        soup = self.soup(u'<h1>Räksmörgås</h1>')
        self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        soup = self.soup("<a>foo</a><b>bar</b>")
        self.assertEqual(2, len(soup.find_all()))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        self.assertEqual(2, len(soup.find_all('a')))

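    def test_find_matches_first_result_of_find_all(self):
        # Illustrative sketch, not part of the original suite: per the
        # class docstring, find() is just find_all() with limit=1, so it
        # returns the first match, and None when nothing matches.
        soup = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        self.assertEqual(soup.find("b"), soup.find_all("b", limit=1)[0])
        self.assertEqual(None, soup.find("nosuchtag"))
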
class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        # Exact match.
        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
        # Match any of a number of strings.
        self.assertEqual(
            soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
        # Match a regular expression.
        self.assertEqual(soup.find_all(text=re.compile('.*')),
                         [u"Foo", u"bar", u'\xbb'])
        # Match anything.
        self.assertEqual(soup.find_all(text=True),
                         [u"Foo", u"bar", u'\xbb'])

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        soup = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        self.assertSelects(soup.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(soup.find_all('a', limit=1), ["1"])
        self.assertSelects(
            soup.find_all('a', limit=10), ["1", "2", "3", "4", "5"])

        # A limit of 0 means no limit.
        self.assertSelects(
            soup.find_all('a', limit=0), ["1", "2", "3", "4", "5"])

    def test_calling_a_tag_is_calling_findall(self):
        soup = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        self.assertSelects(soup('a', limit=1), ["1"])
        self.assertSelects(soup.b(id="foo"), ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        soup = self.soup("<a></a>")
        # Create a self-referential list.
        l = []
        l.append(l)

        # Without special code in _normalize_search_value, this would cause infinite
        # recursion.
        self.assertEqual([], soup.find_all(l))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        soup = self.soup("<a></a>")
        result = soup.find_all("a")
        self.assertTrue(hasattr(result, "source"))

        result = soup.find_all(True)
        self.assertTrue(hasattr(result, "source"))

        result = soup.find_all(text="foo")
        self.assertTrue(hasattr(result, "source"))


class TestFindAllBasicNamespaces(TreeTest):

    def test_find_by_namespaced_name(self):
        soup = self.soup('<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">')
        self.assertEqual("4", soup.find("mathml:msqrt").string)
        self.assertEqual("a", soup.find(attrs={"svg:fill": "red"}).name)


class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup("""<a>First tag.</a>
                                 <b>Second tag.</b>
                                 <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        # Find all the <a> tags.
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        # You can call find_all on any node, not just the root.
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        self.assertSelects(
            self.tree.find_all({'a': True, 'b': True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        # You can define an oracle method that determines whether
        # a tag matches the search.
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])

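    def test_find_all_with_lambda_matching_method(self):
        # A minimal illustrative sketch, not part of the original suite:
        # a lambda works the same way as the named oracle method above.
        self.assertSelects(
            self.tree.find_all(lambda tag: tag.name == 'b'),
            ['Second tag.'])
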
class TestFindAllByAttribute(TreeTest):

    def test_find_all_by_attribute_name(self):
        # You can pass in keyword arguments to find_all to search by
        # attribute.
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        peace = u"םולש".encode("utf8")
        data = u'<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual([soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        # You can pass in a dictionary as the argument 'attrs'. This
        # lets you search for attributes like 'name' (a fixed argument
        # to find_all) and 'class' (a reserved word in Python.)
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name': 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class': 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id': 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_attribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

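    def test_find_all_with_boolean_attribute_values_in_attrs_dict(self):
        # A small sketch, not part of the original suite, resting on the
        # assumption that values inside the 'attrs' dictionary are matched
        # the same way as the keyword arguments used above.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>""")
        self.assertSelects(tree.find_all(attrs={'id': True}), ["ID present."])
        self.assertSelects(
            tree.find_all(attrs={'id': None}), ["No ID present."])
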
    def test_find_all_with_numeric_attribute(self):
        # If you search for a number, it's treated as a string.
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        # You can pass a list of attribute values instead of just one,
        # and you'll get tags that match any of the values.
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        # You can pass a regular expression as an attribute value, and
        # you'll get tags whose values for that attribute match the
        # regular expression.
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        self.assertEqual([], soup.find_all("a", text="bar"))
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))


class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        tree = self.soup("""<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>""")
        div = tree.div
        for i, element in enumerate(div.contents):
            self.assertEqual(i, div.index(element))
        self.assertRaises(ValueError, tree.index, 1)


class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        super(TestParentOperations, self).setUp()
        self.tree = self.soup('''<ul id="empty"></ul>
                                 <ul id="top">
                                  <ul id="middle">
                                   <ul id="bottom">
                                    <b>Start here</b>
                                   </ul>
                                  </ul>''')
        self.start = self.tree.b

    def test_parent(self):
        self.assertEqual(self.start.parent['id'], 'bottom')
        self.assertEqual(self.start.parent.parent['id'], 'middle')
        self.assertEqual(self.start.parent.parent.parent['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        top_tag = self.tree.contents[0]
        self.assertEqual(top_tag.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        self.assertSelectsIDs(
            self.start.find_parents('ul'), ['bottom', 'middle', 'top'])
        self.assertSelectsIDs(
            self.start.find_parents('ul', id="middle"), ['middle'])

    def test_find_parent(self):
        self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
        self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')

    def test_parent_of_text_element(self):
        text = self.tree.find(text="Start here")
        self.assertEqual(text.parent.name, 'b')

    def test_text_element_find_parent(self):
        text = self.tree.find(text="Start here")
        self.assertEqual(text.find_parent('ul')['id'], 'bottom')

    def test_parent_generator(self):
        parents = [parent['id'] for parent in self.start.parents
                   if parent is not None and 'id' in parent.attrs]
        self.assertEqual(parents, ['bottom', 'middle', 'top'])

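    def test_find_parents_with_limit(self):
        # Minimal sketch, not part of the original suite: find_parents() is
        # assumed to accept the same 'limit' argument as find_all().
        self.assertSelectsIDs(
            self.start.find_parents('ul', limit=2), ['bottom', 'middle'])
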
class ProximityTest(TreeTest):

    def setUp(self):
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')


class TestNextOperations(ProximityTest):

    def setUp(self):
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        self.start.find_all_next(id=3)
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        start = self.tree.find(text="Two")
        successors = [node for node in start.next_elements]
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")

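    def test_find_all_next_with_limit(self):
        # Minimal sketch, not part of the original suite: find_all_next()
        # is assumed to accept the same 'limit' argument as find_all().
        self.assertSelects(self.start.find_all_next('b', limit=1), ["Two"])
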
class TestPreviousOperations(ProximityTest):

    def setUp(self):
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        self.assertEqual(self.end.previous_element['id'], "3")
        self.assertEqual(self.end.previous_element.previous_element, "Two")

    def test_previous_of_first_item_is_none(self):
        first = self.tree.find('html')
        self.assertEqual(first.previous_element, None)

    def test_previous_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        # XXX This is broken!
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        # The <b> tag containing the "Three" node is the predecessor
        # of the "Three" node itself, which is why "Three" shows up
        # here.
        self.assertSelects(
            self.end.find_all_previous('b'), ["Three", "Two", "One"])
        self.assertSelects(self.end.find_all_previous(id=1), ["One"])

    def test_find_previous(self):
        self.assertEqual(self.end.find_previous('b')['id'], '3')
        self.assertEqual(self.end.find_previous(text="One"), "One")

    def test_find_previous_for_text_element(self):
        text = self.tree.find(text="Three")
        self.assertEqual(text.find_previous("b").string, "Three")
        self.assertSelects(
            text.find_all_previous("b"), ["Three", "Two", "One"])

    def test_previous_generator(self):
        start = self.tree.find(text="One")
        predecessors = [node for node in start.previous_elements]

        # There are four predecessors: the <b> tag containing "One",
        # the <body> tag, the <head> tag, and the <html> tag.
        b, body, head, html = predecessors
        self.assertEqual(b['id'], '1')
        self.assertEqual(body.name, "body")
        self.assertEqual(head.name, "head")
        self.assertEqual(html.name, "html")


class SiblingTest(TreeTest):

    def setUp(self):
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)


class TestNextSibling(SiblingTest):

    def setUp(self):
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        self.assertEqual(self.start.next_sibling['id'], '2')
        self.assertEqual(self.start.next_sibling.next_sibling['id'], '3')

        # Note the difference between next_sibling and next_element.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.next_sibling, None)

        nested_span = self.tree.find(id="1.1")
        self.assertEqual(nested_span.next_sibling, None)

        last_span = self.tree.find(id="4")
        self.assertEqual(last_span.next_sibling, None)

    def test_find_next_sibling(self):
        self.assertEqual(self.start.find_next_sibling('span')['id'], '2')

    def test_next_siblings(self):
        self.assertSelectsIDs(self.start.find_next_siblings("span"),
                              ['2', '3', '4'])

        self.assertSelectsIDs(self.start.find_next_siblings(id='3'), ['3'])

    def test_next_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        start = soup.find(text="Foo")
        self.assertEqual(start.next_sibling.name, 'b')
        self.assertEqual(start.next_sibling.next_sibling, 'baz')

        self.assertSelects(start.find_next_siblings('b'), ['bar'])
        self.assertEqual(start.find_next_sibling(text="baz"), "baz")
        self.assertEqual(start.find_next_sibling(text="nonesuch"), None)

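    def test_next_siblings_generator(self):
        # Minimal sketch, not part of the original suite: .next_siblings is
        # assumed to be the generator counterpart of find_next_siblings(),
        # like the .parents generator used in TestParentOperations.
        self.assertEqual(
            ['2', '3', '4'],
            [sibling['id'] for sibling in self.start.next_siblings])

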
class TestPreviousSibling(SiblingTest):

    def setUp(self):
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        self.assertEqual(self.end.previous_sibling['id'], '3')
        self.assertEqual(self.end.previous_sibling.previous_sibling['id'], '2')

        # Note the difference between previous_sibling and previous_element.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        self.assertEqual(self.tree.html.previous_sibling, None)

        nested_span = self.tree.find(id="1.1")
        self.assertEqual(nested_span.previous_sibling, None)

        first_span = self.tree.find(id="1")
        self.assertEqual(first_span.previous_sibling, None)

    def test_find_previous_sibling(self):
        self.assertEqual(self.end.find_previous_sibling('span')['id'], '3')

    def test_previous_siblings(self):
        self.assertSelectsIDs(self.end.find_previous_siblings("span"),
                              ['3', '2', '1'])

        self.assertSelectsIDs(self.end.find_previous_siblings(id='1'), ['1'])

    def test_previous_sibling_for_text_element(self):
        soup = self.soup("Foo<b>bar</b>baz")
        start = soup.find(text="baz")
        self.assertEqual(start.previous_sibling.name, 'b')
        self.assertEqual(start.previous_sibling.previous_sibling, 'Foo')

        self.assertSelects(start.find_previous_siblings('b'), ['bar'])
        self.assertEqual(start.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(start.find_previous_sibling(text="nonesuch"), None)


class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""

    def test_new_tag(self):
        soup = self.soup("")
        new_tag = soup.new_tag("foo", bar="baz")
        self.assertTrue(isinstance(new_tag, Tag))
        self.assertEqual("foo", new_tag.name)
        self.assertEqual(dict(bar="baz"), new_tag.attrs)
        self.assertEqual(None, new_tag.parent)

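    def test_new_tag_can_be_attached_to_the_tree(self):
        # Illustrative sketch, not part of the original suite: a tag made
        # with new_tag() starts out detached and can be attached with
        # append(), which is exercised in TestTreeModification below.
        soup = self.soup("<a></a>")
        new_tag = soup.new_tag("b")
        soup.a.append(new_tag)
        self.assertEqual(soup.a, new_tag.parent)
        self.assertEqual("b", soup.a.b.name)
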
    def test_tag_inherits_self_closing_rules_from_builder(self):
        if XML_BUILDER_PRESENT:
            xml_soup = BeautifulSoup("", "xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

            # Both the <br> and <p> tags are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_soup = BeautifulSoup("", "html")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())

    def test_new_string_creates_navigablestring(self):
        soup = self.soup("")
        s = soup.new_string("foo")
        self.assertEqual("foo", s)
        self.assertTrue(isinstance(s, NavigableString))

    def test_new_string_can_create_navigablestring_subclass(self):
        soup = self.soup("")
        s = soup.new_string("foo", Comment)
        self.assertEqual("foo", s)
        self.assertTrue(isinstance(s, Comment))


class TestTreeModification(SoupTest):

    def test_attribute_modification(self):
        soup = self.soup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
        del(soup.a['id'])
        self.assertEqual(soup.decode(), self.document_for('<a></a>'))
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))

    def test_new_tag_creation(self):
        builder = builder_registry.lookup('html')()
        soup = self.soup("<body></body>", builder=builder)
        a = Tag(soup, builder, 'a')
        ol = Tag(soup, builder, 'ol')
        a['href'] = 'http://foo.com/'
        soup.body.insert(0, a)
        soup.body.insert(1, ol)
        self.assertEqual(
            soup.body.encode(),
            b'<body><a href="http://foo.com/"></a><ol></ol></body>')

    def test_append_to_contents_moves_tag(self):
        doc = """<p id="1">Don't leave me <b>here</b>.</p>
<p id="2">Don\'t leave!</p>"""
        soup = self.soup(doc)
        second_para = soup.find(id='2')
        bold = soup.b

        # Move the <b> tag to the end of the second paragraph.
        soup.find(id='2').append(soup.b)

        # The <b> tag is now a child of the second paragraph.
        self.assertEqual(bold.parent, second_para)

        self.assertEqual(
            soup.decode(), self.document_for(
                '<p id="1">Don\'t leave me .</p>\n'
                '<p id="2">Don\'t leave!<b>here</b></p>'))

    def test_replace_with_returns_thing_that_was_replaced(self):
        text = "<a></a><b><c></c></b>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.replace_with(soup.c)
        self.assertEqual(a, new_a)

    def test_unwrap_returns_thing_that_was_replaced(self):
        text = "<a><b></b><c></c></a>"
        soup = self.soup(text)
        a = soup.a
        new_a = a.unwrap()
        self.assertEqual(a, new_a)

    def test_replace_tag_with_itself(self):
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(text)
        c = soup.c
        soup.c.replace_with(c)
        self.assertEqual(soup.decode(), self.document_for(text))

    def test_replace_tag_with_its_parent_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.b.replace_with, soup.a)

    def test_insert_tag_into_itself_raises_exception(self):
        text = "<a><b></b></a>"
        soup = self.soup(text)
        self.assertRaises(ValueError, soup.a.insert, 0, soup.a)

    def test_replace_with_maintains_next_element_throughout(self):
        soup = self.soup('<p><a>one</a><b>three</b></p>')
        a = soup.a
        b = a.contents[0]
        # Make it so the <a> tag has two text children.
        a.insert(1, "two")

        # Now replace each one with the empty string.
        left, right = a.contents
        left.replaceWith('')
        right.replaceWith('')

        # The <b> tag is still connected to the tree.
        self.assertEqual("three", soup.b.string)

    def test_replace_final_node(self):
        soup = self.soup("<b>Argh!</b>")
        soup.find(text="Argh!").replace_with("Hooray!")
        new_text = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(new_text.previous_element, b)
        self.assertEqual(new_text.parent, b)
        self.assertEqual(new_text.previous_element.next_element, new_text)
        self.assertEqual(new_text.next_element, None)

    def test_consecutive_text_nodes(self):
        # A builder should never create two consecutive text nodes,
        # but if you insert one next to another, Beautiful Soup will
        # handle it correctly.
        soup = self.soup("<a><b>Argh!</b><c></c></a>")
        soup.b.insert(1, "Hooray!")

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Argh!Hooray!</b><c></c></a>"))

        new_text = soup.find(text="Hooray!")
        self.assertEqual(new_text.previous_element, "Argh!")
        self.assertEqual(new_text.previous_element.next_element, new_text)

        self.assertEqual(new_text.previous_sibling, "Argh!")
        self.assertEqual(new_text.previous_sibling.next_sibling, new_text)

        self.assertEqual(new_text.next_sibling, None)
        self.assertEqual(new_text.next_element, soup.c)

    def test_insert_string(self):
        soup = self.soup("<a></a>")
        soup.a.insert(0, "bar")
        soup.a.insert(0, "foo")
        # The strings were added to the tag.
        self.assertEqual(["foo", "bar"], soup.a.contents)
        # And they were converted to NavigableStrings.
        self.assertEqual(soup.a.contents[0].next_element, "bar")

    def test_insert_tag(self):
        builder = self.default_builder
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
        magic_tag.insert(0, "the")
        soup.a.insert(1, magic_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

        # Make sure all the relationships are hooked up correctly.
        b_tag = soup.b
        self.assertEqual(b_tag.next_sibling, magic_tag)
        self.assertEqual(magic_tag.previous_sibling, b_tag)

        find = b_tag.find(text="Find")
        self.assertEqual(find.next_element, magic_tag)
        self.assertEqual(magic_tag.previous_element, find)

        c_tag = soup.c
        self.assertEqual(magic_tag.next_sibling, c_tag)
        self.assertEqual(c_tag.previous_sibling, magic_tag)

        the = magic_tag.find(text="the")
        self.assertEqual(the.parent, magic_tag)
        self.assertEqual(the.next_element, c_tag)
        self.assertEqual(c_tag.previous_element, the)

    def test_append_child_thats_already_at_the_end(self):
        data = "<a><b></b></a>"
        soup = self.soup(data)
        soup.a.append(soup.b)
        self.assertEqual(data, soup.decode())

    def test_move_tag_to_beginning_of_parent(self):
        data = "<a><b></b><c></c><d></d></a>"
        soup = self.soup(data)
        soup.a.insert(0, soup.d)
        self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())

    def test_insert_works_on_empty_element_tag(self):
        # This is a little strange, since most HTML parsers don't allow
        # markup like this to come through. But in general, we don't
        # know what the parser would or wouldn't have allowed, so
        # I'm letting this succeed for now.
        soup = self.soup("<br/>")
        soup.br.insert(1, "Contents")
        self.assertEqual(str(soup.br), "<br>Contents</br>")

    def test_insert_before(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_before("BAZ")
        soup.a.insert_before("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))

        soup.a.insert_before(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        soup.b.insert_after("BAZ")
        soup.a.insert_after("QUUX")
        self.assertEqual(
            soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
        soup.b.insert_after(soup.a)
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_after, tag)
        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)

    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_before, tag)
        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)

    def test_replace_with(self):
        soup = self.soup(
            "<p>There's <b>no</b> business like <b>show</b> business</p>")
        no, show = soup.find_all('b')
        show.replace_with(no)
        self.assertEqual(
            soup.decode(),
            self.document_for(
                "<p>There's  business like <b>no</b> business</p>"))

        self.assertEqual(show.parent, None)
        self.assertEqual(no.parent, soup.p)
        self.assertEqual(no.next_element, "no")
        self.assertEqual(no.next_sibling, " business")

    def test_replace_first_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.b.replace_with(soup.c)
        self.assertEqual("<a><c></c></a>", soup.decode())

    def test_replace_last_child(self):
        data = "<a><b></b><c></c></a>"
        soup = self.soup(data)
        soup.c.replace_with(soup.b)
        self.assertEqual("<a><b></b></a>", soup.decode())

    def test_nested_tag_replace_with(self):
        soup = self.soup(
            """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")

        # Replace the entire <b> tag and its contents ("reserve the
        # right") with the <f> tag ("refuse").
        remove_tag = soup.b
        move_tag = soup.f
        remove_tag.replace_with(move_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))

        # The <b> tag is now an orphan.
        self.assertEqual(remove_tag.parent, None)
        self.assertEqual(remove_tag.find(text="right").next_element, None)
        self.assertEqual(remove_tag.previous_element, None)
        self.assertEqual(remove_tag.next_sibling, None)
        self.assertEqual(remove_tag.previous_sibling, None)

        # The <f> tag is now connected to the <a> tag.
        self.assertEqual(move_tag.parent, soup.a)
        self.assertEqual(move_tag.previous_element, "We")
        self.assertEqual(move_tag.next_element.next_element, soup.e)
        self.assertEqual(move_tag.next_sibling, None)

        # The gap where the <f> tag used to be has been mended, and
        # the word "to" is now connected to the <g> tag.
        to_text = soup.find(text="to")
        g_tag = soup.g
        self.assertEqual(to_text.next_element, g_tag)
        self.assertEqual(to_text.next_sibling, g_tag)
        self.assertEqual(g_tag.previous_element, to_text)
        self.assertEqual(g_tag.previous_sibling, to_text)

    def test_unwrap(self):
        tree = self.soup("""
            <p>Unneeded <em>formatting</em> is unneeded</p>
            """)
        tree.em.unwrap()
        self.assertEqual(tree.em, None)
        self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")

    def test_wrap(self):
        soup = self.soup("I wish I was bold.")
        value = soup.string.wrap(soup.new_tag("b"))
        self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

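    def test_wrap_returns_the_wrapper(self):
        # Minimal sketch, not part of the original suite: the value wrap()
        # returns is assumed to be the wrapper tag itself, so it can be
        # used for further navigation.
        soup = self.soup("I wish I was bold.")
        wrapper = soup.string.wrap(soup.new_tag("b"))
        self.assertEqual("b", wrapper.name)
        self.assertEqual("I wish I was bold.", wrapper.string)
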
    def test_wrap_extracts_tag_from_elsewhere(self):
        soup = self.soup("<b></b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(
            soup.decode(), self.document_for("<b>I wish I was bold.</b>"))

    def test_wrap_puts_new_contents_at_the_end(self):
        soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
        soup.b.next_sibling.wrap(soup.b)
        self.assertEqual(2, len(soup.b.contents))
        self.assertEqual(
            soup.decode(), self.document_for(
                "<b>I like being bold.I wish I was bold.</b>"))

    def test_extract(self):
        soup = self.soup(
            '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')

        self.assertEqual(len(soup.body.contents), 3)
        extracted = soup.find(id="nav").extract()

        self.assertEqual(
            soup.decode(), "<html><body>Some content.  More content.</body></html>")
        self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')

        # The extracted tag is now an orphan.
        self.assertEqual(len(soup.body.contents), 2)
        self.assertEqual(extracted.parent, None)
        self.assertEqual(extracted.previous_element, None)
        self.assertEqual(extracted.next_element.next_element, None)

        # The gap where the extracted tag used to be has been mended.
        content_1 = soup.find(text="Some content. ")
        content_2 = soup.find(text=" More content.")
        self.assertEqual(content_1.next_element, content_2)
        self.assertEqual(content_1.next_sibling, content_2)
        self.assertEqual(content_2.previous_element, content_1)
        self.assertEqual(content_2.previous_sibling, content_1)

    def test_extract_distinguishes_between_identical_strings(self):
        soup = self.soup("<a>foo</a><b>bar</b>")
        foo_1 = soup.a.string
        bar_1 = soup.b.string
        foo_2 = soup.new_string("foo")
        bar_2 = soup.new_string("bar")
        soup.a.append(foo_2)
        soup.b.append(bar_2)

        # Now there are two identical strings in the <a> tag, and two
        # in the <b> tag. Let's remove the first "foo" and the second
        # "bar".
        foo_1.extract()
        bar_2.extract()
        self.assertEqual(foo_2, soup.a.string)
        self.assertEqual(bar_2, soup.b.string)

    def test_clear(self):
        """Tag.clear()"""
        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
        # clear using extract()
        a = soup.a
        soup.p.clear()
        self.assertEqual(len(soup.p.contents), 0)
        self.assertTrue(hasattr(a, "contents"))

        # clear using decompose()
        em = a.em
        a.clear(decompose=True)
        self.assertEqual(0, len(em.contents))

    def test_string_set(self):
        """Tag.string = 'string'"""
        soup = self.soup("<a></a> <b><c></c></b>")
        soup.a.string = "foo"
        self.assertEqual(soup.a.contents, ["foo"])
        soup.b.string = "bar"
        self.assertEqual(soup.b.contents, ["bar"])

    def test_string_set_does_not_affect_original_string(self):
        soup = self.soup("<a><b>foo</b><c>bar</c>")
        soup.b.string = soup.c.string
        self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")

    def test_set_string_preserves_class_of_string(self):
        soup = self.soup("<a></a>")
        cdata = CData("foo")
        soup.a.string = cdata
        self.assertTrue(isinstance(soup.a.string, CData))


class TestElementObjects(SoupTest):
    """Test various features of element objects."""

    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        self.assertEqual(len(soup.top), 3)
        self.assertEqual(len(soup.top.contents), 3)

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        self.assertEqual(soup.b, soup.find('b'))
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.a, None)

    def test_deprecated_member_access(self):
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead.',
            str(w[0].message))

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Please note: has_attr() is different from the 'in' operator.
        has_attr() checks the tag's attributes, while 'in' checks the
        tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))

    def test_attributes_come_out_in_alphabetical_order(self):
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        # A tag that contains only a text node makes that node
        # available as .string.
        soup = self.soup("<b>foo</b>")
        self.assertEqual(soup.b.string, 'foo')

    def test_empty_tag_has_no_string(self):
        # A tag with no children has no .string.
        soup = self.soup("<b></b>")
        self.assertEqual(soup.b.string, None)

    def test_tag_with_multiple_children_has_no_string(self):
        # A tag with multiple children has no .string.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.b.string, None)

        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.b.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)

    def test_tag_with_recursive_string_has_string(self):
        # A tag with a single child which has a .string inherits that
        # .string.
        soup = self.soup("<a><b>foo</b></a>")
        self.assertEqual(soup.a.string, "foo")
        self.assertEqual(soup.string, "foo")

    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(soup.b.string)

        soup = self.soup("<b></b>")
        self.assertFalse(soup.b.string)

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b>   <r> t </r></a>")
        self.assertEqual(soup.a.text, "ar  t ")
        self.assertEqual(soup.a.get_text(strip=True), "art")
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

        self.assertEqual(
            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

    def test_all_strings_ignores_comments(self):
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))

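    def test_stripped_strings_skips_whitespace(self):
        # Illustrative sketch, not part of the original suite: this assumes
        # .stripped_strings, the whitespace-stripping counterpart of the
        # .strings generator used above.
        soup = self.soup("<a>a<b>r</b>   <r> t </r></a>")
        self.assertEqual(["a", "r", "t"], list(soup.a.stripped_strings))
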
class TestCDataListAttributes(SoupTest):
    """Testing cdata-list attributes like 'class'."""

    def test_single_value_becomes_list(self):
        soup = self.soup("<a class='foo'>")
        self.assertEqual(["foo"], soup.a['class'])

    def test_multiple_values_becomes_list(self):
        soup = self.soup("<a class='foo bar'>")
        self.assertEqual(["foo", "bar"], soup.a['class'])

    def test_multiple_values_separated_by_weird_whitespace(self):
        soup = self.soup("<a class='foo\tbar\nbaz'>")
        self.assertEqual(["foo", "bar", "baz"], soup.a['class'])

    def test_attributes_joined_into_string_on_output(self):
        soup = self.soup("<a class='foo\tbar'>")
        self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())

    def test_accept_charset(self):
        soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
        self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])

    def test_cdata_attribute_applying_only_to_one_tag(self):
        data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        soup = self.soup(data)
        # We saw in another test that accept-charset is a cdata-list
        # attribute for the <form> tag. But it's not a cdata-list
        # attribute for any other tag.
        self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])

    def test_string_has_immutable_name_property(self):
        string = self.soup("s").string
        self.assertEqual(None, string.name)

        def t():
            string.name = 'foo'
        self.assertRaises(AttributeError, t)

|  | 1252 | class TestPersistence(SoupTest): | 
|  | 1253 | "Testing features like pickle and deepcopy." | 
|  | 1254 |  | 
|  | 1255 | def setUp(self): | 
|  | 1256 | super(TestPersistence, self).setUp() | 
|  | 1257 | self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" | 
|  | 1258 | "http://www.w3.org/TR/REC-html40/transitional.dtd"> | 
|  | 1259 | <html> | 
|  | 1260 | <head> | 
|  | 1261 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> | 
|  | 1262 | <title>Beautiful Soup: We called him Tortoise because he taught us.</title> | 
|  | 1263 | <link rev="made" href="mailto:leonardr@segfault.org"> | 
|  | 1264 | <meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping."> | 
|  | 1265 | <meta name="generator" content="Markov Approximation 1.4 (module: leonardr)"> | 
|  | 1266 | <meta name="author" content="Leonard Richardson"> | 
|  | 1267 | </head> | 
|  | 1268 | <body> | 
|  | 1269 | <a href="foo">foo</a> | 
|  | 1270 | <a href="foo"><b>bar</b></a> | 
|  | 1271 | </body> | 
|  | 1272 | </html>""" | 
|  | 1273 | self.tree = self.soup(self.page) | 
|  | 1274 |  | 
|  | 1275 | def test_pickle_and_unpickle_identity(self): | 
|  | 1276 | # Pickling a tree, then unpickling it, yields a tree identical | 
|  | 1277 | # to the original. | 
|  | 1278 | dumped = pickle.dumps(self.tree, 2) | 
|  | 1279 | loaded = pickle.loads(dumped) | 
|  | 1280 | self.assertEqual(loaded.__class__, BeautifulSoup) | 
|  | 1281 | self.assertEqual(loaded.decode(), self.tree.decode()) | 
|  | 1282 |  | 
|  | 1283 | def test_deepcopy_identity(self): | 
|  | 1284 | # Making a deepcopy of a tree yields an identical tree. | 
|  | 1285 | copied = copy.deepcopy(self.tree) | 
|  | 1286 | self.assertEqual(copied.decode(), self.tree.decode()) | 
|  | 1287 |  | 
|  | 1288 | def test_unicode_pickle(self): | 
|  | 1289 | # A tree containing Unicode characters can be pickled. | 
|  | 1290 | html = u"<b>\N{SNOWMAN}</b>" | 
|  | 1291 | soup = self.soup(html) | 
|  | 1292 | dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL) | 
|  | 1293 | loaded = pickle.loads(dumped) | 
|  | 1294 | self.assertEqual(loaded.decode(), soup.decode()) | 
|  | 1295 |  | 
|  | 1296 |  | 
|  | 1297 | class TestSubstitutions(SoupTest): | 
|  | 1298 |  | 
|  | 1299 | def test_default_formatter_is_minimal(self): | 
|  | 1300 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | 
|  | 1301 | soup = self.soup(markup) | 
|  | 1302 | decoded = soup.decode(formatter="minimal") | 
|  | 1303 | # The < is converted back into &lt; but the e-with-acute is left alone. | 
|  | 1304 | self.assertEqual( | 
|  | 1305 | decoded, | 
|  | 1306 | self.document_for( | 
|  | 1307 | u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | 
|  | 1308 |  | 
|  | 1309 | def test_formatter_html(self): | 
|  | 1310 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | 
|  | 1311 | soup = self.soup(markup) | 
|  | 1312 | decoded = soup.decode(formatter="html") | 
|  | 1313 | self.assertEqual( | 
|  | 1314 | decoded, | 
|  | 1315 | self.document_for("<b><<Sacré bleu!>></b>")) | 
|  | 1316 |  | 
|  | 1317 | def test_formatter_minimal(self): | 
|  | 1318 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | 
|  | 1319 | soup = self.soup(markup) | 
|  | 1320 | decoded = soup.decode(formatter="minimal") | 
|  | 1321 | # The < is converted back into &lt; but the e-with-acute is left alone. | 
|  | 1322 | self.assertEqual( | 
|  | 1323 | decoded, | 
|  | 1324 | self.document_for( | 
|  | 1325 | u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | 
|  | 1326 |  | 
|  | 1327 | def test_formatter_null(self): | 
|  | 1328 | markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>" | 
|  | 1329 | soup = self.soup(markup) | 
|  | 1330 | decoded = soup.decode(formatter=None) | 
|  | 1331 | # Neither the angle brackets nor the e-with-acute are converted. | 
|  | 1332 | # This is not valid HTML, but it's what the user wanted. | 
|  | 1333 | self.assertEqual(decoded, | 
|  | 1334 | self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>")) | 
|  | 1335 |  | 
|  | 1336 | def test_formatter_custom(self): | 
|  | 1337 | markup = u"<b><foo></b><b>bar</b>" | 
|  | 1338 | soup = self.soup(markup) | 
|  | 1339 | decoded = soup.decode(formatter=lambda x: x.upper()) | 
|  | 1340 | # Instead of normal entity conversion code, the custom | 
|  | 1341 | # callable is called on every string. | 
|  | 1342 | self.assertEqual( | 
|  | 1343 | decoded, | 
|  | 1344 | self.document_for(u"<b><FOO></b><b>BAR</b>")) | 
|  | 1345 |  | 
|  | 1346 | def test_formatter_is_run_on_attribute_values(self): | 
|  | 1347 | markup = u'<a href="http://a.com?a=b&c=é">e</a>' | 
|  | 1348 | soup = self.soup(markup) | 
|  | 1349 | a = soup.a | 
|  | 1350 |  | 
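|  |  | # The "minimal" formatter escapes the bare ampersand in the attribute | 
|  |  | # value but leaves the non-ASCII character alone. | 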
|  | 1351 | expect_minimal = u'<a href="http://a.com?a=b&amp;c=é">e</a>' | 
|  | 1352 |  | 
|  | 1353 | self.assertEqual(expect_minimal, a.decode()) | 
|  | 1354 | self.assertEqual(expect_minimal, a.decode(formatter="minimal")) | 
|  | 1355 |  | 
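|  |  | # The "html" formatter also converts the non-ASCII character into a | 
|  |  | # named entity. | 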
|  | 1356 | expect_html = u'<a href="http://a.com?a=b&amp;c=&eacute;">e</a>' | 
|  | 1357 | self.assertEqual(expect_html, a.decode(formatter="html")) | 
|  | 1358 |  | 
|  | 1359 | self.assertEqual(markup, a.decode(formatter=None)) | 
|  | 1360 | expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>' | 
|  | 1361 | self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper())) | 
|  | 1362 |  | 
|  | 1363 | def test_formatter_skips_script_tag_for_html_documents(self): | 
|  | 1364 | doc = """ | 
|  | 1365 | <script type="text/javascript"> | 
|  | 1366 | console.log("< < hey > > "); | 
|  | 1367 | </script> | 
|  | 1368 | """ | 
|  | 1369 | encoded = BeautifulSoup(doc).encode() | 
|  | 1370 | self.assertTrue(b"< < hey > >" in encoded) | 
|  | 1371 |  | 
|  | 1372 | def test_formatter_skips_style_tag_for_html_documents(self): | 
|  | 1373 | doc = """ | 
|  | 1374 | <style type="text/css"> | 
|  | 1375 | console.log("< < hey > > "); | 
|  | 1376 | </style> | 
|  | 1377 | """ | 
|  | 1378 | encoded = BeautifulSoup(doc).encode() | 
|  | 1379 | self.assertTrue(b"< < hey > >" in encoded) | 
|  | 1380 |  | 
|  | 1381 | def test_prettify_leaves_preformatted_text_alone(self): | 
|  | 1382 | soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ") | 
|  | 1383 | # Everything outside the <pre> tag is reformatted, but everything | 
|  | 1384 | # inside is left alone. | 
|  | 1385 | self.assertEqual( | 
|  | 1386 | u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>', | 
|  | 1387 | soup.div.prettify()) | 
|  | 1388 |  | 
|  | 1389 | def test_prettify_accepts_formatter(self): | 
|  | 1390 | soup = BeautifulSoup("<html><body>foo</body></html>") | 
|  | 1391 | pretty = soup.prettify(formatter=lambda x: x.upper()) | 
|  | 1392 | self.assertTrue("FOO" in pretty) | 
|  | 1393 |  | 
|  | 1394 | def test_prettify_outputs_unicode_by_default(self): | 
|  | 1395 | soup = self.soup("<a></a>") | 
|  | 1396 | self.assertEqual(unicode, type(soup.prettify())) | 
|  | 1397 |  | 
|  | 1398 | def test_prettify_can_encode_data(self): | 
|  | 1399 | soup = self.soup("<a></a>") | 
|  | 1400 | self.assertEqual(bytes, type(soup.prettify("utf-8"))) | 
|  | 1401 |  | 
|  | 1402 | def test_html_entity_substitution_off_by_default(self): | 
|  | 1403 | markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>" | 
|  | 1404 | soup = self.soup(markup) | 
|  | 1405 | encoded = soup.b.encode("utf-8") | 
|  | 1406 | self.assertEqual(encoded, markup.encode('utf-8')) | 
|  | 1407 |  | 
|  | 1408 | def test_encoding_substitution(self): | 
|  | 1409 | # Here's the <meta> tag saying that a document is | 
|  | 1410 | # encoded in Shift-JIS. | 
|  | 1411 | meta_tag = ('<meta content="text/html; charset=x-sjis" ' | 
|  | 1412 | 'http-equiv="Content-type"/>') | 
|  | 1413 | soup = self.soup(meta_tag) | 
|  | 1414 |  | 
|  | 1415 | # Parse the document, and the charset appears unchanged. | 
|  | 1416 | self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis') | 
|  | 1417 |  | 
|  | 1418 | # Encode the document into some encoding, and the encoding is | 
|  | 1419 | # substituted into the meta tag. | 
|  | 1420 | utf_8 = soup.encode("utf-8") | 
|  | 1421 | self.assertTrue(b"charset=utf-8" in utf_8) | 
|  | 1422 |  | 
|  | 1423 | euc_jp = soup.encode("euc_jp") | 
|  | 1424 | self.assertTrue(b"charset=euc_jp" in euc_jp) | 
|  | 1425 |  | 
|  | 1426 | shift_jis = soup.encode("shift-jis") | 
|  | 1427 | self.assertTrue(b"charset=shift-jis" in shift_jis) | 
|  | 1428 |  | 
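|  |  | # The UTF-16 output is decoded back to text first, since the ASCII | 
|  |  | # substring can't be found directly in UTF-16-encoded bytes. | 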
|  | 1429 | utf_16_u = soup.encode("utf-16").decode("utf-16") | 
|  | 1430 | self.assertTrue("charset=utf-16" in utf_16_u) | 
|  | 1431 |  | 
|  | 1432 | def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self): | 
|  | 1433 | markup = ('<head><meta content="text/html; charset=x-sjis" ' | 
|  | 1434 | 'http-equiv="Content-type"/></head><pre>foo</pre>') | 
|  | 1435 |  | 
|  | 1436 | # Beautiful Soup used to try to rewrite the meta tag even if the | 
|  | 1437 | # meta tag got filtered out by the strainer. This test makes | 
|  | 1438 | # sure that doesn't happen. | 
|  | 1439 | strainer = SoupStrainer('pre') | 
|  | 1440 | soup = self.soup(markup, parse_only=strainer) | 
|  | 1441 | self.assertEqual(soup.contents[0].name, 'pre') | 
|  | 1442 |  | 
|  | 1443 | class TestEncoding(SoupTest): | 
|  | 1444 | """Test the ability to encode objects into strings.""" | 
|  | 1445 |  | 
|  | 1446 | def test_unicode_string_can_be_encoded(self): | 
|  | 1447 | html = u"<b>\N{SNOWMAN}</b>" | 
|  | 1448 | soup = self.soup(html) | 
|  | 1449 | self.assertEqual(soup.b.string.encode("utf-8"), | 
|  | 1450 | u"\N{SNOWMAN}".encode("utf-8")) | 
|  | 1451 |  | 
|  | 1452 | def test_tag_containing_unicode_string_can_be_encoded(self): | 
|  | 1453 | html = u"<b>\N{SNOWMAN}</b>" | 
|  | 1454 | soup = self.soup(html) | 
|  | 1455 | self.assertEqual( | 
|  | 1456 | soup.b.encode("utf-8"), html.encode("utf-8")) | 
|  | 1457 |  | 
|  | 1458 | def test_encoding_substitutes_unrecognized_characters_by_default(self): | 
|  | 1459 | html = u"<b>\N{SNOWMAN}</b>" | 
|  | 1460 | soup = self.soup(html) | 
|  | 1461 | self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>") | 
|  | 1462 |  | 
|  | 1463 | def test_encoding_can_be_made_strict(self): | 
|  | 1464 | html = u"<b>\N{SNOWMAN}</b>" | 
|  | 1465 | soup = self.soup(html) | 
|  | 1466 | self.assertRaises( | 
|  | 1467 | UnicodeEncodeError, soup.encode, "ascii", errors="strict") | 
|  | 1468 |  | 
|  | 1469 | def test_decode_contents(self): | 
|  | 1470 | html = u"<b>\N{SNOWMAN}</b>" | 
|  | 1471 | soup = self.soup(html) | 
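|  |  | # decode_contents() renders only the tag's children, without the | 
|  |  | # enclosing <b> tag itself. | 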
|  | 1472 | self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents()) | 
|  | 1473 |  | 
|  | 1474 | def test_encode_contents(self): | 
|  | 1475 | html = u"<b>\N{SNOWMAN}</b>" | 
|  | 1476 | soup = self.soup(html) | 
|  | 1477 | self.assertEqual( | 
|  | 1478 | u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents( | 
|  | 1479 | encoding="utf8")) | 
|  | 1480 |  | 
|  | 1481 | def test_deprecated_renderContents(self): | 
|  | 1482 | html = u"<b>\N{SNOWMAN}</b>" | 
|  | 1483 | soup = self.soup(html) | 
|  | 1484 | self.assertEqual( | 
|  | 1485 | u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents()) | 
|  | 1486 |  | 
|  | 1487 | class TestNavigableStringSubclasses(SoupTest): | 
|  | 1488 |  | 
|  | 1489 | def test_cdata(self): | 
|  | 1490 | # None of the current builders turn CDATA sections into CData | 
|  | 1491 | # objects, but you can create them manually. | 
|  | 1492 | soup = self.soup("") | 
|  | 1493 | cdata = CData("foo") | 
|  | 1494 | soup.insert(1, cdata) | 
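|  |  | # The CData node renders wrapped in <![CDATA[...]]>, but searching and | 
|  |  | # .contents still see the plain string "foo". | 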
|  | 1495 | self.assertEqual(str(soup), "<![CDATA[foo]]>") | 
|  | 1496 | self.assertEqual(soup.find(text="foo"), "foo") | 
|  | 1497 | self.assertEqual(soup.contents[0], "foo") | 
|  | 1498 |  | 
|  | 1499 | def test_cdata_is_never_formatted(self): | 
|  | 1500 | """Text inside a CData object is passed into the formatter. | 
|  | 1501 |  | 
|  | 1502 | But the return value is ignored. | 
|  | 1503 | """ | 
|  | 1504 |  | 
|  | 1505 | self.count = 0 | 
|  | 1506 | def increment(*args): | 
|  | 1507 | self.count += 1 | 
|  | 1508 | return "BITTER FAILURE" | 
|  | 1509 |  | 
|  | 1510 | soup = self.soup("") | 
|  | 1511 | cdata = CData("<><><>") | 
|  | 1512 | soup.insert(1, cdata) | 
|  | 1513 | self.assertEqual( | 
|  | 1514 | b"<![CDATA[<><><>]]>", soup.encode(formatter=increment)) | 
|  | 1515 | self.assertEqual(1, self.count) | 
|  | 1516 |  | 
|  | 1517 | def test_doctype_ends_in_newline(self): | 
|  | 1518 | # Unlike other NavigableString subclasses, a DOCTYPE always ends | 
|  | 1519 | # in a newline. | 
|  | 1520 | doctype = Doctype("foo") | 
|  | 1521 | soup = self.soup("") | 
|  | 1522 | soup.insert(1, doctype) | 
|  | 1523 | self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n") | 
|  | 1524 |  | 
|  | 1525 |  | 
|  | 1526 | class TestSoupSelector(TreeTest): | 
|  | 1527 |  | 
|  | 1528 | HTML = """ | 
|  | 1529 | <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" | 
|  | 1530 | "http://www.w3.org/TR/html4/strict.dtd"> | 
|  | 1531 | <html> | 
|  | 1532 | <head> | 
|  | 1533 | <title>The title</title> | 
|  | 1534 | <link rel="stylesheet" href="blah.css" type="text/css" id="l1"> | 
|  | 1535 | </head> | 
|  | 1536 | <body> | 
|  | 1537 |  | 
|  | 1538 | <div id="main" class="fancy"> | 
|  | 1539 | <div id="inner"> | 
|  | 1540 | <h1 id="header1">An H1</h1> | 
|  | 1541 | <p>Some text</p> | 
|  | 1542 | <p class="onep" id="p1">Some more text</p> | 
|  | 1543 | <h2 id="header2">An H2</h2> | 
|  | 1544 | <p class="class1 class2 class3" id="pmulti">Another</p> | 
|  | 1545 | <a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a> | 
|  | 1546 | <h2 id="header3">Another H2</h2> | 
|  | 1547 | <a id="me" href="http://simonwillison.net/" rel="me">me</a> | 
|  | 1548 | <span class="s1"> | 
|  | 1549 | <a href="#" id="s1a1">span1a1</a> | 
|  | 1550 | <a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a> | 
|  | 1551 | <span class="span2"> | 
|  | 1552 | <a href="#" id="s2a1">span2a1</a> | 
|  | 1553 | </span> | 
|  | 1554 | <span class="span3"></span> | 
|  | 1555 | </span> | 
|  | 1556 | </div> | 
|  | 1557 | <p lang="en" id="lang-en">English</p> | 
|  | 1558 | <p lang="en-gb" id="lang-en-gb">English UK</p> | 
|  | 1559 | <p lang="en-us" id="lang-en-us">English US</p> | 
|  | 1560 | <p lang="fr" id="lang-fr">French</p> | 
|  | 1561 | </div> | 
|  | 1562 |  | 
|  | 1563 | <div id="footer"> | 
|  | 1564 | </div> | 
|  | 1565 | """ | 
|  | 1566 |  | 
|  | 1567 | def setUp(self): | 
|  | 1568 | self.soup = BeautifulSoup(self.HTML) | 
|  | 1569 |  | 
|  | 1570 | def assertSelects(self, selector, expected_ids): | 
|  | 1571 | el_ids = [el['id'] for el in self.soup.select(selector)] | 
|  | 1572 | el_ids.sort() | 
|  | 1573 | expected_ids.sort() | 
|  | 1574 | self.assertEqual(expected_ids, el_ids, | 
|  | 1575 | "Selector %s, expected [%s], got [%s]" % ( | 
|  | 1576 | selector, ', '.join(expected_ids), ', '.join(el_ids) | 
|  | 1577 | ) | 
|  | 1578 | ) | 
|  | 1579 |  | 
|  | 1580 | assertSelect = assertSelects | 
|  | 1581 |  | 
|  | 1582 | def assertSelectMultiple(self, *tests): | 
|  | 1583 | for selector, expected_ids in tests: | 
|  | 1584 | self.assertSelect(selector, expected_ids) | 
|  | 1585 |  | 
|  | 1586 | def test_one_tag_one(self): | 
|  | 1587 | els = self.soup.select('title') | 
|  | 1588 | self.assertEqual(len(els), 1) | 
|  | 1589 | self.assertEqual(els[0].name, 'title') | 
|  | 1590 | self.assertEqual(els[0].contents, [u'The title']) | 
|  | 1591 |  | 
|  | 1592 | def test_one_tag_many(self): | 
|  | 1593 | els = self.soup.select('div') | 
|  | 1594 | self.assertEqual(len(els), 3) | 
|  | 1595 | for div in els: | 
|  | 1596 | self.assertEqual(div.name, 'div') | 
|  | 1597 |  | 
|  | 1598 | def test_tag_in_tag_one(self): | 
|  | 1600 | self.assertSelects('div div', ['inner']) | 
|  | 1601 |  | 
|  | 1602 | def test_tag_in_tag_many(self): | 
|  | 1603 | for selector in ('html div', 'html body div', 'body div'): | 
|  | 1604 | self.assertSelects(selector, ['main', 'inner', 'footer']) | 
|  | 1605 |  | 
|  | 1606 | def test_tag_no_match(self): | 
|  | 1607 | self.assertEqual(len(self.soup.select('del')), 0) | 
|  | 1608 |  | 
|  | 1609 | def test_invalid_tag(self): | 
|  | 1610 | self.assertRaises(ValueError, self.soup.select, 'tag%t') | 
|  | 1611 |  | 
|  | 1612 | def test_header_tags(self): | 
|  | 1613 | self.assertSelectMultiple( | 
|  | 1614 | ('h1', ['header1']), | 
|  | 1615 | ('h2', ['header2', 'header3']), | 
|  | 1616 | ) | 
|  | 1617 |  | 
|  | 1618 | def test_class_one(self): | 
|  | 1619 | for selector in ('.onep', 'p.onep', 'html p.onep'): | 
|  | 1620 | els = self.soup.select(selector) | 
|  | 1621 | self.assertEqual(len(els), 1) | 
|  | 1622 | self.assertEqual(els[0].name, 'p') | 
|  | 1623 | self.assertEqual(els[0]['class'], ['onep']) | 
|  | 1624 |  | 
|  | 1625 | def test_class_mismatched_tag(self): | 
|  | 1626 | els = self.soup.select('div.onep') | 
|  | 1627 | self.assertEqual(len(els), 0) | 
|  | 1628 |  | 
|  | 1629 | def test_one_id(self): | 
|  | 1630 | for selector in ('div#inner', '#inner', 'div div#inner'): | 
|  | 1631 | self.assertSelects(selector, ['inner']) | 
|  | 1632 |  | 
|  | 1633 | def test_bad_id(self): | 
|  | 1634 | els = self.soup.select('#doesnotexist') | 
|  | 1635 | self.assertEqual(len(els), 0) | 
|  | 1636 |  | 
|  | 1637 | def test_items_in_id(self): | 
|  | 1638 | els = self.soup.select('div#inner p') | 
|  | 1639 | self.assertEqual(len(els), 3) | 
|  | 1640 | for el in els: | 
|  | 1641 | self.assertEqual(el.name, 'p') | 
|  | 1642 | self.assertEqual(els[1]['class'], ['onep']) | 
|  | 1643 | self.assertFalse(els[0].has_attr('class')) | 
|  | 1644 |  | 
|  | 1645 | def test_a_bunch_of_emptys(self): | 
|  | 1646 | for selector in ('div#main del', 'div#main div.oops', 'div div#main'): | 
|  | 1647 | self.assertEqual(len(self.soup.select(selector)), 0) | 
|  | 1648 |  | 
|  | 1649 | def test_multi_class_support(self): | 
|  | 1650 | for selector in ('.class1', 'p.class1', '.class2', 'p.class2', | 
|  | 1651 | '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'): | 
|  | 1652 | self.assertSelects(selector, ['pmulti']) | 
|  | 1653 |  | 
|  | 1654 | def test_multi_class_selection(self): | 
|  | 1655 | for selector in ('.class1.class3', '.class3.class2', | 
|  | 1656 | '.class1.class2.class3'): | 
|  | 1657 | self.assertSelects(selector, ['pmulti']) | 
|  | 1658 |  | 
|  | 1659 | def test_child_selector(self): | 
|  | 1660 | self.assertSelects('.s1 > a', ['s1a1', 's1a2']) | 
|  | 1661 | self.assertSelects('.s1 > a span', ['s1a2s1']) | 
|  | 1662 |  | 
|  | 1663 | def test_child_selector_id(self): | 
|  | 1664 | self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1']) | 
|  | 1665 |  | 
|  | 1666 | def test_attribute_equals(self): | 
|  | 1667 | self.assertSelectMultiple( | 
|  | 1668 | ('p[class="onep"]', ['p1']), | 
|  | 1669 | ('p[id="p1"]', ['p1']), | 
|  | 1670 | ('[class="onep"]', ['p1']), | 
|  | 1671 | ('[id="p1"]', ['p1']), | 
|  | 1672 | ('link[rel="stylesheet"]', ['l1']), | 
|  | 1673 | ('link[type="text/css"]', ['l1']), | 
|  | 1674 | ('link[href="blah.css"]', ['l1']), | 
|  | 1675 | ('link[href="no-blah.css"]', []), | 
|  | 1676 | ('[rel="stylesheet"]', ['l1']), | 
|  | 1677 | ('[type="text/css"]', ['l1']), | 
|  | 1678 | ('[href="blah.css"]', ['l1']), | 
|  | 1679 | ('[href="no-blah.css"]', []), | 
|  | 1680 | ('p[href="no-blah.css"]', []), | 
|  | 1681 | ('[href="no-blah.css"]', []), | 
|  | 1682 | ) | 
|  | 1683 |  | 
|  | 1684 | def test_attribute_tilde(self): | 
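|  |  | # [attr~="value"] matches when value is one of the whitespace-separated | 
|  |  | # words in the attribute, e.g. rel="friend met". | 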
|  | 1685 | self.assertSelectMultiple( | 
|  | 1686 | ('p[class~="class1"]', ['pmulti']), | 
|  | 1687 | ('p[class~="class2"]', ['pmulti']), | 
|  | 1688 | ('p[class~="class3"]', ['pmulti']), | 
|  | 1689 | ('[class~="class1"]', ['pmulti']), | 
|  | 1690 | ('[class~="class2"]', ['pmulti']), | 
|  | 1691 | ('[class~="class3"]', ['pmulti']), | 
|  | 1692 | ('a[rel~="friend"]', ['bob']), | 
|  | 1693 | ('a[rel~="met"]', ['bob']), | 
|  | 1694 | ('[rel~="friend"]', ['bob']), | 
|  | 1695 | ('[rel~="met"]', ['bob']), | 
|  | 1696 | ) | 
|  | 1697 |  | 
|  | 1698 | def test_attribute_startswith(self): | 
|  | 1699 | self.assertSelectMultiple( | 
|  | 1700 | ('[rel^="style"]', ['l1']), | 
|  | 1701 | ('link[rel^="style"]', ['l1']), | 
|  | 1702 | ('notlink[rel^="notstyle"]', []), | 
|  | 1703 | ('[rel^="notstyle"]', []), | 
|  | 1704 | ('link[rel^="notstyle"]', []), | 
|  | 1705 | ('link[href^="bla"]', ['l1']), | 
|  | 1706 | ('a[href^="http://"]', ['bob', 'me']), | 
|  | 1707 | ('[href^="http://"]', ['bob', 'me']), | 
|  | 1708 | ('[id^="p"]', ['pmulti', 'p1']), | 
|  | 1709 | ('[id^="m"]', ['me', 'main']), | 
|  | 1710 | ('div[id^="m"]', ['main']), | 
|  | 1711 | ('a[id^="m"]', ['me']), | 
|  | 1712 | ) | 
|  | 1713 |  | 
|  | 1714 | def test_attribute_endswith(self): | 
|  | 1715 | self.assertSelectMultiple( | 
|  | 1716 | ('[href$=".css"]', ['l1']), | 
|  | 1717 | ('link[href$=".css"]', ['l1']), | 
|  | 1718 | ('link[id$="1"]', ['l1']), | 
|  | 1719 | ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']), | 
|  | 1720 | ('div[id$="1"]', []), | 
|  | 1721 | ('[id$="noending"]', []), | 
|  | 1722 | ) | 
|  | 1723 |  | 
|  | 1724 | def test_attribute_contains(self): | 
|  | 1725 | self.assertSelectMultiple( | 
|  | 1726 | # From test_attribute_startswith | 
|  | 1727 | ('[rel*="style"]', ['l1']), | 
|  | 1728 | ('link[rel*="style"]', ['l1']), | 
|  | 1729 | ('notlink[rel*="notstyle"]', []), | 
|  | 1730 | ('[rel*="notstyle"]', []), | 
|  | 1731 | ('link[rel*="notstyle"]', []), | 
|  | 1732 | ('link[href*="bla"]', ['l1']), | 
|  | 1733 | ('a[href*="http://"]', ['bob', 'me']), | 
|  | 1734 | ('[href*="http://"]', ['bob', 'me']), | 
|  | 1735 | ('[id*="p"]', ['pmulti', 'p1']), | 
|  | 1736 | ('div[id*="m"]', ['main']), | 
|  | 1737 | ('a[id*="m"]', ['me']), | 
|  | 1738 | # From test_attribute_endswith | 
|  | 1739 | ('[href*=".css"]', ['l1']), | 
|  | 1740 | ('link[href*=".css"]', ['l1']), | 
|  | 1741 | ('link[id*="1"]', ['l1']), | 
|  | 1742 | ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']), | 
|  | 1743 | ('div[id*="1"]', []), | 
|  | 1744 | ('[id*="noending"]', []), | 
|  | 1745 | # New for this test | 
|  | 1746 | ('[href*="."]', ['bob', 'me', 'l1']), | 
|  | 1747 | ('a[href*="."]', ['bob', 'me']), | 
|  | 1748 | ('link[href*="."]', ['l1']), | 
|  | 1749 | ('div[id*="n"]', ['main', 'inner']), | 
|  | 1750 | ('div[id*="nn"]', ['inner']), | 
|  | 1751 | ) | 
|  | 1752 |  | 
|  | 1753 | def test_attribute_exact_or_hyphen(self): | 
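|  |  | # [attr|="value"] matches the exact value or a value beginning with | 
|  |  | # "value-", as with lang="en" versus lang="en-gb". | 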
|  | 1754 | self.assertSelectMultiple( | 
|  | 1755 | ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), | 
|  | 1756 | ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']), | 
|  | 1757 | ('p[lang|="fr"]', ['lang-fr']), | 
|  | 1758 | ('p[lang|="gb"]', []), | 
|  | 1759 | ) | 
|  | 1760 |  | 
|  | 1761 | def test_attribute_exists(self): | 
|  | 1762 | self.assertSelectMultiple( | 
|  | 1763 | ('[rel]', ['l1', 'bob', 'me']), | 
|  | 1764 | ('link[rel]', ['l1']), | 
|  | 1765 | ('a[rel]', ['bob', 'me']), | 
|  | 1766 | ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']), | 
|  | 1767 | ('p[class]', ['p1', 'pmulti']), | 
|  | 1768 | ('[blah]', []), | 
|  | 1769 | ('p[blah]', []), | 
|  | 1770 | ) | 
|  | 1771 |  | 
|  | 1772 | def test_nth_of_type(self): | 
|  | 1773 | # Try to select first paragraph | 
|  | 1774 | els = self.soup.select('div#inner p:nth-of-type(1)') | 
|  | 1775 | self.assertEqual(len(els), 1) | 
|  | 1776 | self.assertEqual(els[0].string, u'Some text') | 
|  | 1777 |  | 
|  | 1778 | # Try to select third paragraph | 
|  | 1779 | els = self.soup.select('div#inner p:nth-of-type(3)') | 
|  | 1780 | self.assertEqual(len(els), 1) | 
|  | 1781 | self.assertEqual(els[0].string, u'Another') | 
|  | 1782 |  | 
|  | 1783 | # Try to select (non-existent!) fourth paragraph | 
|  | 1784 | els = self.soup.select('div#inner p:nth-of-type(4)') | 
|  | 1785 | self.assertEqual(len(els), 0) | 
|  | 1786 |  | 
|  | 1787 | # Pass in an invalid value: nth-of-type indices are 1-based, so 0 is rejected. | 
|  | 1788 | self.assertRaises( | 
|  | 1789 | ValueError, self.soup.select, 'div p:nth-of-type(0)') | 
|  | 1790 |  | 
|  | 1791 | def test_nth_of_type_direct_descendant(self): | 
|  | 1792 | els = self.soup.select('div#inner > p:nth-of-type(1)') | 
|  | 1793 | self.assertEqual(len(els), 1) | 
|  | 1794 | self.assertEqual(els[0].string, u'Some text') | 
|  | 1795 |  | 
|  | 1796 | def test_id_child_selector_nth_of_type(self): | 
|  | 1797 | self.assertSelects('#inner > p:nth-of-type(2)', ['p1']) | 
|  | 1798 |  | 
|  | 1799 | def test_select_on_element(self): | 
|  | 1800 | # Other tests operate on the tree; this operates on an element | 
|  | 1801 | # within the tree. | 
|  | 1802 | inner = self.soup.find("div", id="main") | 
|  | 1803 | selected = inner.select("div") | 
|  | 1804 | # The <div id="inner"> tag was selected. The <div id="footer"> | 
|  | 1805 | # tag was not. | 
|  | 1806 | self.assertSelectsIDs(selected, ['inner']) | 
|  | 1807 |  | 
|  | 1808 | def test_overspecified_child_id(self): | 
|  | 1809 | self.assertSelects(".fancy #inner", ['inner']) | 
|  | 1810 | self.assertSelects(".normal #inner", []) | 
|  | 1811 |  | 
|  | 1812 | def test_adjacent_sibling_selector(self): | 
|  | 1813 | self.assertSelects('#p1 + h2', ['header2']) | 
|  | 1814 | self.assertSelects('#p1 + h2 + p', ['pmulti']) | 
|  | 1815 | self.assertSelects('#p1 + #header2 + .class1', ['pmulti']) | 
|  | 1816 | self.assertEqual([], self.soup.select('#p1 + p')) | 
|  | 1817 |  | 
|  | 1818 | def test_general_sibling_selector(self): | 
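|  |  | # "~" matches every following sibling, not just the immediately | 
|  |  | # adjacent one that "+" selects. | 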
|  | 1819 | self.assertSelects('#p1 ~ h2', ['header2', 'header3']) | 
|  | 1820 | self.assertSelects('#p1 ~ #header2', ['header2']) | 
|  | 1821 | self.assertSelects('#p1 ~ h2 + a', ['me']) | 
|  | 1822 | self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me']) | 
|  | 1823 | self.assertEqual([], self.soup.select('#inner ~ h2')) | 
|  | 1824 |  | 
|  | 1825 | def test_dangling_combinator(self): | 
|  | 1826 | self.assertRaises(ValueError, self.soup.select, 'h1 >') | 
|  | 1827 |  | 
|  | 1828 | def test_sibling_combinator_wont_select_same_tag_twice(self): | 
|  | 1829 | self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr']) |