blob: 8e5c66426e8cc980bd52c323cb028393ff6db5c0 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001# -*- coding: utf-8 -*-
2"""Tests for Beautiful Soup's tree traversal methods.
3
4The tree traversal methods are the main advantage of using Beautiful
5Soup over just using a parser.
6
7Different parsers will build different Beautiful Soup trees given the
8same markup, but all Beautiful Soup trees can be traversed with the
9methods tested here.
10"""
11
12import copy
13import pickle
14import re
15import warnings
16from bs4 import BeautifulSoup
Andrew Geissler82c905d2020-04-13 13:39:40 -050017from bs4.builder import builder_registry
Patrick Williamsc124f4f2015-09-15 14:41:29 -050018from bs4.element import (
Patrick Williamsc0f7c042017-02-23 20:41:17 -060019 PY3K,
Patrick Williamsc124f4f2015-09-15 14:41:29 -050020 CData,
21 Comment,
Patrick Williamsc0f7c042017-02-23 20:41:17 -060022 Declaration,
Patrick Williamsc124f4f2015-09-15 14:41:29 -050023 Doctype,
24 NavigableString,
25 SoupStrainer,
26 Tag,
27)
Andrew Geissler82c905d2020-04-13 13:39:40 -050028from bs4.testing import SoupTest
Patrick Williamsc124f4f2015-09-15 14:41:29 -050029
# Flags recording whether the builder registry has a builder for the "xml"
# and "lxml" features; a lookup result of None is taken to mean "absent".
XML_BUILDER_PRESENT = builder_registry.lookup("xml") is not None
LXML_PRESENT = builder_registry.lookup("lxml") is not None
32
class TreeTest(SoupTest):
    """Base class providing assertion helpers for tree-traversal tests."""

    def assertSelects(self, tags, should_match):
        """Assert that the selected tags carry the expected strings.

        Meant for tests that define a bunch of tags, each containing a
        single string, and then select some of them by some mechanism.

        :param tags: iterable of selected tags.
        :param should_match: expected ``.string`` values, in order.
        """
        selected_strings = [element.string for element in tags]
        self.assertEqual(selected_strings, should_match)

    def assertSelectsIDs(self, tags, should_match):
        """Assert that the selected tags carry the expected ``id`` values.

        Meant for tests that define a bunch of tags, each with an ``id``
        attribute, and then select some of them by some mechanism.

        :param tags: iterable of selected tags.
        :param should_match: expected ``id`` attribute values, in order.
        """
        selected_ids = [element['id'] for element in tags]
        self.assertEqual(selected_ids, should_match)
52
53
class TestFind(TreeTest):
    """Basic tests of the find() method.

    find() just calls find_all() with limit=1, so it is not tested all
    that thoroughly here.
    """

    def test_find_tag(self):
        """find() returns the first tag with the requested name."""
        document = self.soup("<a>1</a><b>2</b><a>3</a><b>4</b>")
        first_b = document.find("b")
        self.assertEqual(first_b.string, "2")

    def test_unicode_text_find(self):
        """A non-ASCII string can be found by its text."""
        document = self.soup('<h1>Räksmörgås</h1>')
        found = document.find(string='Räksmörgås')
        self.assertEqual(found, 'Räksmörgås')

    def test_unicode_attribute_find(self):
        """A tag can be found by a non-ASCII attribute value."""
        document = self.soup('<h1 id="Räksmörgås">here it is</h1>')
        # Render the document; the result is unused (presumably a smoke
        # check that stringifying non-ASCII markup does not raise).
        str(document)
        heading = document.find(id='Räksmörgås')
        self.assertEqual("here it is", heading.text)

    def test_find_everything(self):
        """Test an optimization that finds all tags."""
        document = self.soup("<a>foo</a><b>bar</b>")
        every_tag = document.find_all()
        self.assertEqual(2, len(every_tag))

    def test_find_everything_with_name(self):
        """Test an optimization that finds all tags with a given name."""
        document = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
        every_a = document.find_all('a')
        self.assertEqual(2, len(every_a))
84
class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""

    def test_find_all_text_nodes(self):
        """You can search the tree for text nodes."""
        document = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        every_string = ["Foo", "bar", '\xbb']

        # An exact match, via either keyword spelling.
        self.assertEqual(document.find_all(string="bar"), ["bar"])
        self.assertEqual(document.find_all(text="bar"), ["bar"])

        # A list matches any of its members.
        self.assertEqual(
            document.find_all(text=["Foo", "bar"]), ["Foo", "bar"])

        # A regular expression.
        any_text = re.compile('.*')
        self.assertEqual(document.find_all(text=any_text), every_string)

        # True matches anything.
        self.assertEqual(document.find_all(text=True), every_string)

    def test_find_all_limit(self):
        """You can limit the number of items returned by find_all."""
        document = self.soup("<a>1</a><a>2</a><a>3</a><a>4</a><a>5</a>")
        everything = ["1", "2", "3", "4", "5"]

        self.assertSelects(document.find_all('a', limit=3), ["1", "2", "3"])
        self.assertSelects(document.find_all('a', limit=1), ["1"])
        self.assertSelects(document.find_all('a', limit=10), everything)

        # A limit of 0 means no limit.
        self.assertSelects(document.find_all('a', limit=0), everything)

    def test_calling_a_tag_is_calling_findall(self):
        """Calling a soup or tag object behaves like calling find_all()."""
        document = self.soup("<a>1</a><b>2<a id='foo'>3</a></b>")
        self.assertSelects(document('a', limit=1), ["1"])
        self.assertSelects(document.b(id="foo"), ["3"])

    def test_find_all_with_self_referential_data_structure_does_not_cause_infinite_recursion(self):
        """Searching with a list that contains itself terminates."""
        document = self.soup("<a></a>")

        # Build a list whose only member is the list itself.
        cyclic = []
        cyclic.append(cyclic)

        # Without special code in _normalize_search_value, this would
        # cause infinite recursion.
        self.assertEqual([], document.find_all(cyclic))

    def test_find_all_resultset(self):
        """All find_all calls return a ResultSet"""
        document = self.soup("<a></a>")
        results = (
            document.find_all("a"),
            document.find_all(True),
            document.find_all(text="foo"),
        )
        # Each result is expected to expose a "source" attribute.
        for result in results:
            self.assertTrue(hasattr(result, "source"))
142
143
class TestFindAllBasicNamespaces(TreeTest):
    """Searching by namespace-prefixed tag and attribute names."""

    def test_find_by_namespaced_name(self):
        """Prefixed names are matched as plain strings, colon included."""
        markup = '<mathml:msqrt>4</mathml:msqrt><a svg:fill="red">'
        document = self.soup(markup)

        square_root = document.find("mathml:msqrt")
        self.assertEqual("4", square_root.string)

        filled = document.find(attrs={"svg:fill": "red"})
        self.assertEqual("a", filled.name)
150
151
class TestFindAllByName(TreeTest):
    """Test ways of finding tags by tag name."""

    def setUp(self):
        """Parse a small document with two top-level and one nested <a>."""
        # Name this class in super() so that every setUp() above it in the
        # MRO runs; naming TreeTest here would start the lookup after it.
        super(TestFindAllByName, self).setUp()
        self.tree = self.soup("""<a>First tag.</a>
                                 <b>Second tag.</b>
                                 <c>Third <a>Nested tag.</a> tag.</c>""")

    def test_find_all_by_tag_name(self):
        """A plain string selects every tag with that name."""
        self.assertSelects(
            self.tree.find_all('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_name_and_text(self):
        """A tag name can be combined with a text filter."""
        self.assertSelects(
            self.tree.find_all('a', text='First tag.'), ['First tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=True), ['First tag.', 'Nested tag.'])

        self.assertSelects(
            self.tree.find_all('a', text=re.compile("tag")),
            ['First tag.', 'Nested tag.'])

    def test_find_all_on_non_root_element(self):
        """You can call find_all on any node, not just the root."""
        self.assertSelects(self.tree.c.find_all('a'), ['Nested tag.'])

    def test_calling_element_invokes_find_all(self):
        """Calling the tree object is shorthand for find_all()."""
        self.assertSelects(self.tree('a'), ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_strainer(self):
        """A SoupStrainer can be used as the search criterion."""
        self.assertSelects(
            self.tree.find_all(SoupStrainer('a')),
            ['First tag.', 'Nested tag.'])

    def test_find_all_by_tag_names(self):
        """A list of names matches tags having any of those names."""
        self.assertSelects(
            self.tree.find_all(['a', 'b']),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_dict(self):
        """A dict of names matches tags having any of its keys as name."""
        self.assertSelects(
            self.tree.find_all({'a': True, 'b': True}),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_by_tag_re(self):
        """A compiled regular expression is matched against tag names."""
        self.assertSelects(
            self.tree.find_all(re.compile('^[ab]$')),
            ['First tag.', 'Second tag.', 'Nested tag.'])

    def test_find_all_with_tags_matching_method(self):
        """A callable can decide whether each tag matches the search."""
        def id_matches_name(tag):
            return tag.name == tag.get('id')

        tree = self.soup("""<a id="a">Match 1.</a>
                            <a id="1">Does not match.</a>
                            <b id="b">Match 2.</a>""")

        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
217
218
class TestFindAllByAttribute(TreeTest):
    """Test ways of finding tags by their attribute values."""

    def test_find_all_by_attribute_name(self):
        """Keyword arguments to find_all search by attribute."""
        tree = self.soup("""
                         <a id="first">Matching a.</a>
                         <a id="second">
                          Non-matching <b id="first">Matching b.</b>a.
                         </a>""")
        self.assertSelects(tree.find_all(id='first'),
                           ["Matching a.", "Matching b."])

    def test_find_all_by_utf8_attribute_value(self):
        """An attribute value may be given as UTF-8 bytes or as text."""
        peace = "םולש".encode("utf8")
        data = '<a title="םולש"></a>'.encode("utf8")
        soup = self.soup(data)
        self.assertEqual([soup.a], soup.find_all(title=peace))
        self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
        self.assertEqual(
            [soup.a], soup.find_all(title=[peace, "something else"]))

    def test_find_all_by_attribute_dict(self):
        """The 'attrs' dict reaches attributes that keywords cannot.

        This covers attributes like 'name' (a fixed argument to
        find_all) and 'class' (a reserved word in Python).
        """
        tree = self.soup("""
                         <a name="name1" class="class1">Name match.</a>
                         <a name="name2" class="class2">Class match.</a>
                         <a name="name3" class="class3">Non-match.</a>
                         <name1>A tag called 'name1'.</name1>
                         """)

        # This doesn't do what you want: 'name' selects by tag name.
        self.assertSelects(tree.find_all(name='name1'),
                           ["A tag called 'name1'."])
        # This does what you want.
        self.assertSelects(tree.find_all(attrs={'name': 'name1'}),
                           ["Name match."])

        self.assertSelects(tree.find_all(attrs={'class': 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
        """CSS classes can be searched with class_ or a string 'attrs'."""
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)

        # Passing in the class_ keyword argument will search against
        # the 'class' attribute.
        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])

        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
        self.assertSelects(tree.find_all('c', '4'), ['Class 3 and 4.'])

    def test_find_by_class_when_multiple_classes_present(self):
        """A regex is applied to each class of a multi-class tag in turn."""
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

        # Since the class is not the string "foo bar", but the two
        # strings "foo" and "bar", this will not find anything.
        f = tree.find_all("gar", class_=re.compile("o b"))
        self.assertSelects(f, [])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        """A regex or callable passed as 'attrs' is matched against class."""
        soup = self.soup("<a class='bar'>Found it</a>")

        self.assertSelects(soup.find_all("a", re.compile("ba")), ["Found it"])

        def big_attribute_value(value):
            return len(value) > 3

        self.assertSelects(soup.find_all("a", big_attribute_value), [])

        def small_attribute_value(value):
            return len(value) <= 3

        self.assertSelects(
            soup.find_all("a", small_attribute_value), ["Found it"])

    def test_find_all_with_string_for_attrs_finds_multiple_classes(self):
        """A class string matches one class or the exact full value."""
        soup = self.soup('<a class="foo bar"></a><a class="foo"></a>')
        a, a2 = soup.find_all("a")
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

    def test_find_all_by_attribute_soupstrainer(self):
        """A SoupStrainer built from 'attrs' can drive find_all."""
        tree = self.soup("""
                         <a id="first">Match.</a>
                         <a id="second">Non-match.</a>""")

        strainer = SoupStrainer(attrs={'id': 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

    def test_find_all_with_missing_atribute(self):
        """None as an attribute value matches tags lacking that attribute."""
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(tree.find_all('a', id=None), ["No ID present."])

    def test_find_all_with_defined_attribute(self):
        """True as an attribute value matches tags that set it at all."""
        # You can pass in True as the value of an attribute to find_all.
        # This will match tags that have that attribute set to any value,
        # including the empty string.
        tree = self.soup("""<a id="1">ID present.</a>
                            <a>No ID present.</a>
                            <a id="">ID is empty.</a>""")
        self.assertSelects(
            tree.find_all(id=True), ["ID present.", "ID is empty."])

    def test_find_all_with_numeric_attribute(self):
        """If you search for a number, it's treated as a string."""
        tree = self.soup("""<a id=1>Unquoted attribute.</a>
                            <a id="1">Quoted attribute.</a>""")

        expected = ["Unquoted attribute.", "Quoted attribute."]
        self.assertSelects(tree.find_all(id=1), expected)
        self.assertSelects(tree.find_all(id="1"), expected)

    def test_find_all_with_list_attribute_values(self):
        """A list of attribute values matches tags having any of them."""
        tree = self.soup("""<a id="1">1</a>
                            <a id="2">2</a>
                            <a id="3">3</a>
                            <a>No ID.</a>""")
        self.assertSelects(tree.find_all(id=["1", "3", "4"]),
                           ["1", "3"])

    def test_find_all_with_regular_expression_attribute_value(self):
        """A regex attribute value selects tags whose value matches it."""
        tree = self.soup("""<a id="a">One a.</a>
                            <a id="aa">Two as.</a>
                            <a id="ab">Mixed as and bs.</a>
                            <a id="b">One b.</a>
                            <a>No ID.</a>""")

        self.assertSelects(tree.find_all(id=re.compile("^a+$")),
                           ["One a.", "Two as."])

    def test_find_by_name_and_containing_string(self):
        """A tag name and a text filter must both match."""
        soup = self.soup("<b>foo</b><b>bar</b><a>foo</a>")
        a = soup.a

        self.assertEqual([a], soup.find_all("a", text="foo"))
        # "bar" occurs only inside a <b>, so no <a> qualifies.  (This
        # assertion used to be written out twice, verbatim.)
        self.assertEqual([], soup.find_all("a", text="bar"))

    def test_find_by_name_and_containing_string_when_string_is_buried(self):
        """The text filter also sees a string nested in descendants."""
        soup = self.soup("<a>foo</a><a><b><c>foo</c></b></a>")
        self.assertEqual(soup.find_all("a"), soup.find_all("a", text="foo"))

    def test_find_by_attribute_and_containing_string(self):
        """An attribute filter and a text filter must both match."""
        soup = self.soup('<b id="1">foo</b><a id="2">foo</a>')
        a = soup.a

        self.assertEqual([a], soup.find_all(id=2, text="foo"))
        self.assertEqual([], soup.find_all(id=1, text="bar"))
398
399
400
401
class TestIndex(TreeTest):
    """Test Tag.index"""

    def test_index(self):
        """index() reports each child's own position, even among lookalikes."""
        markup = """<div>
                            <a>Identical</a>
                            <b>Not identical</b>
                            <a>Identical</a>

                            <c><d>Identical with child</d></c>
                            <b>Also not identical</b>
                            <c><d>Identical with child</d></c>
                            </div>"""
        tree = self.soup(markup)
        container = tree.div

        # Several children look the same, yet each must map back to its
        # own slot in .contents.
        for position, child in enumerate(container.contents):
            self.assertEqual(position, container.index(child))

        # Something that is not a child at all is rejected.
        self.assertRaises(ValueError, tree.index, 1)
418
419
class TestParentOperations(TreeTest):
    """Test navigation and searching through an element's parents."""

    def setUp(self):
        """Parse nested <ul> tags and remember the innermost <b> tag."""
        super(TestParentOperations, self).setUp()
        markup = '''<ul id="empty"></ul>
                 <ul id="top">
                  <ul id="middle">
                   <ul id="bottom">
                    <b>Start here</b>
                   </ul>
                  </ul>'''
        self.tree = self.soup(markup)
        self.start = self.tree.b

    def test_parent(self):
        """.parent climbs one level at a time."""
        bottom = self.start.parent
        self.assertEqual(bottom['id'], 'bottom')
        middle = bottom.parent
        self.assertEqual(middle['id'], 'middle')
        top = middle.parent
        self.assertEqual(top['id'], 'top')

    def test_parent_of_top_tag_is_soup_object(self):
        """The first top-level node's parent is the soup object itself."""
        first_node = self.tree.contents[0]
        self.assertEqual(first_node.parent, self.tree)

    def test_soup_object_has_no_parent(self):
        """The soup object has no parent."""
        self.assertEqual(None, self.tree.parent)

    def test_find_parents(self):
        """find_parents() lists matching ancestors, nearest first."""
        all_lists = self.start.find_parents('ul')
        self.assertSelectsIDs(all_lists, ['bottom', 'middle', 'top'])

        only_middle = self.start.find_parents('ul', id="middle")
        self.assertSelectsIDs(only_middle, ['middle'])

    def test_find_parent(self):
        """find_parent() returns the nearest matching ancestor."""
        nearest = self.start.find_parent('ul')
        self.assertEqual(nearest['id'], 'bottom')

        top = self.start.find_parent('ul', id='top')
        self.assertEqual(top['id'], 'top')

    def test_parent_of_text_element(self):
        """A text node's parent is the tag that contains it."""
        text_node = self.tree.find(text="Start here")
        self.assertEqual(text_node.parent.name, 'b')

    def test_text_element_find_parent(self):
        """find_parent() also works when starting from a text node."""
        text_node = self.tree.find(text="Start here")
        nearest_list = text_node.find_parent('ul')
        self.assertEqual(nearest_list['id'], 'bottom')

    def test_parent_generator(self):
        """.parents yields each ancestor in turn."""
        ancestor_ids = []
        for ancestor in self.start.parents:
            # Skip anything that is None or carries no 'id' attribute.
            if ancestor is None or 'id' not in ancestor.attrs:
                continue
            ancestor_ids.append(ancestor['id'])
        self.assertEqual(ancestor_ids, ['bottom', 'middle', 'top'])
469
470
class ProximityTest(TreeTest):
    """Shared fixture for the next/previous element tests."""

    def setUp(self):
        """Parse a document holding three consecutive <b> tags."""
        # Name this class in super() so that every setUp() above it in the
        # MRO runs; naming TreeTest here would start the lookup after it.
        super(ProximityTest, self).setUp()
        self.tree = self.soup(
            '<html id="start"><head></head><body><b id="1">One</b><b id="2">Two</b><b id="3">Three</b></body></html>')
477
478
class TestNextOperations(ProximityTest):
    """Test .next_element and the find_next family of methods."""

    def setUp(self):
        """Start every test from the first <b> tag."""
        super(TestNextOperations, self).setUp()
        self.start = self.tree.b

    def test_next(self):
        """.next_element walks the document in parse order."""
        self.assertEqual(self.start.next_element, "One")
        self.assertEqual(self.start.next_element.next_element['id'], "2")

    def test_next_of_last_item_is_none(self):
        """The final node of the document has no next element."""
        last = self.tree.find(text="Three")
        self.assertEqual(last.next_element, None)

    def test_next_of_root_is_none(self):
        """The document root is outside the next/previous chain."""
        self.assertEqual(self.tree.next_element, None)

    def test_find_all_next(self):
        """find_all_next() searches everything after the starting tag."""
        self.assertSelects(self.start.find_all_next('b'), ["Two", "Three"])
        # A bare find_all_next(id=3) call whose result was thrown away used
        # to sit here; the assertion below makes the same call.
        self.assertSelects(self.start.find_all_next(id=3), ["Three"])

    def test_find_next(self):
        """find_next() returns the first match after the starting tag."""
        self.assertEqual(self.start.find_next('b')['id'], '2')
        self.assertEqual(self.start.find_next(text="Three"), "Three")

    def test_find_next_for_text_element(self):
        """The find_next methods also work from a text node."""
        text = self.tree.find(text="One")
        self.assertEqual(text.find_next("b").string, "Two")
        self.assertSelects(text.find_all_next("b"), ["Two", "Three"])

    def test_next_generator(self):
        """.next_elements yields every node after the starting one."""
        start = self.tree.find(text="Two")
        successors = list(start.next_elements)
        # There are two successors: the final <b> tag and its text contents.
        tag, contents = successors
        self.assertEqual(tag['id'], '3')
        self.assertEqual(contents, "Three")
518
class TestPreviousOperations(ProximityTest):
    """Test .previous_element and the find_previous family of methods."""

    def setUp(self):
        """Start every test from the final text node, "Three"."""
        super(TestPreviousOperations, self).setUp()
        self.end = self.tree.find(text="Three")

    def test_previous(self):
        """.previous_element walks the document backwards."""
        enclosing_tag = self.end.previous_element
        self.assertEqual(enclosing_tag['id'], "3")
        self.assertEqual(enclosing_tag.previous_element, "Two")

    def test_previous_of_first_item_is_none(self):
        """The first tag of the document has no previous element."""
        html_tag = self.tree.find('html')
        self.assertEqual(html_tag.previous_element, None)

    def test_previous_of_root_is_none(self):
        # The document root is outside the next/previous chain.
        # XXX This is broken!  The check is kept disabled:
        #self.assertEqual(self.tree.previous_element, None)
        pass

    def test_find_all_previous(self):
        """find_all_previous() searches everything before the start."""
        # The <b> tag containing the "Three" node is the predecessor
        # of the "Three" node itself, which is why "Three" shows up
        # here.
        earlier_bold = self.end.find_all_previous('b')
        self.assertSelects(earlier_bold, ["Three", "Two", "One"])

        first_only = self.end.find_all_previous(id=1)
        self.assertSelects(first_only, ["One"])

    def test_find_previous(self):
        """find_previous() returns the nearest earlier match."""
        nearest_bold = self.end.find_previous('b')
        self.assertEqual(nearest_bold['id'], '3')
        self.assertEqual(self.end.find_previous(text="One"), "One")

    def test_find_previous_for_text_element(self):
        """The find_previous methods also work from a text node."""
        text_node = self.tree.find(text="Three")
        self.assertEqual(text_node.find_previous("b").string, "Three")

        earlier_bold = text_node.find_all_previous("b")
        self.assertSelects(earlier_bold, ["Three", "Two", "One"])

    def test_previous_generator(self):
        """.previous_elements yields every node before the start."""
        first_text = self.tree.find(text="One")

        # There are four predecessors: the <b> tag containing "One"
        # the <body> tag, the <head> tag, and the <html> tag.
        b, body, head, html = list(first_text.previous_elements)
        self.assertEqual(b['id'], '1')
        self.assertEqual(body.name, "body")
        self.assertEqual(head.name, "head")
        self.assertEqual(html.name, "html")
568
569
class SiblingTest(TreeTest):
    """Shared fixture for the sibling-navigation tests."""

    def setUp(self):
        """Parse four sibling <span> tags, three with a nested <span>."""
        super(SiblingTest, self).setUp()
        markup = '''<html>
                    <span id="1">
                     <span id="1.1"></span>
                    </span>
                    <span id="2">
                     <span id="2.1"></span>
                    </span>
                    <span id="3">
                     <span id="3.1"></span>
                    </span>
                    <span id="4"></span>
                    </html>'''
        # All that whitespace looks good but makes the tests more
        # difficult. Get rid of it.  The pattern is a raw string so that
        # "\s" reaches the regex engine instead of being an invalid
        # string escape.
        markup = re.compile(r"\n\s*").sub("", markup)
        self.tree = self.soup(markup)
590
591
class TestNextSibling(SiblingTest):
    """Test .next_sibling and the find_next_sibling(s) methods."""

    def setUp(self):
        """Start every test from the first top-level <span>."""
        super(TestNextSibling, self).setUp()
        self.start = self.tree.find(id="1")

    def test_next_sibling_of_root_is_none(self):
        """The soup object has no next sibling."""
        self.assertEqual(self.tree.next_sibling, None)

    def test_next_sibling(self):
        """.next_sibling moves sideways, not into children."""
        second = self.start.next_sibling
        self.assertEqual(second['id'], '2')
        self.assertEqual(second.next_sibling['id'], '3')

        # Note the difference between next_sibling and next_element.
        self.assertEqual(self.start.next_element['id'], '1.1')

    def test_next_sibling_may_not_exist(self):
        """.next_sibling is None for the last child of any parent."""
        self.assertEqual(self.tree.html.next_sibling, None)

        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.next_sibling, None)

        final_span = self.tree.find(id="4")
        self.assertEqual(final_span.next_sibling, None)

    def test_find_next_sibling(self):
        """find_next_sibling() returns the first matching later sibling."""
        following = self.start.find_next_sibling('span')
        self.assertEqual(following['id'], '2')

    def test_next_siblings(self):
        """find_next_siblings() returns all matching later siblings."""
        later_spans = self.start.find_next_siblings("span")
        self.assertSelectsIDs(later_spans, ['2', '3', '4'])

        third_only = self.start.find_next_siblings(id='3')
        self.assertSelectsIDs(third_only, ['3'])

    def test_next_sibling_for_text_element(self):
        """Sibling navigation also works from a text node."""
        document = self.soup("Foo<b>bar</b>baz")
        first_text = document.find(text="Foo")

        bold = first_text.next_sibling
        self.assertEqual(bold.name, 'b')
        self.assertEqual(bold.next_sibling, 'baz')

        self.assertSelects(first_text.find_next_siblings('b'), ['bar'])
        self.assertEqual(first_text.find_next_sibling(text="baz"), "baz")
        self.assertEqual(first_text.find_next_sibling(text="nonesuch"), None)
635
636
class TestPreviousSibling(SiblingTest):
    """Test .previous_sibling and the find_previous_sibling(s) methods."""

    def setUp(self):
        """Start every test from the last top-level <span>."""
        super(TestPreviousSibling, self).setUp()
        self.end = self.tree.find(id="4")

    def test_previous_sibling_of_root_is_none(self):
        """The soup object has no previous sibling."""
        self.assertEqual(self.tree.previous_sibling, None)

    def test_previous_sibling(self):
        """.previous_sibling moves sideways, not into children."""
        third = self.end.previous_sibling
        self.assertEqual(third['id'], '3')
        self.assertEqual(third.previous_sibling['id'], '2')

        # Note the difference between previous_sibling and previous_element.
        self.assertEqual(self.end.previous_element['id'], '3.1')

    def test_previous_sibling_may_not_exist(self):
        """.previous_sibling is None for the first child of any parent."""
        self.assertEqual(self.tree.html.previous_sibling, None)

        only_child = self.tree.find(id="1.1")
        self.assertEqual(only_child.previous_sibling, None)

        leading_span = self.tree.find(id="1")
        self.assertEqual(leading_span.previous_sibling, None)

    def test_find_previous_sibling(self):
        """find_previous_sibling() returns the nearest earlier match."""
        preceding = self.end.find_previous_sibling('span')
        self.assertEqual(preceding['id'], '3')

    def test_previous_siblings(self):
        """find_previous_siblings() returns matches, nearest first."""
        earlier_spans = self.end.find_previous_siblings("span")
        self.assertSelectsIDs(earlier_spans, ['3', '2', '1'])

        first_only = self.end.find_previous_siblings(id='1')
        self.assertSelectsIDs(first_only, ['1'])

    def test_previous_sibling_for_text_element(self):
        """Sibling navigation also works from a text node."""
        document = self.soup("Foo<b>bar</b>baz")
        last_text = document.find(text="baz")

        bold = last_text.previous_sibling
        self.assertEqual(bold.name, 'b')
        self.assertEqual(bold.previous_sibling, 'Foo')

        self.assertSelects(last_text.find_previous_siblings('b'), ['bar'])
        self.assertEqual(last_text.find_previous_sibling(text="Foo"), "Foo")
        self.assertEqual(
            last_text.find_previous_sibling(text="nonesuch"), None)
680
681
class TestTagCreation(SoupTest):
    """Test the ability to create new tags."""

    def test_new_tag(self):
        """new_tag() builds a detached Tag with the given name and attrs."""
        document = self.soup("")
        created = document.new_tag("foo", bar="baz")
        self.assertTrue(isinstance(created, Tag))
        self.assertEqual("foo", created.name)
        self.assertEqual(dict(bar="baz"), created.attrs)
        self.assertEqual(None, created.parent)

    def test_tag_inherits_self_closing_rules_from_builder(self):
        """A new tag follows its builder's empty-element rules."""
        # The XML half only runs when an XML builder is registered.
        if XML_BUILDER_PRESENT:
            xml_document = BeautifulSoup("", "lxml-xml")
            xml_br = xml_document.new_tag("br")
            xml_p = xml_document.new_tag("p")

            # Both the <br> and <p> tag are empty-element, just because
            # they have no contents.
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

        html_document = BeautifulSoup("", "html.parser")
        html_br = html_document.new_tag("br")
        html_p = html_document.new_tag("p")

        # The HTML builder uses HTML's rules about which tags are
        # empty-element tags, and the new tags reflect these rules.
        self.assertEqual(b"<br/>", html_br.encode())
        self.assertEqual(b"<p></p>", html_p.encode())

    def test_new_string_creates_navigablestring(self):
        """new_string() returns a NavigableString by default."""
        document = self.soup("")
        created = document.new_string("foo")
        self.assertEqual("foo", created)
        self.assertTrue(isinstance(created, NavigableString))

    def test_new_string_can_create_navigablestring_subclass(self):
        """new_string() can build a requested subclass such as Comment."""
        document = self.soup("")
        created = document.new_string("foo", Comment)
        self.assertEqual("foo", created)
        self.assertTrue(isinstance(created, Comment))
723
724class TestTreeModification(SoupTest):
725
726 def test_attribute_modification(self):
727 soup = self.soup('<a id="1"></a>')
728 soup.a['id'] = 2
729 self.assertEqual(soup.decode(), self.document_for('<a id="2"></a>'))
730 del(soup.a['id'])
731 self.assertEqual(soup.decode(), self.document_for('<a></a>'))
732 soup.a['id2'] = 'foo'
733 self.assertEqual(soup.decode(), self.document_for('<a id2="foo"></a>'))
734
735 def test_new_tag_creation(self):
736 builder = builder_registry.lookup('html')()
737 soup = self.soup("<body></body>", builder=builder)
738 a = Tag(soup, builder, 'a')
739 ol = Tag(soup, builder, 'ol')
740 a['href'] = 'http://foo.com/'
741 soup.body.insert(0, a)
742 soup.body.insert(1, ol)
743 self.assertEqual(
744 soup.body.encode(),
745 b'<body><a href="http://foo.com/"></a><ol></ol></body>')
746
747 def test_append_to_contents_moves_tag(self):
748 doc = """<p id="1">Don't leave me <b>here</b>.</p>
749 <p id="2">Don\'t leave!</p>"""
750 soup = self.soup(doc)
751 second_para = soup.find(id='2')
752 bold = soup.b
753
754 # Move the <b> tag to the end of the second paragraph.
755 soup.find(id='2').append(soup.b)
756
757 # The <b> tag is now a child of the second paragraph.
758 self.assertEqual(bold.parent, second_para)
759
760 self.assertEqual(
761 soup.decode(), self.document_for(
762 '<p id="1">Don\'t leave me .</p>\n'
763 '<p id="2">Don\'t leave!<b>here</b></p>'))
764
765 def test_replace_with_returns_thing_that_was_replaced(self):
766 text = "<a></a><b><c></c></b>"
767 soup = self.soup(text)
768 a = soup.a
769 new_a = a.replace_with(soup.c)
770 self.assertEqual(a, new_a)
771
772 def test_unwrap_returns_thing_that_was_replaced(self):
773 text = "<a><b></b><c></c></a>"
774 soup = self.soup(text)
775 a = soup.a
776 new_a = a.unwrap()
777 self.assertEqual(a, new_a)
778
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600779 def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
780 soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
781 a = soup.a
782 a.extract()
783 self.assertEqual(None, a.parent)
784 self.assertRaises(ValueError, a.unwrap)
785 self.assertRaises(ValueError, a.replace_with, soup.c)
786
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500787 def test_replace_tag_with_itself(self):
788 text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
789 soup = self.soup(text)
790 c = soup.c
791 soup.c.replace_with(c)
792 self.assertEqual(soup.decode(), self.document_for(text))
793
794 def test_replace_tag_with_its_parent_raises_exception(self):
795 text = "<a><b></b></a>"
796 soup = self.soup(text)
797 self.assertRaises(ValueError, soup.b.replace_with, soup.a)
798
799 def test_insert_tag_into_itself_raises_exception(self):
800 text = "<a><b></b></a>"
801 soup = self.soup(text)
802 self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
803
804 def test_replace_with_maintains_next_element_throughout(self):
805 soup = self.soup('<p><a>one</a><b>three</b></p>')
806 a = soup.a
807 b = a.contents[0]
808 # Make it so the <a> tag has two text children.
809 a.insert(1, "two")
810
811 # Now replace each one with the empty string.
812 left, right = a.contents
813 left.replaceWith('')
814 right.replaceWith('')
815
816 # The <b> tag is still connected to the tree.
817 self.assertEqual("three", soup.b.string)
818
819 def test_replace_final_node(self):
820 soup = self.soup("<b>Argh!</b>")
821 soup.find(text="Argh!").replace_with("Hooray!")
822 new_text = soup.find(text="Hooray!")
823 b = soup.b
824 self.assertEqual(new_text.previous_element, b)
825 self.assertEqual(new_text.parent, b)
826 self.assertEqual(new_text.previous_element.next_element, new_text)
827 self.assertEqual(new_text.next_element, None)
828
829 def test_consecutive_text_nodes(self):
830 # A builder should never create two consecutive text nodes,
831 # but if you insert one next to another, Beautiful Soup will
832 # handle it correctly.
833 soup = self.soup("<a><b>Argh!</b><c></c></a>")
834 soup.b.insert(1, "Hooray!")
835
836 self.assertEqual(
837 soup.decode(), self.document_for(
838 "<a><b>Argh!Hooray!</b><c></c></a>"))
839
840 new_text = soup.find(text="Hooray!")
841 self.assertEqual(new_text.previous_element, "Argh!")
842 self.assertEqual(new_text.previous_element.next_element, new_text)
843
844 self.assertEqual(new_text.previous_sibling, "Argh!")
845 self.assertEqual(new_text.previous_sibling.next_sibling, new_text)
846
847 self.assertEqual(new_text.next_sibling, None)
848 self.assertEqual(new_text.next_element, soup.c)
849
850 def test_insert_string(self):
851 soup = self.soup("<a></a>")
852 soup.a.insert(0, "bar")
853 soup.a.insert(0, "foo")
854 # The string were added to the tag.
855 self.assertEqual(["foo", "bar"], soup.a.contents)
856 # And they were converted to NavigableStrings.
857 self.assertEqual(soup.a.contents[0].next_element, "bar")
858
859 def test_insert_tag(self):
860 builder = self.default_builder
861 soup = self.soup(
862 "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
863 magic_tag = Tag(soup, builder, 'magictag')
864 magic_tag.insert(0, "the")
865 soup.a.insert(1, magic_tag)
866
867 self.assertEqual(
868 soup.decode(), self.document_for(
869 "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))
870
871 # Make sure all the relationships are hooked up correctly.
872 b_tag = soup.b
873 self.assertEqual(b_tag.next_sibling, magic_tag)
874 self.assertEqual(magic_tag.previous_sibling, b_tag)
875
876 find = b_tag.find(text="Find")
877 self.assertEqual(find.next_element, magic_tag)
878 self.assertEqual(magic_tag.previous_element, find)
879
880 c_tag = soup.c
881 self.assertEqual(magic_tag.next_sibling, c_tag)
882 self.assertEqual(c_tag.previous_sibling, magic_tag)
883
884 the = magic_tag.find(text="the")
885 self.assertEqual(the.parent, magic_tag)
886 self.assertEqual(the.next_element, c_tag)
887 self.assertEqual(c_tag.previous_element, the)
888
889 def test_append_child_thats_already_at_the_end(self):
890 data = "<a><b></b></a>"
891 soup = self.soup(data)
892 soup.a.append(soup.b)
893 self.assertEqual(data, soup.decode())
894
895 def test_move_tag_to_beginning_of_parent(self):
896 data = "<a><b></b><c></c><d></d></a>"
897 soup = self.soup(data)
898 soup.a.insert(0, soup.d)
899 self.assertEqual("<a><d></d><b></b><c></c></a>", soup.decode())
900
901 def test_insert_works_on_empty_element_tag(self):
902 # This is a little strange, since most HTML parsers don't allow
903 # markup like this to come through. But in general, we don't
904 # know what the parser would or wouldn't have allowed, so
905 # I'm letting this succeed for now.
906 soup = self.soup("<br/>")
907 soup.br.insert(1, "Contents")
908 self.assertEqual(str(soup.br), "<br>Contents</br>")
909
910 def test_insert_before(self):
911 soup = self.soup("<a>foo</a><b>bar</b>")
912 soup.b.insert_before("BAZ")
913 soup.a.insert_before("QUUX")
914 self.assertEqual(
915 soup.decode(), self.document_for("QUUX<a>foo</a>BAZ<b>bar</b>"))
916
917 soup.a.insert_before(soup.b)
918 self.assertEqual(
919 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
920
921 def test_insert_after(self):
922 soup = self.soup("<a>foo</a><b>bar</b>")
923 soup.b.insert_after("BAZ")
924 soup.a.insert_after("QUUX")
925 self.assertEqual(
926 soup.decode(), self.document_for("<a>foo</a>QUUX<b>bar</b>BAZ"))
927 soup.b.insert_after(soup.a)
928 self.assertEqual(
929 soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
930
931 def test_insert_after_raises_exception_if_after_has_no_meaning(self):
932 soup = self.soup("")
933 tag = soup.new_tag("a")
934 string = soup.new_string("")
935 self.assertRaises(ValueError, string.insert_after, tag)
936 self.assertRaises(NotImplementedError, soup.insert_after, tag)
937 self.assertRaises(ValueError, tag.insert_after, tag)
938
939 def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
940 soup = self.soup("")
941 tag = soup.new_tag("a")
942 string = soup.new_string("")
943 self.assertRaises(ValueError, string.insert_before, tag)
944 self.assertRaises(NotImplementedError, soup.insert_before, tag)
945 self.assertRaises(ValueError, tag.insert_before, tag)
946
947 def test_replace_with(self):
948 soup = self.soup(
949 "<p>There's <b>no</b> business like <b>show</b> business</p>")
950 no, show = soup.find_all('b')
951 show.replace_with(no)
952 self.assertEqual(
953 soup.decode(),
954 self.document_for(
955 "<p>There's business like <b>no</b> business</p>"))
956
957 self.assertEqual(show.parent, None)
958 self.assertEqual(no.parent, soup.p)
959 self.assertEqual(no.next_element, "no")
960 self.assertEqual(no.next_sibling, " business")
961
962 def test_replace_first_child(self):
963 data = "<a><b></b><c></c></a>"
964 soup = self.soup(data)
965 soup.b.replace_with(soup.c)
966 self.assertEqual("<a><c></c></a>", soup.decode())
967
968 def test_replace_last_child(self):
969 data = "<a><b></b><c></c></a>"
970 soup = self.soup(data)
971 soup.c.replace_with(soup.b)
972 self.assertEqual("<a><b></b></a>", soup.decode())
973
974 def test_nested_tag_replace_with(self):
975 soup = self.soup(
976 """<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
977
978 # Replace the entire <b> tag and its contents ("reserve the
979 # right") with the <f> tag ("refuse").
980 remove_tag = soup.b
981 move_tag = soup.f
982 remove_tag.replace_with(move_tag)
983
984 self.assertEqual(
985 soup.decode(), self.document_for(
986 "<a>We<f>refuse</f></a><e>to<g>service</g></e>"))
987
988 # The <b> tag is now an orphan.
989 self.assertEqual(remove_tag.parent, None)
990 self.assertEqual(remove_tag.find(text="right").next_element, None)
991 self.assertEqual(remove_tag.previous_element, None)
992 self.assertEqual(remove_tag.next_sibling, None)
993 self.assertEqual(remove_tag.previous_sibling, None)
994
995 # The <f> tag is now connected to the <a> tag.
996 self.assertEqual(move_tag.parent, soup.a)
997 self.assertEqual(move_tag.previous_element, "We")
998 self.assertEqual(move_tag.next_element.next_element, soup.e)
999 self.assertEqual(move_tag.next_sibling, None)
1000
1001 # The gap where the <f> tag used to be has been mended, and
1002 # the word "to" is now connected to the <g> tag.
1003 to_text = soup.find(text="to")
1004 g_tag = soup.g
1005 self.assertEqual(to_text.next_element, g_tag)
1006 self.assertEqual(to_text.next_sibling, g_tag)
1007 self.assertEqual(g_tag.previous_element, to_text)
1008 self.assertEqual(g_tag.previous_sibling, to_text)
1009
1010 def test_unwrap(self):
1011 tree = self.soup("""
1012 <p>Unneeded <em>formatting</em> is unneeded</p>
1013 """)
1014 tree.em.unwrap()
1015 self.assertEqual(tree.em, None)
1016 self.assertEqual(tree.p.text, "Unneeded formatting is unneeded")
1017
1018 def test_wrap(self):
1019 soup = self.soup("I wish I was bold.")
1020 value = soup.string.wrap(soup.new_tag("b"))
1021 self.assertEqual(value.decode(), "<b>I wish I was bold.</b>")
1022 self.assertEqual(
1023 soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1024
1025 def test_wrap_extracts_tag_from_elsewhere(self):
1026 soup = self.soup("<b></b>I wish I was bold.")
1027 soup.b.next_sibling.wrap(soup.b)
1028 self.assertEqual(
1029 soup.decode(), self.document_for("<b>I wish I was bold.</b>"))
1030
1031 def test_wrap_puts_new_contents_at_the_end(self):
1032 soup = self.soup("<b>I like being bold.</b>I wish I was bold.")
1033 soup.b.next_sibling.wrap(soup.b)
1034 self.assertEqual(2, len(soup.b.contents))
1035 self.assertEqual(
1036 soup.decode(), self.document_for(
1037 "<b>I like being bold.I wish I was bold.</b>"))
1038
1039 def test_extract(self):
1040 soup = self.soup(
1041 '<html><body>Some content. <div id="nav">Nav crap</div> More content.</body></html>')
1042
1043 self.assertEqual(len(soup.body.contents), 3)
1044 extracted = soup.find(id="nav").extract()
1045
1046 self.assertEqual(
1047 soup.decode(), "<html><body>Some content. More content.</body></html>")
1048 self.assertEqual(extracted.decode(), '<div id="nav">Nav crap</div>')
1049
1050 # The extracted tag is now an orphan.
1051 self.assertEqual(len(soup.body.contents), 2)
1052 self.assertEqual(extracted.parent, None)
1053 self.assertEqual(extracted.previous_element, None)
1054 self.assertEqual(extracted.next_element.next_element, None)
1055
1056 # The gap where the extracted tag used to be has been mended.
1057 content_1 = soup.find(text="Some content. ")
1058 content_2 = soup.find(text=" More content.")
1059 self.assertEqual(content_1.next_element, content_2)
1060 self.assertEqual(content_1.next_sibling, content_2)
1061 self.assertEqual(content_2.previous_element, content_1)
1062 self.assertEqual(content_2.previous_sibling, content_1)
1063
1064 def test_extract_distinguishes_between_identical_strings(self):
1065 soup = self.soup("<a>foo</a><b>bar</b>")
1066 foo_1 = soup.a.string
1067 bar_1 = soup.b.string
1068 foo_2 = soup.new_string("foo")
1069 bar_2 = soup.new_string("bar")
1070 soup.a.append(foo_2)
1071 soup.b.append(bar_2)
1072
1073 # Now there are two identical strings in the <a> tag, and two
1074 # in the <b> tag. Let's remove the first "foo" and the second
1075 # "bar".
1076 foo_1.extract()
1077 bar_2.extract()
1078 self.assertEqual(foo_2, soup.a.string)
1079 self.assertEqual(bar_2, soup.b.string)
1080
Patrick Williamsc0f7c042017-02-23 20:41:17 -06001081 def test_extract_multiples_of_same_tag(self):
1082 soup = self.soup("""
1083<html>
1084<head>
1085<script>foo</script>
1086</head>
1087<body>
1088 <script>bar</script>
1089 <a></a>
1090</body>
1091<script>baz</script>
1092</html>""")
1093 [soup.script.extract() for i in soup.find_all("script")]
1094 self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
1095
1096
1097 def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
1098 soup = self.soup(
1099 '<html>\n'
1100 '<body>hi</body>\n'
1101 '</html>')
1102 soup.find('body').extract()
1103 self.assertEqual(None, soup.find('body'))
1104
1105
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001106 def test_clear(self):
1107 """Tag.clear()"""
1108 soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
1109 # clear using extract()
1110 a = soup.a
1111 soup.p.clear()
1112 self.assertEqual(len(soup.p.contents), 0)
1113 self.assertTrue(hasattr(a, "contents"))
1114
1115 # clear using decompose()
1116 em = a.em
1117 a.clear(decompose=True)
1118 self.assertEqual(0, len(em.contents))
1119
1120 def test_string_set(self):
1121 """Tag.string = 'string'"""
1122 soup = self.soup("<a></a> <b><c></c></b>")
1123 soup.a.string = "foo"
1124 self.assertEqual(soup.a.contents, ["foo"])
1125 soup.b.string = "bar"
1126 self.assertEqual(soup.b.contents, ["bar"])
1127
1128 def test_string_set_does_not_affect_original_string(self):
1129 soup = self.soup("<a><b>foo</b><c>bar</c>")
1130 soup.b.string = soup.c.string
1131 self.assertEqual(soup.a.encode(), b"<a><b>bar</b><c>bar</c></a>")
1132
1133 def test_set_string_preserves_class_of_string(self):
1134 soup = self.soup("<a></a>")
1135 cdata = CData("foo")
1136 soup.a.string = cdata
1137 self.assertTrue(isinstance(soup.a.string, CData))
1138
class TestElementObjects(SoupTest):
    """Test various features of element objects."""

    def test_len(self):
        """The length of an element is its number of children."""
        soup = self.soup("<top>1<b>2</b>3</top>")

        # The BeautifulSoup object itself contains one element: the
        # <top> tag.
        self.assertEqual(len(soup.contents), 1)
        self.assertEqual(len(soup), 1)

        # The <top> tag contains three elements: the text node "1", the
        # <b> tag, and the text node "3".
        self.assertEqual(len(soup.top), 3)
        self.assertEqual(len(soup.top.contents), 3)

    def test_member_access_invokes_find(self):
        """Accessing a Python member .foo invokes find('foo')"""
        soup = self.soup('<b><i></i></b>')
        self.assertEqual(soup.b, soup.find('b'))
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.a, None)

    def test_deprecated_member_access(self):
        """Accessing .bTag still finds <b> but emits a deprecation message."""
        soup = self.soup('<b><i></i></b>')
        with warnings.catch_warnings(record=True) as w:
            # Record every warning, even one that was already shown
            # earlier in this process; otherwise ``w`` may be empty.
            warnings.simplefilter("always")
            tag = soup.bTag
        self.assertEqual(soup.b, tag)
        self.assertEqual(
            '.bTag is deprecated, use .find("b") instead.',
            str(w[0].message))

    def test_has_attr(self):
        """has_attr() checks for the presence of an attribute.

        Note that has_attr() is different from the ``in`` operator:
        has_attr() checks the tag's attributes, while ``in`` checks
        the tag's children.
        """
        soup = self.soup("<foo attr='bar'>")
        self.assertTrue(soup.foo.has_attr('attr'))
        self.assertFalse(soup.foo.has_attr('attr2'))

    def test_attributes_come_out_in_alphabetical_order(self):
        """Attributes are serialized sorted by name, not in source order."""
        markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
        self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')

    def test_string(self):
        # A tag that contains only a text node makes that node
        # available as .string.
        soup = self.soup("<b>foo</b>")
        self.assertEqual(soup.b.string, 'foo')

    def test_empty_tag_has_no_string(self):
        # A tag with no children has no .string.
        soup = self.soup("<b></b>")
        self.assertEqual(soup.b.string, None)

    def test_tag_with_multiple_children_has_no_string(self):
        # The first <b> tag in each of these documents has no .string.
        soup = self.soup("<a>foo<b></b><b></b></b>")
        self.assertEqual(soup.b.string, None)

        soup = self.soup("<a>foo<b></b>bar</b>")
        self.assertEqual(soup.b.string, None)

        # Even if all the children are strings, due to trickery,
        # it won't work--but this would be a good optimization.
        soup = self.soup("<a>foo</b>")
        soup.a.insert(1, "bar")
        self.assertEqual(soup.a.string, None)

    def test_tag_with_recursive_string_has_string(self):
        # A tag with a single child which has a .string inherits that
        # .string.
        soup = self.soup("<a><b>foo</b></a>")
        self.assertEqual(soup.a.string, "foo")
        self.assertEqual(soup.string, "foo")

    def test_lack_of_string(self):
        """Only a tag containing a single text node has a .string."""
        soup = self.soup("<b>f<i>e</i>o</b>")
        self.assertFalse(soup.b.string)

        soup = self.soup("<b></b>")
        self.assertFalse(soup.b.string)

    def test_all_text(self):
        """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
        soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
        # The text nodes are "a", "r", " " and " t ", so plain
        # concatenation leaves two spaces between "r" and "t".
        self.assertEqual(soup.a.text, "ar  t ")
        self.assertEqual(soup.a.get_text(strip=True), "art")
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

    def test_get_text_ignores_comments(self):
        """get_text() skips comments unless the caller asks for them."""
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(soup.get_text(), "foobar")

        self.assertEqual(
            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
        self.assertEqual(
            soup.get_text(types=None), "fooIGNOREbar")

    def test_all_strings_ignores_comments(self):
        """The .strings generator does not yield comments."""
        soup = self.soup("foo<!--IGNORE-->bar")
        self.assertEqual(['foo', 'bar'], list(soup.strings))
1248
class TestCDAtaListAttributes(SoupTest):
    """Tests for attributes such as 'class' whose value is a cdata-list."""

    def test_single_value_becomes_list(self):
        """A lone class name is still exposed as a one-element list."""
        tag = self.soup("<a class='foo'>").a
        self.assertEqual(["foo"], tag['class'])

    def test_multiple_values_becomes_list(self):
        """Space-separated class names become separate list items."""
        tag = self.soup("<a class='foo bar'>").a
        self.assertEqual(["foo", "bar"], tag['class'])

    def test_multiple_values_separated_by_weird_whitespace(self):
        """Tabs and newlines separate values just like spaces do."""
        tag = self.soup("<a class='foo\tbar\nbaz'>").a
        self.assertEqual(["foo", "bar", "baz"], tag['class'])

    def test_attributes_joined_into_string_on_output(self):
        """On output the list is written as one space-separated value."""
        tag = self.soup("<a class='foo\tbar'>").a
        self.assertEqual(b'<a class="foo bar"></a>', tag.encode())

    def test_accept_charset(self):
        """accept-charset on a <form> tag is treated as a cdata-list."""
        form = self.soup('<form accept-charset="ISO-8859-1 UTF-8">').form
        self.assertEqual(['ISO-8859-1', 'UTF-8'], form['accept-charset'])

    def test_cdata_attribute_applying_only_to_one_tag(self):
        """accept-charset on a non-<form> tag stays a plain string."""
        # test_accept_charset shows the attribute being split for
        # <form>; on <a> the value is expected to come back unsplit.
        markup = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
        tag = self.soup(markup).a
        self.assertEqual('ISO-8859-1 UTF-8', tag['accept-charset'])

    def test_string_has_immutable_name_property(self):
        """A string's .name is None and cannot be assigned."""
        string = self.soup("s").string
        self.assertEqual(None, string.name)
        with self.assertRaises(AttributeError):
            string.name = 'foo'
1287
class TestPersistence(SoupTest):
    """Tests for pickling and copying trees and their elements."""

    def setUp(self):
        """Parse a small complete HTML page shared by the tests below."""
        super(TestPersistence, self).setUp()
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        """A pickle round trip yields an equivalent BeautifulSoup object."""
        restored = pickle.loads(pickle.dumps(self.tree, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(restored.decode(), self.tree.decode())

    def test_deepcopy_identity(self):
        """A deep copy serializes to the same markup as the original."""
        duplicate = copy.deepcopy(self.tree)
        self.assertEqual(duplicate.decode(), self.tree.decode())

    def test_unicode_pickle(self):
        """A tree holding a non-ASCII character survives pickling."""
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        payload = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(payload)
        self.assertEqual(restored.decode(), soup.decode())

    def test_copy_navigablestring_is_not_attached_to_tree(self):
        """A copied string compares equal but has no links to the tree."""
        soup = self.soup("<b>Foo<a></a></b><b>Bar</b>")
        original = soup.find(string="Foo")
        duplicate = copy.copy(original)
        self.assertEqual(original, duplicate)
        self.assertEqual(None, duplicate.parent)
        self.assertEqual(None, duplicate.next_element)
        # The original does have a next sibling; the copy does not.
        self.assertNotEqual(None, original.next_sibling)
        self.assertEqual(None, duplicate.next_sibling)
        self.assertEqual(None, duplicate.previous_element)

    def test_copy_navigablestring_subclass_has_same_type(self):
        """Copying a Comment yields another Comment."""
        soup = self.soup("<b><!--Foo--></b>")
        original = soup.string
        duplicate = copy.copy(original)
        self.assertEqual(original, duplicate)
        self.assertIsInstance(duplicate, Comment)

    def test_copy_entire_soup(self):
        """A shallow copy of the whole soup compares equal to it."""
        soup = self.soup("<div><b>Foo<a></a></b><b>Bar</b></div>end")
        duplicate = copy.copy(soup)
        self.assertEqual(soup, duplicate)

    def test_copy_tag_copies_contents(self):
        """Copying a tag copies its subtree but detaches it from the tree."""
        soup = self.soup("<div><b>Foo<a></a></b><b>Bar</b></div>end")
        div = soup.div
        div_copy = copy.copy(div)

        # Same markup, and the two compare equal...
        self.assertEqual(str(div), str(div_copy))
        self.assertEqual(div, div_copy)

        # ...yet they are distinct objects.
        self.assertIsNot(div, div_copy)

        # The copy has no parent and no links beyond its own subtree,
        # whereas the original still leads on to the rest of the
        # document.
        self.assertEqual(None, div_copy.parent)
        self.assertEqual(None, div_copy.previous_element)
        self.assertEqual(None, div_copy.find(string='Bar').next_element)
        self.assertNotEqual(None, div.find(string='Bar').next_element)
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001377
class TestSubstitutions(SoupTest):
    """Tests for output formatters and encoding substitution on output."""

    def test_default_formatter_is_minimal(self):
        """decode() with no formatter argument behaves like "minimal"."""
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        # No formatter is passed here, so the default is what gets
        # exercised; test_formatter_minimal covers the explicit name.
        decoded = soup.decode()
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_html(self):
        """The "html" formatter turns the e-with-acute into &eacute;."""
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html")
        self.assertEqual(
            decoded,
            self.document_for("<b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"))

    def test_formatter_minimal(self):
        """The "minimal" formatter escapes only the angle brackets."""
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        self.assertEqual(
            decoded,
            self.document_for(
                "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"))

    def test_formatter_null(self):
        """formatter=None performs no substitution at all."""
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=None)
        # Neither the angle brackets nor the e-with-acute are converted.
        # This is not valid HTML, but it's what the user wanted.
        self.assertEqual(decoded,
                         self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))

    def test_formatter_custom(self):
        """A callable formatter is applied to every string."""
        markup = "<b>&lt;foo&gt;</b><b>bar</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=lambda x: x.upper())
        # Instead of normal entity conversion code, the custom
        # callable is called on every string.
        self.assertEqual(
            decoded,
            self.document_for("<b><FOO></b><b>BAR</b>"))

    def test_formatter_is_run_on_attribute_values(self):
        """Formatters are applied to attribute values as well as text."""
        markup = '<a href="http://a.com?a=b&c=é">e</a>'
        soup = self.soup(markup)
        a = soup.a

        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'

        # The default output and the explicit "minimal" output agree.
        self.assertEqual(expect_minimal, a.decode())
        self.assertEqual(expect_minimal, a.decode(formatter="minimal"))

        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
        self.assertEqual(expect_html, a.decode(formatter="html"))

        self.assertEqual(markup, a.decode(formatter=None))
        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

    def test_formatter_skips_script_tag_for_html_documents(self):
        """Angle brackets inside <script> are not escaped on output."""
        doc = """
 <script type="text/javascript">
 console.log("< < hey > > ");
 </script>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        self.assertIn(b"< < hey > >", encoded)

    def test_formatter_skips_style_tag_for_html_documents(self):
        """Angle brackets inside <style> are not escaped on output."""
        doc = """
 <style type="text/css">
 console.log("< < hey > > ");
 </style>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        self.assertIn(b"< < hey > >", encoded)

    def test_prettify_leaves_preformatted_text_alone(self):
        """prettify() reformats around <pre> but not inside it."""
        soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
        # Everything outside the <pre> tag is reformatted, but everything
        # inside is left alone.
        self.assertEqual(
            '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
            soup.div.prettify())

    def test_prettify_accepts_formatter(self):
        """prettify() applies a callable formatter to the text."""
        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
        pretty = soup.prettify(formatter=lambda x: x.upper())
        self.assertIn("FOO", pretty)

    def test_prettify_outputs_unicode_by_default(self):
        """prettify() with no encoding returns a text string."""
        soup = self.soup("<a></a>")
        self.assertEqual(str, type(soup.prettify()))

    def test_prettify_can_encode_data(self):
        """prettify() with an encoding returns bytes."""
        soup = self.soup("<a></a>")
        self.assertEqual(bytes, type(soup.prettify("utf-8")))

    def test_html_entity_substitution_off_by_default(self):
        """encode() leaves the e-with-acute as a character by default."""
        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        self.assertEqual(encoded, markup.encode('utf-8'))

    def test_encoding_substitution(self):
        """The charset in a <meta> tag follows the output encoding."""
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        self.assertEqual(soup.meta['content'], 'text/html; charset=x-sjis')

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        self.assertIn(b"charset=utf-8", utf_8)

        euc_jp = soup.encode("euc_jp")
        self.assertIn(b"charset=euc_jp", euc_jp)

        shift_jis = soup.encode("shift-jis")
        self.assertIn(b"charset=shift-jis", shift_jis)

        utf_16_u = soup.encode("utf-16").decode("utf-16")
        self.assertIn("charset=utf-16", utf_16_u)

    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        """Parsing succeeds when the strainer filters out the <meta> tag."""
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                  'http-equiv="Content-type"/></head><pre>foo</pre>')

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')
1523
class TestEncoding(SoupTest):
    """Tests for turning tree objects into encoded or decoded strings."""

    def test_unicode_string_can_be_encoded(self):
        """A string holding a snowman encodes like a plain str does."""
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        snowman = soup.b.string
        self.assertEqual(snowman.encode("utf-8"),
                         "\N{SNOWMAN}".encode("utf-8"))

    def test_tag_containing_unicode_string_can_be_encoded(self):
        """A tag wrapping a snowman encodes to the UTF-8 form of its markup."""
        html = "<b>\N{SNOWMAN}</b>"
        bold = self.soup(html).b
        self.assertEqual(
            bold.encode("utf-8"), html.encode("utf-8"))

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        """A character ASCII cannot hold becomes a numeric reference."""
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        self.assertEqual(bold.encode("ascii"), b"<b>&#9731;</b>")

    def test_encoding_can_be_made_strict(self):
        """errors="strict" raises instead of substituting."""
        soup = self.soup("<b>\N{SNOWMAN}</b>")
        with self.assertRaises(UnicodeEncodeError):
            soup.encode("ascii", errors="strict")

    def test_decode_contents(self):
        """decode_contents() returns the inner text without the tag."""
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        self.assertEqual("\N{SNOWMAN}", bold.decode_contents())

    def test_encode_contents(self):
        """encode_contents() returns the inner content as bytes."""
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        expected = "\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, bold.encode_contents(encoding="utf8"))

    def test_deprecated_renderContents(self):
        """renderContents() still returns the inner content as UTF-8 bytes."""
        bold = self.soup("<b>\N{SNOWMAN}</b>").b
        expected = "\N{SNOWMAN}".encode("utf8")
        self.assertEqual(expected, bold.renderContents())

    def test_repr(self):
        """repr() of the soup is checked separately per PY3K branch."""
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        if not PY3K:
            self.assertEqual(b'<b>\\u2603</b>', repr(soup))
        else:
            self.assertEqual(html, repr(soup))
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001575
class TestNavigableStringSubclasses(SoupTest):
    """Tests for CData, Doctype and Declaration objects."""

    def test_cdata(self):
        """A manually created CData is rendered wrapped in <![CDATA[...]]>."""
        # None of the current builders turn CDATA sections into CData
        # objects, so one is created and inserted by hand.
        soup = self.soup("")
        section = CData("foo")
        soup.insert(1, section)
        self.assertEqual(str(soup), "<![CDATA[foo]]>")
        self.assertEqual(soup.find(text="foo"), "foo")
        self.assertEqual(soup.contents[0], "foo")

    def test_cdata_is_never_formatted(self):
        """Text inside a CData object is passed into the formatter.

        But the return value is ignored.
        """

        self.count = 0

        def increment(*args):
            # Count the call and return a value that must not appear
            # in the output.
            self.count += 1
            return "BITTER FAILURE"

        soup = self.soup("")
        section = CData("<><><>")
        soup.insert(1, section)
        encoded = soup.encode(formatter=increment)
        self.assertEqual(b"<![CDATA[<><><>]]>", encoded)
        self.assertEqual(1, self.count)

    def test_doctype_ends_in_newline(self):
        """A Doctype is rendered followed by a newline."""
        # Other NavigableString subclasses do not get this trailing
        # newline.
        soup = self.soup("")
        soup.insert(1, Doctype("foo"))
        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")

    def test_declaration(self):
        """A Declaration is rendered between <? and ?>."""
        declaration = Declaration("foo")
        self.assertEqual("<?foo?>", declaration.output_ready())
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001617
class TestSoupSelector(TreeTest):
    """Tests for CSS selector support: select() and select_one().

    Every test runs against the document in HTML, which setUp() parses
    with the 'html.parser' builder. Most tests identify the matched
    tags by their 'id' attribute.
    """

    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>

<div id="footer">
</div>
"""

    def setUp(self):
        # In this class self.soup is the parsed HTML document itself.
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

    def assertSelects(self, selector, expected_ids):
        """Assert that a selector matches exactly the tags with the given IDs.

        Order is ignored: the expected and actual IDs are compared in
        sorted order. This overrides TreeTest.assertSelects, which
        takes a list of tags rather than a selector string.

        :param selector: A CSS selector string, passed to select().
        :param expected_ids: The 'id' values of the tags that the
            selector is expected to match.
        """
        # Sort copies instead of sorting in place, so that the list
        # passed in by the caller is never modified.
        expected = sorted(expected_ids)
        actual = sorted(el['id'] for el in self.soup.select(selector))
        self.assertEqual(
            expected, actual,
            "Selector %s, expected [%s], got [%s]" % (
                selector, ', '.join(expected), ', '.join(actual)
            )
        )

    assertSelect = assertSelects

    def assertSelectMultiple(self, *tests):
        """Run assertSelect on each (selector, expected_ids) pair."""
        for selector, expected_ids in tests:
            self.assertSelect(selector, expected_ids)

    def test_one_tag_one(self):
        els = self.soup.select('title')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].name, 'title')
        self.assertEqual(els[0].contents, ['The title'])

    def test_one_tag_many(self):
        els = self.soup.select('div')
        self.assertEqual(len(els), 4)
        for div in els:
            self.assertEqual(div.name, 'div')

        # select_one() returns only the first of the matching tags.
        el = self.soup.select_one('div')
        self.assertEqual('main', el['id'])

    def test_select_one_returns_none_if_no_match(self):
        match = self.soup.select_one('nonexistenttag')
        self.assertIsNone(match)

    def test_tag_in_tag_one(self):
        self.assertSelects('div div', ['inner', 'data1'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
            self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])

    def test_tag_no_match(self):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
        self.assertRaises(ValueError, self.soup.select, 'tag%t')

    def test_select_dashed_tag_ids(self):
        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])

    def test_select_dashed_by_id(self):
        dashed = self.soup.select('custom-dashed-tag[id="dash2"]')
        self.assertEqual(dashed[0].name, 'custom-dashed-tag')
        self.assertEqual(dashed[0]['id'], 'dash2')

    def test_dashed_tag_text(self):
        self.assertEqual(
            self.soup.select('body > custom-dashed-tag')[0].text,
            'Hello there.')

    def test_select_dashed_matches_find_all(self):
        self.assertEqual(
            self.soup.select('custom-dashed-tag'),
            self.soup.find_all('custom-dashed-tag'))

    def test_header_tags(self):
        self.assertSelectMultiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            self.assertEqual(len(els), 1)
            self.assertEqual(els[0].name, 'p')
            self.assertEqual(els[0]['class'], ['onep'])

    def test_class_mismatched_tag(self):
        els = self.soup.select('div.onep')
        self.assertEqual(len(els), 0)

    def test_one_id(self):
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assertSelects(selector, ['inner'])

    def test_bad_id(self):
        els = self.soup.select('#doesnotexist')
        self.assertEqual(len(els), 0)

    def test_items_in_id(self):
        els = self.soup.select('div#inner p')
        self.assertEqual(len(els), 3)
        for el in els:
            self.assertEqual(el.name, 'p')
        self.assertEqual(els[1]['class'], ['onep'])
        self.assertFalse(els[0].has_attr('class'))

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            self.assertEqual(len(self.soup.select(selector)), 0)

    def test_multi_class_support(self):
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
                         '.class3', 'p.class3', 'html p.class2',
                         'div#inner .class2'):
            self.assertSelects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assertSelects(selector, ['pmulti'])

    def test_child_selector(self):
        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
        self.assertSelects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        self.assertSelectMultiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        self.assertSelectMultiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
            ('div[data-tag^="dashed"]', ['data1']),
        )

    def test_attribute_endswith(self):
        self.assertSelectMultiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1',
                           's1a2s1', 'dash1']),
            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        self.assertSelectMultiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2',
                           's2a1', 's1a2s1', 'dash1']),
            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
            ('div[data-tag*="edval"]', ['data1']),
        )

    def test_attribute_exact_or_hypen(self):
        self.assertSelectMultiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        self.assertSelectMultiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
            ('div[data-tag]', ['data1']),
        )

    def test_unsupported_pseudoclass(self):
        self.assertRaises(
            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")

        self.assertRaises(
            NotImplementedError, self.soup.select, "a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Some text')

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Another')

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        self.assertEqual(len(els), 0)

        # Pass in an invalid value.
        self.assertRaises(
            ValueError, self.soup.select, 'div p:nth-of-type(0)')

    def test_nth_of_type_direct_descendant(self):
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        self.assertEqual(len(els), 1)
        self.assertEqual(els[0].string, 'Some text')

    def test_id_child_selector_nth_of_type(self):
        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        main = self.soup.find("div", id="main")
        selected = main.select("div")
        # The <div id="inner"> and <div id="data1"> tags, which are
        # inside <div id="main">, were selected. The <div id="footer">
        # tag was not.
        self.assertSelectsIDs(selected, ['inner', 'data1'])

    def test_overspecified_child_id(self):
        self.assertSelects(".fancy #inner", ['inner'])
        self.assertSelects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        self.assertSelects('#p1 + h2', ['header2'])
        self.assertSelects('#p1 + h2 + p', ['pmulti'])
        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
        self.assertEqual([], self.soup.select('#p1 + p'))

    def test_general_sibling_selector(self):
        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
        self.assertSelects('#p1 ~ #header2', ['header2'])
        self.assertSelects('#p1 ~ h2 + a', ['me'])
        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
        self.assertEqual([], self.soup.select('#inner ~ h2'))

    def test_dangling_combinator(self):
        self.assertRaises(ValueError, self.soup.select, 'h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assertSelects(
            'p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

    # Test the selector grouping operator (the comma)
    def test_multiple_select(self):
        self.assertSelects('x, y', ['xid', 'yid'])

    def test_multiple_select_with_no_space(self):
        self.assertSelects('x,y', ['xid', 'yid'])

    def test_multiple_select_with_more_space(self):
        # Several spaces after the comma, so that this covers a case
        # test_multiple_select does not.
        self.assertSelects('x,    y', ['xid', 'yid'])

    def test_multiple_select_duplicated(self):
        self.assertSelects('x, x', ['xid'])

    def test_multiple_select_sibling(self):
        self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])

    def test_multiple_select_tag_and_direct_descendant(self):
        self.assertSelects('x, y > z', ['xid', 'zidb'])

    def test_multiple_select_direct_descendant_and_tags(self):
        self.assertSelects(
            'div > x, y, z',
            ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_multiple_select_indirect_descendant(self):
        self.assertSelects(
            'div x,y, z',
            ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        self.assertRaises(ValueError, self.soup.select, ',x, y')
        self.assertRaises(ValueError, self.soup.select, 'x,,y')

    def test_multiple_select_attrs(self):
        self.assertSelects(
            'p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])

    def test_multiple_select_ids(self):
        self.assertSelects(
            'x, y > z[id=zida], z[id=zidab], z[id=zidb]',
            ['xid', 'zidb', 'zidab'])

    def test_multiple_select_nested(self):
        self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
2003
2004