blob: 68be42d138b68fadc65f23bcab8c90edd465c44d [file] [log] [blame]
Patrick Williamsc0f7c042017-02-23 20:41:17 -06001__license__ = "MIT"
2
Brad Bishop19323692019-04-05 15:28:33 -04003import collections.abc
Patrick Williamsc124f4f2015-09-15 14:41:29 -05004import re
5import sys
6import warnings
7from bs4.dammit import EntitySubstitution
8
# Encoding handed to str.__new__ by NavigableString.__new__ when it is
# given a value that is not already a str.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the running interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches a run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")
Patrick Williamsc124f4f2015-09-15 14:41:29 -050013
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    Returns a property whose getter reads ``attr`` from the instance and
    whose setter writes the assigned value to ``attr``.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter receives the assigned value; forward it so
        # that assigning through the alias updates the real attribute.
        setattr(self, attr, value)
    return alias
24
25
class NamespacedAttribute(str):
    """A string that also remembers its prefix, local name and namespace.

    The string value is ``prefix`` alone when ``name`` is None, ``name``
    alone when ``prefix`` is None, and ``prefix:name`` otherwise.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
40
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This base class adds no behaviour of its own; it is the common
    parent of the attribute-value classes that override encode().
    """
43
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    Parsing '<meta charset="utf8">' produces one of these objects as
    the value of the 'charset' attribute.
    """

    def __new__(cls, original_value):
        instance = str.__new__(cls, original_value)
        # Keep the value exactly as it appeared in the markup.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return ``encoding`` itself in place of the original value."""
        return encoding
58
59
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'content' attribute.

    For markup such as:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    the value of the 'content' attribute will be one of these objects.
    """

    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        if cls.CHARSET_RE.search(original_value) is None:
            # Nothing to substitute: hand back an ordinary str.
            return str.__new__(str, original_value)

        instance = str.__new__(cls, original_value)
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return the original value with its charset replaced by
        ``encoding``."""
        return self.CHARSET_RE.sub(
            lambda match: match.group(1) + encoding, self.original_value)
85
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution rules that know about some HTML quirks.

    A NavigableString whose parent is a <script> or <style> tag is
    returned unchanged; everything else is passed to the underlying
    EntitySubstitution function.
    """

    cdata_containing_tags = {"script", "style"}

    preformatted_tags = {"pre"}

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return ``f(ns)`` unless ``ns`` is a direct child of a tag
        named in ``cdata_containing_tags``, in which case return ``ns``."""
        protected = (isinstance(ns, NavigableString)
                     and ns.parent is not None
                     and ns.parent.name in cls.cdata_containing_tags)
        if protected:
            return ns
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        substitute = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, substitute)
120
121class PageElement(object):
122 """Contains the navigational information for some part of the page
123 (either a tag or a piece of text)"""
124
    # There are four possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
127 #
128 # "html" - All Unicode characters with corresponding HTML entities
129 # are converted to those entities on output.
130 # "minimal" - Bare ampersands and angle brackets are converted to
131 # XML entities: &amp; &lt; &gt;
132 # None - The null formatter. Unicode characters are never
133 # converted to entities. This is not recommended, but it's
134 # faster than "minimal".
135 # A function - This function will be called on every string that
136 # needs to undergo entity substitution.
137 #
138
139 # In an HTML document, the default "html" and "minimal" functions
140 # will leave the contents of <script> and <style> tags alone. For
141 # an XML document, all tags will be given the same treatment.
142
    # Formatter name -> function, used by _formatter_for_name() when the
    # element is not part of an XML tree. These go through
    # HTMLAwareEntitySubstitution.
    HTML_FORMATTERS = {
        "html" : HTMLAwareEntitySubstitution.substitute_html,
        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
        None : None
        }

    # Formatter name -> function, used by _formatter_for_name() when the
    # element is part of an XML tree. These call EntitySubstitution
    # directly.
    XML_FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }
154
155 def format_string(self, s, formatter='minimal'):
156 """Format the given string using the given formatter."""
Brad Bishop19323692019-04-05 15:28:33 -0400157 if not isinstance(formatter, collections.abc.Callable):
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500158 formatter = self._formatter_for_name(formatter)
159 if formatter is None:
160 output = s
161 else:
162 output = formatter(s)
163 return output
164
165 @property
166 def _is_xml(self):
167 """Is this element part of an XML tree or an HTML tree?
168
169 This is used when mapping a formatter name ("minimal") to an
170 appropriate function (one that performs entity-substitution on
171 the contents of <script> and <style> tags, or not). It's
172 inefficient, but it should be called very rarely.
173 """
174 if self.parent is None:
175 # This is the top-level object. It should have .is_xml set
176 # from tree creation. If not, take a guess--BS is usually
177 # used on HTML markup.
178 return getattr(self, 'is_xml', False)
179 return self.parent._is_xml
180
181 def _formatter_for_name(self, name):
182 "Look up a formatter function based on its name and the tree."
183 if self._is_xml:
184 return self.XML_FORMATTERS.get(
185 name, EntitySubstitution.substitute_xml)
186 else:
187 return self.HTML_FORMATTERS.get(
188 name, HTMLAwareEntitySubstitution.substitute_xml)
189
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600190 def setup(self, parent=None, previous_element=None, next_element=None,
191 previous_sibling=None, next_sibling=None):
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500192 """Sets up the initial relations between this element and
193 other elements."""
194 self.parent = parent
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600195
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500196 self.previous_element = previous_element
197 if previous_element is not None:
198 self.previous_element.next_element = self
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600199
200 self.next_element = next_element
201 if self.next_element:
202 self.next_element.previous_element = self
203
204 self.next_sibling = next_sibling
205 if self.next_sibling:
206 self.next_sibling.previous_sibling = self
207
208 if (not previous_sibling
209 and self.parent is not None and self.parent.contents):
210 previous_sibling = self.parent.contents[-1]
211
212 self.previous_sibling = previous_sibling
213 if previous_sibling:
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500214 self.previous_sibling.next_sibling = self
215
    # camelCase spellings kept for code written against BS3; each one is
    # a property built by _alias() around the snake_case attribute.
    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3
218
219 def replace_with(self, replace_with):
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600220 if not self.parent:
221 raise ValueError(
222 "Cannot replace one element with another when the"
223 "element to be replaced is not part of a tree.")
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500224 if replace_with is self:
225 return
226 if replace_with is self.parent:
227 raise ValueError("Cannot replace a Tag with its parent.")
228 old_parent = self.parent
229 my_index = self.parent.index(self)
230 self.extract()
231 old_parent.insert(my_index, replace_with)
232 return self
233 replaceWith = replace_with # BS3
234
235 def unwrap(self):
236 my_parent = self.parent
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600237 if not self.parent:
238 raise ValueError(
239 "Cannot replace an element with its contents when that"
240 "element is not part of a tree.")
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500241 my_index = self.parent.index(self)
242 self.extract()
243 for child in reversed(self.contents[:]):
244 my_parent.insert(my_index, child)
245 return self
246 replace_with_children = unwrap
247 replaceWithChildren = unwrap # BS3
248
249 def wrap(self, wrap_inside):
250 me = self.replace_with(wrap_inside)
251 wrap_inside.append(me)
252 return wrap_inside
253
    def extract(self):
        """Destructively rips this element out of the tree.

        The element is removed from its parent's contents, its
        descendants stay attached to it, and the elements on either side
        are linked to each other. Returns this element.
        """
        # Drop this element from its parent's list of children.
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Cut the outward-pointing element links at both ends.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Link the former siblings to each other.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
282
283 def _last_descendant(self, is_initialized=True, accept_self=True):
284 "Finds the last element beneath this object to be parsed."
285 if is_initialized and self.next_sibling:
286 last_child = self.next_sibling.previous_element
287 else:
288 last_child = self
289 while isinstance(last_child, Tag) and last_child.contents:
290 last_child = last_child.contents[-1]
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600291 if not accept_self and last_child is self:
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500292 last_child = None
293 return last_child
294 # BS3: Not part of the API!
295 _lastRecursiveChild = _last_descendant
296
    def insert(self, position, new_child):
        """Insert ``new_child`` into this element's contents at ``position``.

        A plain str is wrapped in a NavigableString first. ``position``
        is capped at the current number of children. A child that
        already has a parent is extracted from it before being inserted.

        Raises ValueError if ``new_child`` is None or is this element.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: it has no previous sibling, and the element
            # just before it is this element.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the ancestors looking for the first one that has
            # a next sibling.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        # Make the following element point back at the inserted subtree.
        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)
362
363 def append(self, tag):
364 """Appends the given tag to the contents of this tag."""
365 self.insert(len(self.contents), tag)
366
367 def insert_before(self, predecessor):
368 """Makes the given element the immediate predecessor of this one.
369
370 The two elements will have the same parent, and the given element
371 will be immediately before this one.
372 """
373 if self is predecessor:
374 raise ValueError("Can't insert an element before itself.")
375 parent = self.parent
376 if parent is None:
377 raise ValueError(
378 "Element has no parent, so 'before' has no meaning.")
379 # Extract first so that the index won't be screwed up if they
380 # are siblings.
381 if isinstance(predecessor, PageElement):
382 predecessor.extract()
383 index = parent.index(self)
384 parent.insert(index, predecessor)
385
386 def insert_after(self, successor):
387 """Makes the given element the immediate successor of this one.
388
389 The two elements will have the same parent, and the given element
390 will be immediately after this one.
391 """
392 if self is successor:
393 raise ValueError("Can't insert an element after itself.")
394 parent = self.parent
395 if parent is None:
396 raise ValueError(
397 "Element has no parent, so 'after' has no meaning.")
398 # Extract first so that the index won't be screwed up if they
399 # are siblings.
400 if isinstance(successor, PageElement):
401 successor.extract()
402 index = parent.index(self)
403 parent.insert(index+1, successor)
404
405 def find_next(self, name=None, attrs={}, text=None, **kwargs):
406 """Returns the first item that matches the given criteria and
407 appears after this Tag in the document."""
408 return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
409 findNext = find_next # BS3
410
411 def find_all_next(self, name=None, attrs={}, text=None, limit=None,
412 **kwargs):
413 """Returns all items that match the given criteria and appear
414 after this Tag in the document."""
415 return self._find_all(name, attrs, text, limit, self.next_elements,
416 **kwargs)
417 findAllNext = find_all_next # BS3
418
419 def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
420 """Returns the closest sibling to this Tag that matches the
421 given criteria and appears after this Tag in the document."""
422 return self._find_one(self.find_next_siblings, name, attrs, text,
423 **kwargs)
424 findNextSibling = find_next_sibling # BS3
425
426 def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
427 **kwargs):
428 """Returns the siblings of this Tag that match the given
429 criteria and appear after this Tag in the document."""
430 return self._find_all(name, attrs, text, limit,
431 self.next_siblings, **kwargs)
432 findNextSiblings = find_next_siblings # BS3
433 fetchNextSiblings = find_next_siblings # BS2
434
435 def find_previous(self, name=None, attrs={}, text=None, **kwargs):
436 """Returns the first item that matches the given criteria and
437 appears before this Tag in the document."""
438 return self._find_one(
439 self.find_all_previous, name, attrs, text, **kwargs)
440 findPrevious = find_previous # BS3
441
442 def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
443 **kwargs):
444 """Returns all items that match the given criteria and appear
445 before this Tag in the document."""
446 return self._find_all(name, attrs, text, limit, self.previous_elements,
447 **kwargs)
448 findAllPrevious = find_all_previous # BS3
449 fetchPrevious = find_all_previous # BS2
450
451 def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
452 """Returns the closest sibling to this Tag that matches the
453 given criteria and appears before this Tag in the document."""
454 return self._find_one(self.find_previous_siblings, name, attrs, text,
455 **kwargs)
456 findPreviousSibling = find_previous_sibling # BS3
457
458 def find_previous_siblings(self, name=None, attrs={}, text=None,
459 limit=None, **kwargs):
460 """Returns the siblings of this Tag that match the given
461 criteria and appear before this Tag in the document."""
462 return self._find_all(name, attrs, text, limit,
463 self.previous_siblings, **kwargs)
464 findPreviousSiblings = find_previous_siblings # BS3
465 fetchPreviousSiblings = find_previous_siblings # BS2
466
467 def find_parent(self, name=None, attrs={}, **kwargs):
468 """Returns the closest parent of this Tag that matches the given
469 criteria."""
470 # NOTE: We can't use _find_one because findParents takes a different
471 # set of arguments.
472 r = None
473 l = self.find_parents(name, attrs, 1, **kwargs)
474 if l:
475 r = l[0]
476 return r
477 findParent = find_parent # BS3
478
479 def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
480 """Returns the parents of this Tag that match the given
481 criteria."""
482
483 return self._find_all(name, attrs, None, limit, self.parents,
484 **kwargs)
485 findParents = find_parents # BS3
486 fetchParents = find_parents # BS2
487
    @property
    def next(self):
        """Read-only shorthand for ``next_element``."""
        return self.next_element
491
    @property
    def previous(self):
        """Read-only shorthand for ``previous_element``."""
        return self.previous_element
495
496 #These methods do the real heavy lifting.
497
498 def _find_one(self, method, name, attrs, text, **kwargs):
499 r = None
500 l = method(name, attrs, text, 1, **kwargs)
501 if l:
502 r = l[0]
503 return r
504
505 def _find_all(self, name, attrs, text, limit, generator, **kwargs):
506 "Iterates over a generator looking for things that match."
507
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600508 if text is None and 'string' in kwargs:
509 text = kwargs['string']
510 del kwargs['string']
511
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500512 if isinstance(name, SoupStrainer):
513 strainer = name
514 else:
515 strainer = SoupStrainer(name, attrs, text, **kwargs)
516
517 if text is None and not limit and not attrs and not kwargs:
518 if name is True or name is None:
519 # Optimization to find all tags.
520 result = (element for element in generator
521 if isinstance(element, Tag))
522 return ResultSet(strainer, result)
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600523 elif isinstance(name, str):
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500524 # Optimization to find all tags with a given name.
525 result = (element for element in generator
526 if isinstance(element, Tag)
527 and element.name == name)
528 return ResultSet(strainer, result)
529 results = ResultSet(strainer)
530 while True:
531 try:
532 i = next(generator)
533 except StopIteration:
534 break
535 if i:
536 found = strainer.search(i)
537 if found:
538 results.append(found)
539 if limit and len(results) >= limit:
540 break
541 return results
542
543 #These generators can be used to navigate starting from both
544 #NavigableStrings and Tags.
545 @property
546 def next_elements(self):
547 i = self.next_element
548 while i is not None:
549 yield i
550 i = i.next_element
551
552 @property
553 def next_siblings(self):
554 i = self.next_sibling
555 while i is not None:
556 yield i
557 i = i.next_sibling
558
559 @property
560 def previous_elements(self):
561 i = self.previous_element
562 while i is not None:
563 yield i
564 i = i.previous_element
565
566 @property
567 def previous_siblings(self):
568 i = self.previous_sibling
569 while i is not None:
570 yield i
571 i = i.previous_sibling
572
573 @property
574 def parents(self):
575 i = self.parent
576 while i is not None:
577 yield i
578 i = i.parent
579
580 # Methods for supporting CSS selectors.
581
    # Matches a bare tag name: a letter or digit followed by any number
    # of letters, digits, '-', '.', ':' or '_'.
    tag_name_re = re.compile(r'^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')

    # Matches an attribute selector such as tag[attribute="value"].
    # Named groups:
    #   tag       - optional tag name in front of the bracket
    #   attribute - attribute name (word characters and '-')
    #   operator  - optional one of = ~ | ^ $ *
    #   value     - everything up to the closing bracket, with one
    #               optional pair of double quotes left out
    attribselect_re = re.compile(
        r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )
595
596 def _attr_value_as_string(self, value, default=None):
597 """Force an attribute value into a string representation.
598
599 A multi-valued attribute will be converted into a
600 space-separated stirng.
601 """
602 value = self.get(value, default)
603 if isinstance(value, list) or isinstance(value, tuple):
604 value =" ".join(value)
605 return value
606
607 def _tag_name_matches_and(self, function, tag_name):
608 if not tag_name:
609 return function
610 else:
611 def _match(tag):
612 return tag.name == tag_name and function(tag)
613 return _match
614
615 def _attribute_checker(self, operator, attribute, value=''):
616 """Create a function that performs a CSS selector operation.
617
618 Takes an operator, attribute and optional value. Returns a
619 function that will return True for elements that match that
620 combination.
621 """
622 if operator == '=':
623 # string representation of `attribute` is equal to `value`
624 return lambda el: el._attr_value_as_string(attribute) == value
625 elif operator == '~':
626 # space-separated list representation of `attribute`
627 # contains `value`
628 def _includes_value(element):
629 attribute_value = element.get(attribute, [])
630 if not isinstance(attribute_value, list):
631 attribute_value = attribute_value.split()
632 return value in attribute_value
633 return _includes_value
634 elif operator == '^':
635 # string representation of `attribute` starts with `value`
636 return lambda el: el._attr_value_as_string(
637 attribute, '').startswith(value)
638 elif operator == '$':
639 # string represenation of `attribute` ends with `value`
640 return lambda el: el._attr_value_as_string(
641 attribute, '').endswith(value)
642 elif operator == '*':
643 # string representation of `attribute` contains `value`
644 return lambda el: value in el._attr_value_as_string(attribute, '')
645 elif operator == '|':
646 # string representation of `attribute` is either exactly
647 # `value` or starts with `value` and then a dash.
648 def _is_or_starts_with_dash(element):
649 attribute_value = element._attr_value_as_string(attribute, '')
650 return (attribute_value == value or attribute_value.startswith(
651 value + '-'))
652 return _is_or_starts_with_dash
653 else:
654 return lambda el: el.has_attr(attribute)
655
656 # Old non-property versions of the generators, for backwards
657 # compatibility with BS3.
    def nextGenerator(self):
        """BS3-style method form of the ``next_elements`` property."""
        return self.next_elements
660
    def nextSiblingGenerator(self):
        """BS3-style method form of the ``next_siblings`` property."""
        return self.next_siblings
663
    def previousGenerator(self):
        """BS3-style method form of the ``previous_elements`` property."""
        return self.previous_elements
666
    def previousSiblingGenerator(self):
        """BS3-style method form of the ``previous_siblings`` property."""
        return self.previous_siblings
669
    def parentGenerator(self):
        """BS3-style method form of the ``parents`` property."""
        return self.parents
672
673
class NavigableString(str, PageElement):
    """A str that is also a PageElement, so it carries tree links.

    PREFIX and SUFFIX are placed around the formatted text by
    output_ready(); both are empty here.
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't
        know how to handle non-ASCII characters.
        """
        # Only a non-str value needs the encoding argument.
        extra = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *extra)
        instance.setup()
        return instance

    def __copy__(self):
        """A copy of a NavigableString has the same contents and class
        as the original, but it is not connected to the parse tree.
        """
        same_class = type(self)
        return same_class(self)

    def __getnewargs__(self):
        plain_text = str(self)
        return (plain_text,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted text wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None for a NavigableString."""
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
725
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string is still handed to the formatter (to trigger side
    effects), but whatever the formatter returns is thrown away.
    """

    def output_ready(self, formatter="minimal"):
        """Call the formatter on this string, then return the
        unformatted string wrapped in PREFIX and SUFFIX."""
        self.format_string(self, formatter)  # result deliberately unused
        opening, closing = self.PREFIX, self.SUFFIX
        return opening + self + closing
738
class CData(PreformattedString):
    """A PreformattedString written out between '<![CDATA[' and ']]>'."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500743
class ProcessingInstruction(PreformattedString):
    """A PreformattedString written out between '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500748
class Comment(PreformattedString):
    """A PreformattedString written out between '<!--' and '-->'."""

    PREFIX = '<!--'
    SUFFIX = '-->'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500753
754
class Declaration(PreformattedString):
    """A PreformattedString written out between '<?' and '?>'."""
    PREFIX = '<?'
    SUFFIX = '?>'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500758
759
class Doctype(PreformattedString):
    """A PreformattedString written out between '<!DOCTYPE ' and '>'
    followed by a newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional public/system ids.

        A public id adds ' PUBLIC "<pub_id>"', followed by the quoted
        system id when there is one. A system id on its own adds
        ' SYSTEM "<system_id>"'.
        """
        pieces = [name or '']
        if pub_id is not None:
            pieces.append(' PUBLIC "%s"' % pub_id)
            if system_id is not None:
                pieces.append(' "%s"' % system_id)
        elif system_id is not None:
            pieces.append(' SYSTEM "%s"' % system_id)

        return Doctype(''.join(pieces))

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500776
777
778class Tag(PageElement):
779
780 """Represents a found HTML tag with its attributes and contents."""
781
782 def __init__(self, parser=None, builder=None, name=None, namespace=None,
783 prefix=None, attrs=None, parent=None, previous=None):
784 "Basic constructor."
785
786 if parser is None:
787 self.parser_class = None
788 else:
789 # We don't actually store the parser object: that lets extracted
790 # chunks be garbage-collected.
791 self.parser_class = parser.__class__
792 if name is None:
793 raise ValueError("No value provided for new tag's name.")
794 self.name = name
795 self.namespace = namespace
796 self.prefix = prefix
797 if attrs is None:
798 attrs = {}
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600799 elif attrs:
800 if builder is not None and builder.cdata_list_attributes:
801 attrs = builder._replace_cdata_list_attribute_values(
802 self.name, attrs)
803 else:
804 attrs = dict(attrs)
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500805 else:
806 attrs = dict(attrs)
807 self.attrs = attrs
808 self.contents = []
809 self.setup(parent, previous)
810 self.hidden = False
811
812 # Set up any substitutions, such as the charset in a META tag.
813 if builder is not None:
814 builder.set_up_substitutions(self)
815 self.can_be_empty_element = builder.can_be_empty_element(name)
816 else:
817 self.can_be_empty_element = False
818
    # camelCase spelling kept for code written against BS3; a property
    # built by _alias() around ``parser_class``.
    parserClass = _alias("parser_class")  # BS3
820
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600821 def __copy__(self):
822 """A copy of a Tag is a new Tag, unconnected to the parse tree.
823 Its contents are a copy of the old Tag's contents.
824 """
825 clone = type(self)(None, self.builder, self.name, self.namespace,
826 self.nsprefix, self.attrs)
827 for attr in ('can_be_empty_element', 'hidden'):
828 setattr(clone, attr, getattr(self, attr))
829 for child in self.contents:
830 clone.append(child.__copy__())
831 return clone
832
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500833 @property
834 def is_empty_element(self):
835 """Is this tag an empty-element tag? (aka a self-closing tag)
836
837 A tag that has contents is never an empty-element tag.
838
839 A tag that has no contents may or may not be an empty-element
840 tag. It depends on the builder used to create the tag. If the
841 builder has a designated list of empty-element tags, then only
842 a tag whose name shows up in that list is considered an
843 empty-element tag.
844
845 If the builder has no designated list of empty-element tags,
846 then any tag with no contents is an empty-element tag.
847 """
848 return len(self.contents) == 0 and self.can_be_empty_element
849 isSelfClosing = is_empty_element # BS3
850
@property
def string(self):
    """Convenience property for the single string inside this tag.

    :Return: None unless this tag has exactly one child. If that child
     is a NavigableString it is returned; otherwise the child's own
     'string' attribute is returned, recursively.
    """
    children = self.contents
    if len(children) != 1:
        return None
    only_child = children[0]
    if not isinstance(only_child, NavigableString):
        # A lone child tag: defer to whatever its .string is.
        return only_child.string
    return only_child

@string.setter
def string(self, string):
    """Replace this tag's contents with a fresh copy of the given string."""
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
872
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Generate the strings found among this tag's descendants.

    :param strip: if true, each string is stripped and strings that end
        up empty are skipped.
    :param types: exact classes to accept (subclasses do not count). The
        default accepts only NavigableString and CData, so no comments,
        processing instructions, etc. Pass None to accept every
        NavigableString instance.
    """
    for node in self.descendants:
        if types is None:
            wanted = isinstance(node, NavigableString)
        else:
            # Deliberately an exact-type test, not isinstance().
            wanted = type(node) in types
        if not wanted:
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node

strings = property(_all_strings)
892
@property
def stripped_strings(self):
    """Generate the strings from _all_strings() with stripping turned on."""
    yield from self._all_strings(True)
897
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Join the strings beneath this tag into one string.

    :param separator: text placed between consecutive strings.
    :param strip: passed to _all_strings(); strips each string and drops
        the empty ones.
    :param types: passed to _all_strings() to choose which string
        classes are included.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
getText = get_text
text = property(get_text)
907
def decompose(self):
    """Extract this element, then destroy it and everything after it in
    its ``next_element`` chain."""
    self.extract()
    node = self
    while node is not None:
        # Remember the successor first: clearing __dict__ erases the link.
        following = node.next_element
        node.__dict__.clear()
        node.contents = []
        node = following
917
def clear(self, decompose=False):
    """
    Remove every child from this tag.

    Children are extracted. When ``decompose`` is true, children that
    are Tags are decomposed instead; other children are still extracted.
    """
    # Iterate over a snapshot: removing a child changes self.contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
931
def index(self, element):
    """
    Return the position of a child, comparing by identity rather than
    equality, so that equal-but-distinct children are not confused the
    way tag.contents.index(element) would confuse them.

    Raises ValueError when the element is not a child of this tag.
    """
    position = next(
        (i for i, child in enumerate(self.contents) if child is element),
        None)
    if position is None:
        raise ValueError("Tag.index: element not in tag")
    return position
941
def get(self, key, default=None):
    """Look up the 'key' attribute of this tag, returning 'default'
    when the tag has no such attribute."""
    attributes = self.attrs
    return attributes.get(key, default)
947
def has_attr(self, key):
    """Report whether this tag has an attribute named 'key'."""
    attributes = self.attrs
    return key in attributes
950
def __hash__(self):
    """Hash a tag by hashing its string rendering."""
    return hash(str(self))
953
def __getitem__(self, key):
    """Return the value of the 'key' attribute (tag[key]).

    Raises whatever the attribute mapping raises when the key is absent.
    """
    attributes = self.attrs
    return attributes[key]
958
def __iter__(self):
    "Iterate over this tag's contents."
    children = self.contents
    return iter(children)
962
def __len__(self):
    "A tag is as long as its list of contents."
    children = self.contents
    return len(children)
966
def __contains__(self, x):
    "``x in tag`` tests membership in this tag's contents."
    children = self.contents
    return x in children
969
def __bool__(self):
    "A tag is always truthy, even when it has no contents."
    # Without this, __len__ would make an empty tag falsy.
    return True
973
def __setitem__(self, key, value):
    """Assigning to tag[key] stores 'value' as the tag's 'key'
    attribute."""
    attributes = self.attrs
    attributes[key] = value
978
def __delitem__(self, key):
    "``del tag[key]`` removes the 'key' attribute; a missing key is ignored."
    attributes = self.attrs
    attributes.pop(key, None)
982
def __call__(self, *args, **kwargs):
    """Call the tag as a shortcut for find_all().

    All arguments are forwarded unchanged, e.g. tag('a') is the same as
    tag.find_all('a').
    """
    search = self.find_all
    return search(*args, **kwargs)
988
989 def __getattr__(self, tag):
990 #print "Getattr %s.%s" % (self.__class__, tag)
991 if len(tag) > 3 and tag.endswith('Tag'):
992 # BS3: soup.aTag -> "soup.find("a")
993 tag_name = tag[:-3]
994 warnings.warn(
995 '.%sTag is deprecated, use .find("%s") instead.' % (
996 tag_name, tag_name))
997 return self.find(tag_name)
998 # We special case contents to avoid recursion.
999 elif not tag.startswith("__") and not tag=="contents":
1000 return self.find(tag)
1001 raise AttributeError(
1002 "'%s' object has no attribute '%s'" % (self.__class__, tag))
1003
def __eq__(self, other):
    """Two tags are equal iff they have the same name, the same
    attributes, and (recursively) the same contents."""
    if self is other:
        return True
    # Anything lacking the tag-like attributes cannot be equal.
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False
    if (self.name != other.name
            or self.attrs != other.attrs
            or len(self) != len(other)):
        return False
    their_children = other.contents
    return all(
        not (mine != their_children[position])
        for position, mine in enumerate(self.contents))
1020
def __ne__(self, other):
    """The negation of __eq__: true iff the two tags are not equal."""
    are_equal = self == other
    return not are_equal
1025
def __repr__(self, encoding="unicode-escape"):
    """Render this tag as a string.

    Under PY3K the result is the Unicode text from decode(). Otherwise
    it is a bytestring produced by encode() with the given encoding; by
    convention a __repr__ bytestring should be ASCII.
    """
    if not PY3K:
        return self.encode(encoding)
    return self.decode()
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001036
def __unicode__(self):
    """Return the Unicode rendering of this tag, as produced by decode()."""
    rendered = self.decode()
    return rendered
1039
def __str__(self):
    """Render this tag: decode() text under PY3K, encode() bytes otherwise."""
    return self.decode() if PY3K else self.encode()

# Under PY3K, __str__ and __repr__ are both pointed at __unicode__.
if PY3K:
    __str__ = __repr__ = __unicode__
1048
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render this tag and its contents as a bytestring.

    :param encoding: the target encoding; also handed to decode() as
        the eventual encoding.
    :param indent_level: passed through to decode().
    :param formatter: passed through to decode().
    :param errors: error-handling scheme for the final str.encode() call.
    """
    # Build the Unicode rendering first, then encode it in one step.
    return self.decode(indent_level, encoding, formatter).encode(
        encoding, errors)
1056
def _should_pretty_print(self, indent_level):
    """Decide whether this tag should be pretty-printed.

    Never when indent_level is None. Otherwise the tag qualifies if its
    name is not one of the HTML preformatted tags, or if it is XML.
    """
    if indent_level is None:
        return False
    return (
        self.name not in HTMLAwareEntitySubstitution.preformatted_tags
        or self._is_xml)
1063
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render this tag and its contents as a Unicode string.

    :param indent_level: None for compact output; otherwise the
        indentation level used when pretty-printing.

    :param eventual_encoding: The encoding the result is destined
        for. This method does _not_ perform that encoding; the value
        is only needed so it can be substituted into attribute
        values that mention the document's encoding (such as a
        <META> tag's charset).

    :param formatter: a callable, or the name of a formatter that
        self._formatter_for_name() can resolve.
    """
    # Resolve a named formatter once, up front, so the lookup does
    # not happen again for every string.
    if not isinstance(formatter, collections.abc.Callable):
        formatter = self._formatter_for_name(formatter)

    rendered_attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                # Valueless attribute: emit the bare key.
                rendered_attrs.append(key)
                continue
            if isinstance(val, (list, tuple)):
                val = ' '.join(val)
            elif not isinstance(val, str):
                val = str(val)
            elif (eventual_encoding is not None
                  and isinstance(val, AttributeValueWithCharsetSubstitution)):
                val = val.encode(eventual_encoding)

            text = self.format_string(val, formatter)
            rendered_attrs.append(
                str(key) + '='
                + EntitySubstitution.quoted_attribute_value(text))

    prefix = self.prefix + ":" if self.prefix else ''

    # An empty-element tag closes itself and has no separate end tag.
    if self.is_empty_element:
        close, closeTag = '/', ''
    else:
        close, closeTag = '', '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    if indent_level is None:
        indent_space = ''
    else:
        indent_space = ' ' * (indent_level - 1)
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        space = ''
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object: only its contents show.
        return contents

    pieces = []
    attribute_string = (' ' + ' '.join(rendered_attrs)) if rendered_attrs else ''
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print:
        if contents and contents[-1] != "\n":
            pieces.append("\n")
        if closeTag:
            pieces.append(space)
    pieces.append(closeTag)
    if indent_level is not None and closeTag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)
1156
def prettify(self, encoding=None, formatter="minimal"):
    """Return a pretty-printed rendering of this tag.

    With no encoding the result comes from decode(); otherwise it comes
    from encode() with that encoding. Either way pretty-printing is
    switched on by passing True as the indent level.
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1162
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Render the contents of this tag as a Unicode string.

    :param indent_level: None for compact output; otherwise the
        indentation level used for each line of the rendering.

    :param eventual_encoding: The encoding the result is destined
        for. This method does _not_ perform that encoding; the value
        is passed down to child tags so they can substitute it where
        the document mentions its own encoding.

    :param formatter: a callable, or the name of a formatter that
        self._formatter_for_name() can resolve.
    """
    # Resolve a named formatter once, up front, so the lookup does
    # not happen again for every child.
    if not isinstance(formatter, collections.abc.Callable):
        formatter = self._formatter_for_name(formatter)

    pretty_print = indent_level is not None
    pieces = []
    for child in self:
        if not isinstance(child, NavigableString):
            # Child tags render themselves; anything else is skipped.
            if isinstance(child, Tag):
                pieces.append(child.decode(indent_level, eventual_encoding,
                                           formatter))
            continue

        text = child.output_ready(formatter)
        if text and indent_level and not self.name == 'pre':
            text = text.strip()
        if not text:
            continue
        # Inside <pre>, text is emitted untouched.
        reflow = pretty_print and not self.name == 'pre'
        if reflow:
            pieces.append(" " * (indent_level - 1))
        pieces.append(text)
        if reflow:
            pieces.append("\n")
    return ''.join(pieces)
1204
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render the contents of this tag as a bytestring.

    :param indent_level: passed through to decode_contents().

    :param encoding: The bytestring will be in this encoding; it is
        also passed to decode_contents() as the eventual encoding.

    :param formatter: passed through to decode_contents().
    """
    return self.decode_contents(
        indent_level, encoding, formatter).encode(encoding)
1221
1222 # Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """BS3-style wrapper around encode_contents().

    indentLevel is only honoured when prettyPrint is true; otherwise
    the contents are rendered with no indentation at all.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
1229
1230 #Soup methods
1231
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return the first match that find_all() would return for the same
    criteria, or None when nothing matches."""
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find
1242
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Return the result of searching beneath this tag for the given
    criteria.

    You can specify the name of the Tag and any attributes you want
    the Tag to have. Each value in the 'attrs' map, and the tag name
    itself, can be a string, a list of strings, a regular expression
    object, or a callable that takes a string and returns whether or
    not the string matches.

    With recursive=False only direct children are searched, instead
    of all descendants.
    """
    candidates = self.descendants
    if not recursive:
        candidates = self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
1261
1262 #Generator methods
@property
def children(self):
    """An iterator over this tag's direct children."""
    # Hand back an iterator, not the list itself, to make the purpose
    # of the property clear.
    direct_children = self.contents
    return iter(direct_children)  # XXX This seems to be untested.
1267
@property
def descendants(self):
    """Generate every element beneath this tag, following the
    next_element chain from the first child."""
    if not self.contents:
        return
    # The element just past the last descendant marks where to stop.
    sentinel = self._last_descendant().next_element
    node = self.contents[0]
    while node is not sentinel:
        yield node
        node = node.next_element
1277
1278 # CSS selector code
1279
# Tokens that select() treats as combinators between two selectors.
_selector_combinators = ['>', '+', '~']
# When true, select() prints a trace of its matching steps.
_select_debug = False
def select_one(self, selector):
    """Run a CSS selector against the current element and return the
    first match, or None when select() finds nothing."""
    matches = self.select(selector, limit=1)
    return matches[0] if matches else None
1288
def select(self, selector, _candidate_generator=None, limit=None):
    """Perform a CSS selection operation on the current element.

    :param selector: a CSS selector string. Selectors separated by ','
        are evaluated independently and their results merged.
    :param _candidate_generator: internal. A callable that takes a tag
        and yields the candidates to test; it is supplied by the
        recursive calls made for the '>', '~' and '+' combinators.
    :param limit: if truthy, return at most this many matches (the
        grouped ',' form stops adding groups once the limit is reached).
    :return: a list of the matching tags.
    :raises ValueError: for an empty member of a ',' group, a trailing
        combinator, a pseudo-class with no tag name, an nth-of-type
        value below 1, or an unrecognised token.
    :raises NotImplementedError: for any pseudo-class other than
        nth-of-type with a numeric argument.
    """
    # Handle grouping selectors if ',' exists, ie: p,a
    if ',' in selector:
        context = []
        for partial_selector in selector.split(','):
            partial_selector = partial_selector.strip()
            if partial_selector == '':
                raise ValueError('Invalid group selection syntax: %s' % selector)
            candidates = self.select(partial_selector, limit=limit)
            for candidate in candidates:
                if candidate not in context:
                    context.append(candidate)

            if limit and len(context) >= limit:
                break
        return context

    tokens = selector.split()
    current_context = [self]

    if tokens[-1] in self._selector_combinators:
        raise ValueError(
            'Final combinator "%s" is missing an argument.' % tokens[-1])

    if self._select_debug:
        print('Running CSS selector "%s"' % selector)

    for index, token in enumerate(tokens):
        new_context = []
        new_context_ids = set([])

        # For index 0 this looks at tokens[-1], which was verified above
        # not to be a combinator.
        if tokens[index-1] in self._selector_combinators:
            # This token was consumed by the previous combinator. Skip it.
            if self._select_debug:
                print(' Token was consumed by the previous combinator.')
            continue

        if self._select_debug:
            print(' Considering token "%s"' % token)
        recursive_candidate_generator = None
        tag_name = None

        # Each operation corresponds to a checker function, a rule
        # for determining whether a candidate matches the
        # selector. Candidates are generated by the active
        # iterator.
        checker = None

        m = self.attribselect_re.match(token)
        if m is not None:
            # Attribute selector
            tag_name, attribute, operator, value = m.groups()
            checker = self._attribute_checker(operator, attribute, value)

        elif '#' in token:
            # ID selector
            tag_name, tag_id = token.split('#', 1)
            def id_matches(tag):
                return tag.get('id', None) == tag_id
            checker = id_matches

        elif '.' in token:
            # Class selector
            tag_name, klass = token.split('.', 1)
            classes = set(klass.split('.'))
            def classes_match(candidate):
                return classes.issubset(candidate.get('class', []))
            checker = classes_match

        elif ':' in token:
            # Pseudo-class
            tag_name, pseudo = token.split(':', 1)
            if tag_name == '':
                raise ValueError(
                    "A pseudo-class must be prefixed with a tag name.")
            pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
            if pseudo_attributes is None:
                pseudo_type = pseudo
                pseudo_value = None
            else:
                pseudo_type, pseudo_value = pseudo_attributes.groups()
            if pseudo_type == 'nth-of-type':
                try:
                    pseudo_value = int(pseudo_value)
                except (TypeError, ValueError):
                    # TypeError: no argument at all (pseudo_value is
                    # None). ValueError: a non-numeric argument.
                    raise NotImplementedError(
                        'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                if pseudo_value < 1:
                    raise ValueError(
                        'nth-of-type pseudo-class value must be at least 1.')
                class Counter(object):
                    def __init__(self, destination):
                        self.count = 0
                        self.destination = destination

                    def nth_child_of_type(self, tag):
                        self.count += 1
                        if self.count == self.destination:
                            return True
                        if self.count > self.destination:
                            # Stop the generator that's sending us
                            # these things. The matching loop below
                            # catches this and stops iterating.
                            raise StopIteration()
                        return False
                checker = Counter(pseudo_value).nth_child_of_type
            else:
                raise NotImplementedError(
                    'Only the following pseudo-classes are implemented: nth-of-type.')

        elif token == '*':
            # Star selector -- matches everything
            pass
        elif token == '>':
            # Run the next token as a CSS selector against the
            # direct children of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.children
        elif token == '~':
            # Run the next token as a CSS selector against the
            # siblings of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.next_siblings
        elif token == '+':
            # For each tag in the current context, run the next
            # token as a CSS selector against the tag's next
            # sibling that's a tag.
            def next_tag_sibling(tag):
                yield tag.find_next_sibling(True)
            recursive_candidate_generator = next_tag_sibling

        elif self.tag_name_re.match(token):
            # Just a tag name.
            tag_name = token
        else:
            raise ValueError(
                'Unsupported or invalid CSS selector: "%s"' % token)
        if recursive_candidate_generator:
            # This happens when the selector looks like  "> foo".
            #
            # The generator calls select() recursively on every
            # member of the current context, passing in a different
            # candidate generator and a different selector.
            #
            # In the case of "> foo", the candidate generator is
            # one that yields a tag's direct children (">"), and
            # the selector is "foo".
            next_token = tokens[index+1]
            def recursive_select(tag):
                if self._select_debug:
                    print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
                    print('-' * 40)
                for i in tag.select(next_token, recursive_candidate_generator):
                    if self._select_debug:
                        print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                    yield i
                if self._select_debug:
                    print('-' * 40)
            _use_candidate_generator = recursive_select
        elif _candidate_generator is None:
            # By default, a tag's candidates are all of its
            # children. If tag_name is defined, only yield tags
            # with that name.
            if self._select_debug:
                # Show the tag name being filtered on, or "[any]" when
                # there is no name filter.
                check = tag_name if tag_name else "[any]"
                print(' Default candidate generator, tag name="%s"' % check)
                # This is redundant with later code, but it stops
                # a bunch of bogus tags from cluttering up the
                # debug log.
                def default_candidate_generator(tag):
                    for child in tag.descendants:
                        if not isinstance(child, Tag):
                            continue
                        if tag_name and not child.name == tag_name:
                            continue
                        yield child
                _use_candidate_generator = default_candidate_generator
            else:
                _use_candidate_generator = lambda tag: tag.descendants
        else:
            _use_candidate_generator = _candidate_generator

        for tag in current_context:
            if self._select_debug:
                print(" Running candidate generator on %s %s" % (
                    tag.name, repr(tag.attrs)))
            for candidate in _use_candidate_generator(tag):
                if not isinstance(candidate, Tag):
                    continue
                if tag_name and candidate.name != tag_name:
                    continue
                if checker is not None:
                    try:
                        result = checker(candidate)
                    except StopIteration:
                        # The checker has decided we should no longer
                        # run the generator.
                        break
                if checker is None or result:
                    if self._select_debug:
                        print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                    if id(candidate) not in new_context_ids:
                        # If a tag matches a selector more than once,
                        # don't include it in the context more than once.
                        new_context.append(candidate)
                        new_context_ids.add(id(candidate))
                elif self._select_debug:
                    print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

        current_context = new_context

    # Apply the limit to the final result only. Cutting the context short
    # after an intermediate token would discard ancestors whose
    # descendants are the real matches (e.g. "div p" when only a later
    # <div> contains a <p>).
    if limit and len(current_context) > limit:
        current_context = current_context[:limit]

    if self._select_debug:
        print("Final verdict:")
        for i in current_context:
            print(" %s %s" % (i.name, i.attrs))
    return current_context
1513
1514 # Old names for backwards compatibility
def childGenerator(self):
    """Old name kept for backwards compatibility; returns ``self.children``."""
    direct_children = self.children
    return direct_children
1517
def recursiveChildGenerator(self):
    """Old name kept for backwards compatibility; returns
    ``self.descendants``."""
    all_descendants = self.descendants
    return all_descendants
1520
def has_key(self, key):
    """Deprecated spelling of has_attr().

    This was kind of misleading because has_key() (attributes) was
    different from __in__ (contents). has_key() is gone in Python 3,
    anyway. Emits a warning, then returns has_attr(key).
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (key)
    warnings.warn(message)
    return self.has_attr(key)
1528
1529# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text).

    A strainer holds three normalized criteria: ``name``, ``attrs`` (a
    dict of attribute name to criterion) and ``text``. Each criterion
    may be a string, a regular expression object, a callable, a
    boolean, None, or a list of such values.
    """

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Store the normalized search criteria.

        :param name: criterion for the tag name.
        :param attrs: a dict of attribute criteria. Any non-dict value
            is treated as a criterion for the 'class' attribute.
        :param text: criterion for the text.
        :param kwargs: additional attribute criteria; ``class_`` is
            accepted as a spelling of 'class'.
        """
        # The shared ``{}`` default is safe: attrs is copied before it
        # is updated and is never mutated in place.
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs

        self.attrs = {
            key: self._normalize_search_value(value)
            for key, value in attrs.items()
        }
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion to a canonical form.

        Strings, callables, objects with a ``match`` attribute (regular
        expressions), booleans and None are returned unchanged.
        Bytestrings are decoded as UTF-8. Other iterables become a list
        whose members are normalized one level deep. Anything else is
        converted with str().
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, str) or isinstance(value, collections.abc.Callable) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """Describe the strainer: its text criterion if there is one,
        otherwise "name|attrs"."""
        if self.text:
            # The text criterion may be a regular expression, a list,
            # True or a callable; __str__ must still return a string.
            return str(self.text)
        return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Test a tag, or a tag name plus its attributes, against this
        strainer.

        :param markup_name: a Tag object, or a tag name.
        :param markup_attrs: the attributes that go with a tag name:
            anything with a ``get`` method, or an iterable of
            (key, value) pairs. Ignored when markup_name is a Tag.
        :return: the Tag (or the name that was passed in) on a match,
            otherwise None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name criterion is called with the raw name and
        # attributes only when no Tag object is available; given a Tag,
        # it goes through _matches() instead.
        call_function_with_tag_data = (
            isinstance(self.name, collections.abc.Callable)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in self.attrs.items():
                    if markup_attr_map is None:
                        # Build the attribute lookup lazily, once.
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Test a piece of markup against this strainer.

        :param markup: a Tag, a string, or an iterable of elements (in
            which case the first matching NavigableString is returned).
        :return: the matching object, or None.
        :raises Exception: if markup is none of the supported kinds.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, str):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Report whether ``markup`` satisfies the ``match_against``
        criterion.

        The result is meant to be used for its truth value; a regular
        expression criterion returns the result of its search().
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            if (isinstance(match_against, str)
                and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # XXX This is going to be pretty slow because we keep
                # splitting match_against. But it shouldn't come up
                # too often.
                #
                # list(markup) lets a tuple-valued attribute compare
                # equal; a list never equals a tuple.
                return (whitespace_re.split(match_against) == list(markup))
            else:
                for item in markup:
                    if self._matches(item, match_against):
                        return True
                return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if isinstance(match_against, collections.abc.Callable):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if isinstance(match_against, str):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against

        # No rule applied: not a match.
        return False
1717
1718
class ResultSet(list):
    """A list of search results that also remembers, as ``source``, the
    object (a SoupStrainer) that produced it."""

    def __init__(self, source, result=()):
        """Fill the list from ``result`` and record ``source``."""
        super().__init__(result)
        self.source = source