blob: 0e62c2e100974114b02b3068c5a0efd51fa94a37 [file] [log] [blame]
Patrick Williamsc0f7c042017-02-23 20:41:17 -06001__license__ = "MIT"
2
3from pdb import set_trace
Patrick Williamsc124f4f2015-09-15 14:41:29 -05004import collections
5import re
6import sys
7import warnings
8from bs4.dammit import EntitySubstitution
9
# Encoding used when a byte string has to be turned into text.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when running under Python 3 or later.
PY3K = (sys.version_info[0] > 2)

# Matches one or more whitespace characters. Written as a raw string:
# "\s" inside a plain literal is an invalid escape sequence, which
# modern Python versions warn about.
whitespace_re = re.compile(r"\s+")
14
15def _alias(attr):
16 """Alias one attribute name to another for backward compatibility"""
17 @property
18 def alias(self):
19 return getattr(self, attr)
20
21 @alias.setter
22 def alias(self):
23 return setattr(self, attr)
24 return alias
25
26
class NamespacedAttribute(str):
    """An attribute name that remembers its prefix, name and namespace.

    The string value is `prefix` alone when `name` is None, `name`
    alone when `prefix` is None, and "prefix:name" otherwise.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = str.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
41
class AttributeValueWithCharsetSubstitution(str):
    """Base string type standing in for an encoding named in HTML.

    Subclasses in this module override encode() to substitute a
    different encoding name on output.
    """
44
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    'charset' attribute's value will be one of these objects.
    """

    def __new__(cls, original_value):
        instance = str.__new__(cls, original_value)
        # Remember what the document originally declared.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return `encoding` itself: the whole value is the charset."""
        return encoding
59
60
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the charset name itself. Raw string, because "\s" in a plain
    # literal is an invalid escape sequence.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Wrap `original_value` only if it contains a charset declaration.

        When no "charset=" is found, a plain str is returned instead of
        an instance of this class.
        """
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return str.__new__(str, original_value)

        obj = str.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset replaced by `encoding`."""
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
86
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution that respects a couple of HTML quirks.

    A NavigableString whose direct parent is a <script> or <style> tag
    is returned untouched; everything else goes through the regular
    EntitySubstitution function.
    """

    cdata_containing_tags = {"script", "style"}

    preformatted_tags = {"pre"}

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Apply `f` to `ns` unless `ns` sits directly inside a CDATA tag."""
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags)
        if inside_cdata_tag:
            # Leave script/style contents exactly as they are.
            return ns
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        """HTML-entity substitution, skipped for script/style contents."""
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        """XML-entity substitution, skipped for script/style contents."""
        substitute = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, substitute)
121
122class PageElement(object):
123 """Contains the navigational information for some part of the page
124 (either a tag or a piece of text)"""
125
    # There are four possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
128 #
129 # "html" - All Unicode characters with corresponding HTML entities
130 # are converted to those entities on output.
131 # "minimal" - Bare ampersands and angle brackets are converted to
132 # XML entities: &amp; &lt; &gt;
133 # None - The null formatter. Unicode characters are never
134 # converted to entities. This is not recommended, but it's
135 # faster than "minimal".
136 # A function - This function will be called on every string that
137 # needs to undergo entity substitution.
138 #
139
140 # In an HTML document, the default "html" and "minimal" functions
141 # will leave the contents of <script> and <style> tags alone. For
142 # an XML document, all tags will be given the same treatment.
143
144 HTML_FORMATTERS = {
145 "html" : HTMLAwareEntitySubstitution.substitute_html,
146 "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
147 None : None
148 }
149
150 XML_FORMATTERS = {
151 "html" : EntitySubstitution.substitute_html,
152 "minimal" : EntitySubstitution.substitute_xml,
153 None : None
154 }
155
156 def format_string(self, s, formatter='minimal'):
157 """Format the given string using the given formatter."""
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600158 if not isinstance(formatter, collections.Callable):
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500159 formatter = self._formatter_for_name(formatter)
160 if formatter is None:
161 output = s
162 else:
163 output = formatter(s)
164 return output
165
166 @property
167 def _is_xml(self):
168 """Is this element part of an XML tree or an HTML tree?
169
170 This is used when mapping a formatter name ("minimal") to an
171 appropriate function (one that performs entity-substitution on
172 the contents of <script> and <style> tags, or not). It's
173 inefficient, but it should be called very rarely.
174 """
175 if self.parent is None:
176 # This is the top-level object. It should have .is_xml set
177 # from tree creation. If not, take a guess--BS is usually
178 # used on HTML markup.
179 return getattr(self, 'is_xml', False)
180 return self.parent._is_xml
181
182 def _formatter_for_name(self, name):
183 "Look up a formatter function based on its name and the tree."
184 if self._is_xml:
185 return self.XML_FORMATTERS.get(
186 name, EntitySubstitution.substitute_xml)
187 else:
188 return self.HTML_FORMATTERS.get(
189 name, HTMLAwareEntitySubstitution.substitute_xml)
190
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600191 def setup(self, parent=None, previous_element=None, next_element=None,
192 previous_sibling=None, next_sibling=None):
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500193 """Sets up the initial relations between this element and
194 other elements."""
195 self.parent = parent
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600196
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500197 self.previous_element = previous_element
198 if previous_element is not None:
199 self.previous_element.next_element = self
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600200
201 self.next_element = next_element
202 if self.next_element:
203 self.next_element.previous_element = self
204
205 self.next_sibling = next_sibling
206 if self.next_sibling:
207 self.next_sibling.previous_sibling = self
208
209 if (not previous_sibling
210 and self.parent is not None and self.parent.contents):
211 previous_sibling = self.parent.contents[-1]
212
213 self.previous_sibling = previous_sibling
214 if previous_sibling:
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500215 self.previous_sibling.next_sibling = self
216
217 nextSibling = _alias("next_sibling") # BS3
218 previousSibling = _alias("previous_sibling") # BS3
219
220 def replace_with(self, replace_with):
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600221 if not self.parent:
222 raise ValueError(
223 "Cannot replace one element with another when the"
224 "element to be replaced is not part of a tree.")
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500225 if replace_with is self:
226 return
227 if replace_with is self.parent:
228 raise ValueError("Cannot replace a Tag with its parent.")
229 old_parent = self.parent
230 my_index = self.parent.index(self)
231 self.extract()
232 old_parent.insert(my_index, replace_with)
233 return self
234 replaceWith = replace_with # BS3
235
236 def unwrap(self):
237 my_parent = self.parent
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600238 if not self.parent:
239 raise ValueError(
240 "Cannot replace an element with its contents when that"
241 "element is not part of a tree.")
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500242 my_index = self.parent.index(self)
243 self.extract()
244 for child in reversed(self.contents[:]):
245 my_parent.insert(my_index, child)
246 return self
247 replace_with_children = unwrap
248 replaceWithChildren = unwrap # BS3
249
250 def wrap(self, wrap_inside):
251 me = self.replace_with(wrap_inside)
252 wrap_inside.append(me)
253 return wrap_inside
254
    def extract(self):
        """Destructively rips this element out of the tree.

        Removes the element from its parent's contents, reconnects the
        elements and siblings on either side of it, and clears this
        element's parent, sibling and outward element links. Returns
        this element.
        """
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Cut the element links at both ends of the removed subtree.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Join the former siblings to each other.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
283
284 def _last_descendant(self, is_initialized=True, accept_self=True):
285 "Finds the last element beneath this object to be parsed."
286 if is_initialized and self.next_sibling:
287 last_child = self.next_sibling.previous_element
288 else:
289 last_child = self
290 while isinstance(last_child, Tag) and last_child.contents:
291 last_child = last_child.contents[-1]
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600292 if not accept_self and last_child is self:
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500293 last_child = None
294 return last_child
295 # BS3: Not part of the API!
296 _lastRecursiveChild = _last_descendant
297
    def insert(self, position, new_child):
        """Insert `new_child` into this object's contents at `position`.

        A plain str is wrapped in a NavigableString first. If the child
        already has a parent it is extracted from it before insertion.
        `position` is clamped to the current number of children.

        Raises ValueError if `new_child` is None or is this object.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: it directly follows this object in
            # element order.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            # Otherwise it follows the last descendant of the child
            # that precedes it.
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            # Appending at the end: the next element is the next
            # sibling of the nearest ancestor (starting with this
            # object) that has one.
            new_child.next_sibling = None

            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            # Inserting before an existing child.
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)
363
    def append(self, tag):
        """Appends the given tag to the contents of this tag.

        Equivalent to insert() at the position just past the last child.
        """
        self.insert(len(self.contents), tag)
367
368 def insert_before(self, predecessor):
369 """Makes the given element the immediate predecessor of this one.
370
371 The two elements will have the same parent, and the given element
372 will be immediately before this one.
373 """
374 if self is predecessor:
375 raise ValueError("Can't insert an element before itself.")
376 parent = self.parent
377 if parent is None:
378 raise ValueError(
379 "Element has no parent, so 'before' has no meaning.")
380 # Extract first so that the index won't be screwed up if they
381 # are siblings.
382 if isinstance(predecessor, PageElement):
383 predecessor.extract()
384 index = parent.index(self)
385 parent.insert(index, predecessor)
386
387 def insert_after(self, successor):
388 """Makes the given element the immediate successor of this one.
389
390 The two elements will have the same parent, and the given element
391 will be immediately after this one.
392 """
393 if self is successor:
394 raise ValueError("Can't insert an element after itself.")
395 parent = self.parent
396 if parent is None:
397 raise ValueError(
398 "Element has no parent, so 'after' has no meaning.")
399 # Extract first so that the index won't be screwed up if they
400 # are siblings.
401 if isinstance(successor, PageElement):
402 successor.extract()
403 index = parent.index(self)
404 parent.insert(index+1, successor)
405
    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document.

        Runs find_all_next() through _find_one(), which returns None
        when there is no match.
        """
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
410 findNext = find_next # BS3
411
    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                      **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document.

        Searches self.next_elements via _find_all(); `limit` caps the
        number of results.
        """
        return self._find_all(name, attrs, text, limit, self.next_elements,
                              **kwargs)
418 findAllNext = find_all_next # BS3
419
    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document.

        Runs find_next_siblings() through _find_one(), which returns
        None when there is no match.
        """
        return self._find_one(self.find_next_siblings, name, attrs, text,
                              **kwargs)
425 findNextSibling = find_next_sibling # BS3
426
    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document.

        Searches self.next_siblings via _find_all(); `limit` caps the
        number of results.
        """
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
433 findNextSiblings = find_next_siblings # BS3
434 fetchNextSiblings = find_next_siblings # BS2
435
    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document.

        Runs find_all_previous() through _find_one(), which returns
        None when there is no match.
        """
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
441 findPrevious = find_previous # BS3
442
    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                          **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document.

        Searches self.previous_elements via _find_all(); `limit` caps
        the number of results.
        """
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                              **kwargs)
449 findAllPrevious = find_all_previous # BS3
450 fetchPrevious = find_all_previous # BS2
451
    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document.

        Runs find_previous_siblings() through _find_one(), which
        returns None when there is no match.
        """
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                              **kwargs)
457 findPreviousSibling = find_previous_sibling # BS3
458
    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document.

        Searches self.previous_siblings via _find_all(); `limit` caps
        the number of results.
        """
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
465 findPreviousSiblings = find_previous_siblings # BS3
466 fetchPreviousSiblings = find_previous_siblings # BS2
467
468 def find_parent(self, name=None, attrs={}, **kwargs):
469 """Returns the closest parent of this Tag that matches the given
470 criteria."""
471 # NOTE: We can't use _find_one because findParents takes a different
472 # set of arguments.
473 r = None
474 l = self.find_parents(name, attrs, 1, **kwargs)
475 if l:
476 r = l[0]
477 return r
478 findParent = find_parent # BS3
479
    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria.

        Searches self.parents via _find_all() with no text criterion;
        `limit` caps the number of results.
        """

        return self._find_all(name, attrs, None, limit, self.parents,
                              **kwargs)
485 **kwargs)
486 findParents = find_parents # BS3
487 fetchParents = find_parents # BS2
488
    @property
    def next(self):
        """Read-only shorthand for .next_element."""
        return self.next_element
492
    @property
    def previous(self):
        """Read-only shorthand for .previous_element."""
        return self.previous_element
496
497 #These methods do the real heavy lifting.
498
499 def _find_one(self, method, name, attrs, text, **kwargs):
500 r = None
501 l = method(name, attrs, text, 1, **kwargs)
502 if l:
503 r = l[0]
504 return r
505
506 def _find_all(self, name, attrs, text, limit, generator, **kwargs):
507 "Iterates over a generator looking for things that match."
508
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600509 if text is None and 'string' in kwargs:
510 text = kwargs['string']
511 del kwargs['string']
512
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500513 if isinstance(name, SoupStrainer):
514 strainer = name
515 else:
516 strainer = SoupStrainer(name, attrs, text, **kwargs)
517
518 if text is None and not limit and not attrs and not kwargs:
519 if name is True or name is None:
520 # Optimization to find all tags.
521 result = (element for element in generator
522 if isinstance(element, Tag))
523 return ResultSet(strainer, result)
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600524 elif isinstance(name, str):
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500525 # Optimization to find all tags with a given name.
526 result = (element for element in generator
527 if isinstance(element, Tag)
528 and element.name == name)
529 return ResultSet(strainer, result)
530 results = ResultSet(strainer)
531 while True:
532 try:
533 i = next(generator)
534 except StopIteration:
535 break
536 if i:
537 found = strainer.search(i)
538 if found:
539 results.append(found)
540 if limit and len(results) >= limit:
541 break
542 return results
543
544 #These generators can be used to navigate starting from both
545 #NavigableStrings and Tags.
    @property
    def next_elements(self):
        """Yield each element reached by following .next_element,
        starting after this one and stopping at None."""
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element
551 i = i.next_element
552
    @property
    def next_siblings(self):
        """Yield each element reached by following .next_sibling,
        starting after this one and stopping at None."""
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling
558 i = i.next_sibling
559
    @property
    def previous_elements(self):
        """Yield each element reached by following .previous_element,
        starting before this one and stopping at None."""
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element
565 i = i.previous_element
566
    @property
    def previous_siblings(self):
        """Yield each element reached by following .previous_sibling,
        starting before this one and stopping at None."""
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling
572 i = i.previous_sibling
573
    @property
    def parents(self):
        """Yield each ancestor reached by following .parent, nearest
        first, stopping at None."""
        i = self.parent
        while i is not None:
            yield i
            i = i.parent
579 i = i.parent
580
581 # Methods for supporting CSS selectors.
582
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600583 tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500584
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600585 # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
586 # \---------------------------/ \---/\-------------/ \-------/
587 # | | | |
588 # | | | The value
589 # | | ~,|,^,$,* or =
590 # | Attribute
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500591 # Tag
592 attribselect_re = re.compile(
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600593 r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500594 r'=?"?(?P<value>[^\]"]*)"?\]$'
595 )
596
597 def _attr_value_as_string(self, value, default=None):
598 """Force an attribute value into a string representation.
599
600 A multi-valued attribute will be converted into a
601 space-separated stirng.
602 """
603 value = self.get(value, default)
604 if isinstance(value, list) or isinstance(value, tuple):
605 value =" ".join(value)
606 return value
607
608 def _tag_name_matches_and(self, function, tag_name):
609 if not tag_name:
610 return function
611 else:
612 def _match(tag):
613 return tag.name == tag_name and function(tag)
614 return _match
615
616 def _attribute_checker(self, operator, attribute, value=''):
617 """Create a function that performs a CSS selector operation.
618
619 Takes an operator, attribute and optional value. Returns a
620 function that will return True for elements that match that
621 combination.
622 """
623 if operator == '=':
624 # string representation of `attribute` is equal to `value`
625 return lambda el: el._attr_value_as_string(attribute) == value
626 elif operator == '~':
627 # space-separated list representation of `attribute`
628 # contains `value`
629 def _includes_value(element):
630 attribute_value = element.get(attribute, [])
631 if not isinstance(attribute_value, list):
632 attribute_value = attribute_value.split()
633 return value in attribute_value
634 return _includes_value
635 elif operator == '^':
636 # string representation of `attribute` starts with `value`
637 return lambda el: el._attr_value_as_string(
638 attribute, '').startswith(value)
639 elif operator == '$':
640 # string represenation of `attribute` ends with `value`
641 return lambda el: el._attr_value_as_string(
642 attribute, '').endswith(value)
643 elif operator == '*':
644 # string representation of `attribute` contains `value`
645 return lambda el: value in el._attr_value_as_string(attribute, '')
646 elif operator == '|':
647 # string representation of `attribute` is either exactly
648 # `value` or starts with `value` and then a dash.
649 def _is_or_starts_with_dash(element):
650 attribute_value = element._attr_value_as_string(attribute, '')
651 return (attribute_value == value or attribute_value.startswith(
652 value + '-'))
653 return _is_or_starts_with_dash
654 else:
655 return lambda el: el.has_attr(attribute)
656
657 # Old non-property versions of the generators, for backwards
658 # compatibility with BS3.
    def nextGenerator(self):
        """BS3-compatible method form of the next_elements property."""
        return self.next_elements
661
    def nextSiblingGenerator(self):
        """BS3-compatible method form of the next_siblings property."""
        return self.next_siblings
664
    def previousGenerator(self):
        """BS3-compatible method form of the previous_elements property."""
        return self.previous_elements
667
    def previousSiblingGenerator(self):
        """BS3-compatible method form of the previous_siblings property."""
        return self.previous_siblings
670
    def parentGenerator(self):
        """BS3-compatible method form of the parents property."""
        return self.parents
673
674
class NavigableString(str, PageElement):
    """A string that is also a PageElement.

    PREFIX and SUFFIX are wrapped around the formatted text by
    output_ready(); both are empty for this class.
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't
        know how to handle non-ASCII characters.
        """
        if isinstance(value, str):
            extra_args = ()
        else:
            extra_args = (DEFAULT_OUTPUT_ENCODING,)
        instance = str.__new__(cls, value, *extra_args)
        instance.setup()
        return instance

    def __copy__(self):
        """Return a copy with the same contents and class as the
        original, but not connected to the parse tree.
        """
        return type(self)(self)

    def __getnewargs__(self):
        return (str(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted text wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None: a string has no tag name."""
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
725 raise AttributeError("A NavigableString cannot be given a name.")
726
class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter="minimal"):
        """CData strings are passed into the formatter.
        But the return value is ignored.

        Returns the unmodified string between PREFIX and SUFFIX.
        """
        # Called only for its side effects; the result is discarded.
        self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
738 return self.PREFIX + self + self.SUFFIX
739
class CData(PreformattedString):
    """A PreformattedString output between '<![CDATA[' and ']]>'."""

    PREFIX = '<![CDATA['
    SUFFIX = ']]>'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500744
class ProcessingInstruction(PreformattedString):
    """A PreformattedString output between '<?' and '>'."""

    PREFIX = '<?'
    SUFFIX = '>'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500749
class Comment(PreformattedString):
    """A PreformattedString output between '<!--' and '-->'."""

    PREFIX = '<!--'
    SUFFIX = '-->'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500754
755
class Declaration(PreformattedString):
    """A PreformattedString output between '<?' and '?>'."""
    PREFIX = '<?'
    SUFFIX = '?>'
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500759
760
class Doctype(PreformattedString):
    """A PreformattedString output between '<!DOCTYPE ' and '>' plus
    a newline."""

    PREFIX = '<!DOCTYPE '
    SUFFIX = '>\n'

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional public/system ids.

        A public id adds ' PUBLIC "<pub_id>"', followed by the quoted
        system id when there is one. A system id without a public id
        adds ' SYSTEM "<system_id>"'.
        """
        pieces = [name or '']
        if pub_id is None:
            if system_id is not None:
                pieces.append(' SYSTEM "%s"' % system_id)
        else:
            pieces.append(' PUBLIC "%s"' % pub_id)
            if system_id is not None:
                pieces.append(' "%s"' % system_id)

        return Doctype(''.join(pieces))
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500777
778
779class Tag(PageElement):
780
781 """Represents a found HTML tag with its attributes and contents."""
782
783 def __init__(self, parser=None, builder=None, name=None, namespace=None,
784 prefix=None, attrs=None, parent=None, previous=None):
785 "Basic constructor."
786
787 if parser is None:
788 self.parser_class = None
789 else:
790 # We don't actually store the parser object: that lets extracted
791 # chunks be garbage-collected.
792 self.parser_class = parser.__class__
793 if name is None:
794 raise ValueError("No value provided for new tag's name.")
795 self.name = name
796 self.namespace = namespace
797 self.prefix = prefix
798 if attrs is None:
799 attrs = {}
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600800 elif attrs:
801 if builder is not None and builder.cdata_list_attributes:
802 attrs = builder._replace_cdata_list_attribute_values(
803 self.name, attrs)
804 else:
805 attrs = dict(attrs)
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500806 else:
807 attrs = dict(attrs)
808 self.attrs = attrs
809 self.contents = []
810 self.setup(parent, previous)
811 self.hidden = False
812
813 # Set up any substitutions, such as the charset in a META tag.
814 if builder is not None:
815 builder.set_up_substitutions(self)
816 self.can_be_empty_element = builder.can_be_empty_element(name)
817 else:
818 self.can_be_empty_element = False
819
820 parserClass = _alias("parser_class") # BS3
821
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600822 def __copy__(self):
823 """A copy of a Tag is a new Tag, unconnected to the parse tree.
824 Its contents are a copy of the old Tag's contents.
825 """
826 clone = type(self)(None, self.builder, self.name, self.namespace,
827 self.nsprefix, self.attrs)
828 for attr in ('can_be_empty_element', 'hidden'):
829 setattr(clone, attr, getattr(self, attr))
830 for child in self.contents:
831 clone.append(child.__copy__())
832 return clone
833
    @property
    def is_empty_element(self):
        """Is this tag an empty-element tag? (aka a self-closing tag)

        A tag that has contents is never an empty-element tag.

        A tag that has no contents may or may not be an empty-element
        tag. It depends on the builder used to create the tag. If the
        builder has a designated list of empty-element tags, then only
        a tag whose name shows up in that list is considered an
        empty-element tag.

        If the builder has no designated list of empty-element tags,
        then any tag with no contents is an empty-element tag.

        The builder's verdict is the can_be_empty_element attribute,
        which the constructor sets to False when no builder is given.
        """
        return len(self.contents) == 0 and self.can_be_empty_element
850 isSelfClosing = is_empty_element # BS3
851
@property
def string(self):
    """Return the single string inside this tag, if there is one.

    :return: None unless this tag has exactly one child. A lone
        NavigableString child is returned as-is; any other lone child
        is asked for its own ``string``, so the lookup recurses.
    """
    if len(self.contents) == 1:
        only_child = self.contents[0]
        if isinstance(only_child, NavigableString):
            return only_child
        return only_child.string
    return None

@string.setter
def string(self, string):
    """Replace every child of this tag with a fresh copy of ``string``.

    The copy is made by calling the value's own class on it, after the
    existing children have been cleared.
    """
    self.clear()
    replacement = string.__class__(string)
    self.append(replacement)
873
def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Generate the strings found among this tag's descendants.

    :param strip: If true, each string is stripped and strings that
        end up empty are skipped.
    :param types: The exact classes to accept. Membership is tested
        with ``type()``, so subclasses of a listed class do not match.
        With None, any NavigableString instance is accepted.
    """
    for node in self.descendants:
        if types is None:
            if not isinstance(node, NavigableString):
                continue
        elif type(node) not in types:
            continue
        if strip:
            node = node.strip()
            if len(node) == 0:
                continue
        yield node

strings = property(_all_strings)
893
@property
def stripped_strings(self):
    """Generate the strings from _all_strings() with stripping enabled."""
    for text in self._all_strings(True):
        yield text
898
def get_text(self, separator="", strip=False,
             types=(NavigableString, CData)):
    """Concatenate the strings yielded by _all_strings().

    :param separator: Text placed between consecutive strings.
    :param strip: Passed to _all_strings().
    :param types: Passed to _all_strings() to choose which string
        classes are included.
    :return: The joined text.
    """
    pieces = self._all_strings(strip, types=types)
    return separator.join(pieces)
getText = get_text
text = property(get_text)
908
def decompose(self):
    """Recursively destroy the contents of this tree.

    The tag is first extracted, then every element reachable through
    the ``next_element`` chain has its instance dictionary emptied and
    is left with only an empty ``contents`` list.
    """
    self.extract()
    node = self
    while node is not None:
        # Grab the successor before wiping the attributes that hold it.
        successor = node.next_element
        node.__dict__.clear()
        node.contents = []
        node = successor
918
def clear(self, decompose=False):
    """Remove every child of this tag.

    :param decompose: If true, children that are Tag instances are
        decomposed instead of extracted; all other children are still
        extracted.
    """
    # Walk a snapshot: removing a child changes self.contents.
    for child in self.contents[:]:
        if decompose and isinstance(child, Tag):
            child.decompose()
        else:
            child.extract()
932
def index(self, element):
    """Return the position of ``element`` among this tag's children.

    Children are compared by identity rather than equality, so an
    equal-but-distinct child is never mistaken for ``element``.

    :raise ValueError: If ``element`` is not one of the children.
    """
    positions = (
        pos for pos, child in enumerate(self.contents) if child is element)
    found = next(positions, None)
    if found is None:
        raise ValueError("Tag.index: element not in tag")
    return found
942
def get(self, key, default=None):
    """Look up attribute ``key`` on this tag.

    :return: The attribute's value, or ``default`` when the tag has no
        such attribute.
    """
    attributes = self.attrs
    return attributes.get(key, default)
948
def has_attr(self, key):
    """Return whether this tag has an attribute named ``key``."""
    attributes = self.attrs
    return key in attributes
951
def __hash__(self):
    """Hash this tag by the hash of its string rendering."""
    rendered = str(self)
    return hash(rendered)
954
def __getitem__(self, key):
    """Return the value of attribute ``key``.

    ``tag[key]`` is a plain lookup in ``attrs``, so a missing attribute
    raises whatever that lookup raises (KeyError for a dict).
    """
    attributes = self.attrs
    return attributes[key]
959
def __iter__(self):
    """Iterate over this tag's direct contents."""
    children = self.contents
    return iter(children)
963
def __len__(self):
    """Return how many items are in this tag's list of contents."""
    children = self.contents
    return len(children)
967
def __contains__(self, x):
    """Return whether ``x`` is among this tag's direct contents."""
    children = self.contents
    return x in children
970
def __bool__(self):
    """Always report true.

    Without this, truth testing would fall back to __len__ and a tag
    with no contents would be falsy.
    """
    return True
974
def __setitem__(self, key, value):
    """Store ``value`` as attribute ``key``, so ``tag[key] = value`` works."""
    attributes = self.attrs
    attributes[key] = value
979
def __delitem__(self, key):
    """Remove attribute ``key`` from this tag.

    A missing attribute is ignored rather than raising.
    """
    attributes = self.attrs
    attributes.pop(key, None)
983
def __call__(self, *args, **kwargs):
    """Forward a call on the tag to find_all().

    ``tag('a')`` is therefore the same as ``tag.find_all('a')``.
    """
    search = self.find_all
    return search(*args, **kwargs)
989
def __getattr__(self, tag):
    """Treat an unknown attribute as a search for a child tag.

    ``tag.a`` becomes ``tag.find("a")``. A name longer than three
    characters that ends in ``Tag`` is the deprecated BS3 form
    (``soup.aTag``): it warns and searches for the name without the
    suffix.

    :raise AttributeError: For names starting with a double underscore
        and for ``contents``.
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%sTag is deprecated, use .find("%s") instead.' % (
                tag_name, tag_name))
        return self.find(tag_name)
    # "contents" is special-cased to avoid recursion.
    if tag != "contents" and not tag.startswith("__"):
        return self.find(tag)
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (self.__class__, tag))
1004
def __eq__(self, other):
    """Compare two tags by name, attributes and contents.

    :return: True if ``other`` is this very object, or if it has
        ``name``, ``attrs`` and ``contents`` attributes, the same name,
        equal attrs, the same length, and pairwise-equal contents;
        otherwise False.
    """
    if self is other:
        return True
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False
    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False
    mismatch = any(
        mine != other.contents[position]
        for position, mine in enumerate(self.contents))
    return not mismatch
1021
def __ne__(self, other):
    """Return the negation of the ``==`` comparison with ``other``."""
    are_equal = (self == other)
    return not are_equal
1026
def __repr__(self, encoding="unicode-escape"):
    """Render this tag as a string.

    :param encoding: Only used when PY3K is false, where the result is
        the bytestring from ``encode(encoding)``.
    :return: The text from ``decode()`` when PY3K is true, otherwise
        the encoded bytestring.
    """
    if not PY3K:
        # "The return value must be a string object", i.e. a bytestring.
        # By convention, the return value of __repr__ should also be
        # an ASCII string.
        return self.encode(encoding)
    # "The return value must be a string object", i.e. Unicode
    return self.decode()
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001037
def __unicode__(self):
    """Return the rendering produced by decode() with its defaults."""
    rendered = self.decode()
    return rendered
1040
def __str__(self):
    """Render this tag as a string.

    :return: The text from ``decode()`` when PY3K is true, otherwise
        the bytestring from ``encode()``.
    """
    if not PY3K:
        return self.encode()
    return self.decode()

# When PY3K is true, __str__ and __repr__ are both rebound to
# __unicode__, replacing the definitions above.
if PY3K:
    __str__ = __repr__ = __unicode__
1049
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render this tag and its contents as a bytestring.

    The tag is first rendered to Unicode by ``decode()``, which is
    told the target encoding, and that text is then encoded.

    :param encoding: Target encoding for the bytestring.
    :param indent_level: Passed to ``decode()``.
    :param formatter: Passed to ``decode()``.
    :param errors: Error handler passed to the final ``encode`` call.
    """
    rendered = self.decode(indent_level, encoding, formatter)
    return rendered.encode(encoding, errors)
1057
def _should_pretty_print(self, indent_level):
    """Decide whether this tag should be pretty-printed.

    Never when ``indent_level`` is None. Otherwise yes if the tag's
    name is not one of HTMLAwareEntitySubstitution.preformatted_tags;
    for a name in that set the answer is ``self._is_xml``.
    """
    if indent_level is None:
        return False
    if self.name not in HTMLAwareEntitySubstitution.preformatted_tags:
        return True
    return self._is_xml
1064
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a Unicode representation of this tag and its contents.

    :param indent_level: When not None, the opening tag is preceded by
        ``indent_level - 1`` spaces and, if the tag is pretty-printed,
        its contents are rendered one level deeper.

    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. This method is _not_
       responsible for performing that encoding. This information
       is passed in so that it can be substituted in if the
       document contains a <META> tag that mentions the document's
       encoding.

    :param formatter: Either a callable, or a name that is resolved
        through ``_formatter_for_name()``.
    """

    # First off, turn a string formatter into a function. This
    # will stop the lookup from happening over and over again.
    # The builtin callable() is used because the collections.Callable
    # alias no longer exists on Python 3.10+.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                # A valueless attribute is rendered as its bare name.
                decoded = key
            else:
                if isinstance(val, (list, tuple)):
                    # Multi-valued attribute: join with spaces.
                    val = ' '.join(val)
                elif not isinstance(val, str):
                    val = str(val)
                elif (
                    isinstance(val, AttributeValueWithCharsetSubstitution)
                    and eventual_encoding is not None):
                    # Let the stand-in substitute the eventual encoding.
                    val = val.encode(eventual_encoding)

                text = self.format_string(val, formatter)
                decoded = (
                    str(key) + '='
                    + EntitySubstitution.quoted_attribute_value(text))
            attrs.append(decoded)
    close = ''
    closeTag = ''

    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    if self.is_empty_element:
        close = '/'
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    space = ''
    indent_space = ''
    if indent_level is not None:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object.
        s = contents
    else:
        s = []
        attribute_string = ''
        if attrs:
            attribute_string = ' ' + ' '.join(attrs)
        if indent_level is not None:
            # Even if this particular tag is not pretty-printed,
            # we should indent up to the start of the tag.
            s.append(indent_space)
        s.append('<%s%s%s%s>' % (
                prefix, self.name, attribute_string, close))
        if pretty_print:
            s.append("\n")
        s.append(contents)
        if pretty_print and contents and contents[-1] != "\n":
            s.append("\n")
        if pretty_print and closeTag:
            s.append(space)
        s.append(closeTag)
        if indent_level is not None and closeTag and self.next_sibling:
            # Even if this particular tag is not pretty-printed,
            # we're now done with the tag, and we should add a
            # newline if appropriate.
            s.append("\n")
        s = ''.join(s)
    return s
1157
def prettify(self, encoding=None, formatter="minimal"):
    """Render this tag with indentation switched on.

    :param encoding: If None, the result comes from ``decode()``;
        otherwise from ``encode()`` with this encoding.
    :param formatter: Passed through as the ``formatter`` keyword.
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1163
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: When not None, output is pretty-printed:
        each string is preceded by ``indent_level - 1`` spaces and
        followed by a newline, except inside a tag named 'pre'.

    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. This method is _not_
       responsible for performing that encoding. This information
       is passed in so that it can be substituted in if the
       document contains a <META> tag that mentions the document's
       encoding.

    :param formatter: Either a callable, or a name that is resolved
        through ``_formatter_for_name()``.
    """
    # First off, turn a string formatter into a function. This
    # will stop the lookup from happening over and over again.
    # The builtin callable() is used because the collections.Callable
    # alias no longer exists on Python 3.10+.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    pretty_print = (indent_level is not None)
    is_pre = (self.name == 'pre')
    s = []
    for c in self:
        text = None
        if isinstance(c, NavigableString):
            text = c.output_ready(formatter)
        elif isinstance(c, Tag):
            # Child tags render themselves, indentation included.
            s.append(c.decode(indent_level, eventual_encoding,
                              formatter))
        if text and indent_level and not is_pre:
            text = text.strip()
        if text:
            if pretty_print and not is_pre:
                s.append(" " * (indent_level - 1))
            s.append(text)
            if pretty_print and not is_pre:
                s.append("\n")
    return ''.join(s)
1205
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render the contents of this tag as a bytestring.

    :param indent_level: Passed to ``decode_contents()``.

    :param encoding: The bytestring will be in this encoding. It is
        also given to ``decode_contents()`` as the eventual encoding.

    :param formatter: Passed to ``decode_contents()``.
    """
    rendered = self.decode_contents(indent_level, encoding, formatter)
    return rendered.encode(encoding)
1222
1223 # Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """BS3-compatible wrapper around encode_contents().

    :param encoding: Passed to ``encode_contents()``.
    :param prettyPrint: If false, ``indentLevel`` is ignored and no
        indent level is passed on.
    :param indentLevel: Indent level used when ``prettyPrint`` is true.
    """
    level = indentLevel if prettyPrint else None
    return self.encode_contents(indent_level=level, encoding=encoding)
1230
1231 #Soup methods
1232
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return the first result of the matching find_all() call.

    All arguments are forwarded to ``find_all()`` with a limit of 1.

    :return: The first match, or None if there is none.
    """
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    if matches:
        return matches[0]
    return None
findChild = find
1243
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Return the elements below this tag that match the criteria.

    You can specify the name of the tag and any attributes you want
    it to have. The value of a key-value pair in the 'attrs' map can
    be a string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the string
    matches for some custom definition of 'matches'. The same is true
    of the tag name.

    :param recursive: If true, all descendants are searched;
        otherwise only the direct children.
    :return: Whatever ``_find_all()`` returns for these arguments.
    """
    all_nodes = self.descendants
    candidates = all_nodes if recursive else self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
1262
1263 #Generator methods
@property
def children(self):
    """Return an iterator over this tag's direct contents."""
    # An iterator, not the list itself, to make the purpose clear.
    # XXX This seems to be untested.
    direct = self.contents
    return iter(direct)
1268
@property
def descendants(self):
    """Generate every element below this tag, in document order.

    Starts at the first child and follows ``next_element`` links,
    stopping at the element that comes after this tag's last
    descendant. Yields nothing for a tag with no contents.
    """
    if len(self.contents) == 0:
        return
    boundary = self._last_descendant().next_element
    node = self.contents[0]
    while node is not boundary:
        yield node
        node = node.next_element
1278
1279 # CSS selector code
1280
# Tokens that select() treats as combinators between two selectors.
_selector_combinators = ['>', '+', '~']
# When true, select() prints a trace of its work.
_select_debug = False
def select_one(self, selector):
    """Return the first element matched by a CSS selector.

    Runs ``select()`` with a limit of 1.

    :return: The first match, or None if nothing matched.
    """
    matches = self.select(selector, limit=1)
    return matches[0] if matches else None
1289
def select(self, selector, _candidate_generator=None, limit=None):
    """Perform a CSS selection operation on the current element.

    :param selector: A CSS selector string. If it contains a comma,
        each comma-separated part is selected separately and the
        results are merged without duplicates. Otherwise it is split
        on whitespace into tokens that are applied left to right.
    :param _candidate_generator: Internal. A callable taking a tag and
        returning the candidates to test against the selector; passed
        by the recursive calls made for the '>', '~' and '+'
        combinators. When None, a tag's descendants are the candidates.
    :param limit: If truthy, at most this many matches are returned.
    :return: A list of the matching tags.
    :raise ValueError: For an empty part in a grouped selector, a
        trailing combinator, a pseudo-class without a tag name, an
        nth-of-type value below 1, or an unrecognized token.
    :raise NotImplementedError: For a pseudo-class other than
        nth-of-type, or a non-numeric nth-of-type value.
    """

    # Handle grouping selectors if ',' exists, ie: p,a
    if ',' in selector:
        context = []
        for partial_selector in selector.split(','):
            partial_selector = partial_selector.strip()
            if partial_selector == '':
                raise ValueError('Invalid group selection syntax: %s' % selector)
            candidates = self.select(partial_selector, limit=limit)
            for candidate in candidates:
                if candidate not in context:
                    context.append(candidate)

            if limit and len(context) >= limit:
                break
        # The last group merged in may push the total past the limit.
        if limit and len(context) > limit:
            context = context[:limit]
        return context

    tokens = selector.split()
    current_context = [self]

    if tokens[-1] in self._selector_combinators:
        raise ValueError(
            'Final combinator "%s" is missing an argument.' % tokens[-1])

    if self._select_debug:
        print('Running CSS selector "%s"' % selector)

    for index, token in enumerate(tokens):
        new_context = []
        new_context_ids = set()

        # For index 0 this looks at tokens[-1], which was checked above
        # and is known not to be a combinator.
        if tokens[index-1] in self._selector_combinators:
            # This token was consumed by the previous combinator. Skip it.
            if self._select_debug:
                print(' Token was consumed by the previous combinator.')
            continue

        if self._select_debug:
            print(' Considering token "%s"' % token)
        recursive_candidate_generator = None
        tag_name = None

        # Each operation corresponds to a checker function, a rule
        # for determining whether a candidate matches the
        # selector. Candidates are generated by the active
        # iterator.
        checker = None

        m = self.attribselect_re.match(token)
        if m is not None:
            # Attribute selector
            tag_name, attribute, operator, value = m.groups()
            checker = self._attribute_checker(operator, attribute, value)

        elif '#' in token:
            # ID selector
            tag_name, tag_id = token.split('#', 1)
            def id_matches(tag):
                return tag.get('id', None) == tag_id
            checker = id_matches

        elif '.' in token:
            # Class selector
            tag_name, klass = token.split('.', 1)
            classes = set(klass.split('.'))
            def classes_match(candidate):
                return classes.issubset(candidate.get('class', []))
            checker = classes_match

        elif ':' in token:
            # Pseudo-class
            tag_name, pseudo = token.split(':', 1)
            if tag_name == '':
                raise ValueError(
                    "A pseudo-class must be prefixed with a tag name.")
            # Raw string: the pattern contains \d and \( escapes.
            pseudo_attributes = re.match(
                r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
            if pseudo_attributes is None:
                pseudo_type = pseudo
                pseudo_value = None
            else:
                pseudo_type, pseudo_value = pseudo_attributes.groups()
            if pseudo_type == 'nth-of-type':
                try:
                    pseudo_value = int(pseudo_value)
                except (TypeError, ValueError):
                    # TypeError: no argument at all (None);
                    # ValueError: a non-numeric argument.
                    raise NotImplementedError(
                        'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                if pseudo_value < 1:
                    raise ValueError(
                        'nth-of-type pseudo-class value must be at least 1.')
                class Counter(object):
                    def __init__(self, destination):
                        self.count = 0
                        self.destination = destination

                    def nth_child_of_type(self, tag):
                        self.count += 1
                        if self.count == self.destination:
                            return True
                        if self.count > self.destination:
                            # Stop the generator that's sending us
                            # these things.
                            raise StopIteration()
                        return False
                checker = Counter(pseudo_value).nth_child_of_type
            else:
                raise NotImplementedError(
                    'Only the following pseudo-classes are implemented: nth-of-type.')

        elif token == '*':
            # Star selector -- matches everything
            pass
        elif token == '>':
            # Run the next token as a CSS selector against the
            # direct children of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.children
        elif token == '~':
            # Run the next token as a CSS selector against the
            # siblings of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.next_siblings
        elif token == '+':
            # For each tag in the current context, run the next
            # token as a CSS selector against the tag's next
            # sibling that's a tag.
            def next_tag_sibling(tag):
                yield tag.find_next_sibling(True)
            recursive_candidate_generator = next_tag_sibling

        elif self.tag_name_re.match(token):
            # Just a tag name.
            tag_name = token
        else:
            raise ValueError(
                'Unsupported or invalid CSS selector: "%s"' % token)
        if recursive_candidate_generator:
            # This happens when the selector looks like  "> foo".
            #
            # The generator calls select() recursively on every
            # member of the current context, passing in a different
            # candidate generator and a different selector.
            #
            # In the case of "> foo", the candidate generator is
            # one that yields a tag's direct children (">"), and
            # the selector is "foo".
            next_token = tokens[index+1]
            def recursive_select(tag):
                if self._select_debug:
                    print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
                    print('-' * 40)
                for i in tag.select(next_token, recursive_candidate_generator):
                    if self._select_debug:
                        print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                    yield i
                if self._select_debug:
                    print('-' * 40)
            _use_candidate_generator = recursive_select
        elif _candidate_generator is None:
            # By default, a tag's candidates are all of its
            # children. If tag_name is defined, only yield tags
            # with that name.
            if self._select_debug:
                # Show the tag name being filtered on, or "[any]" when
                # there is no name filter.
                if tag_name:
                    check = tag_name
                else:
                    check = "[any]"
                print(' Default candidate generator, tag name="%s"' % check)
            if self._select_debug:
                # This is redundant with later code, but it stops
                # a bunch of bogus tags from cluttering up the
                # debug log.
                def default_candidate_generator(tag):
                    for child in tag.descendants:
                        if not isinstance(child, Tag):
                            continue
                        if tag_name and not child.name == tag_name:
                            continue
                        yield child
                _use_candidate_generator = default_candidate_generator
            else:
                _use_candidate_generator = lambda tag: tag.descendants
        else:
            _use_candidate_generator = _candidate_generator

        for tag in current_context:
            if self._select_debug:
                print(" Running candidate generator on %s %s" % (
                    tag.name, repr(tag.attrs)))
            for candidate in _use_candidate_generator(tag):
                if not isinstance(candidate, Tag):
                    continue
                if tag_name and candidate.name != tag_name:
                    continue
                if checker is not None:
                    try:
                        result = checker(candidate)
                    except StopIteration:
                        # The checker has decided we should no longer
                        # run the generator.
                        break
                if checker is None or result:
                    if self._select_debug:
                        print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                    if id(candidate) not in new_context_ids:
                        # If a tag matches a selector more than once,
                        # don't include it in the context more than once.
                        new_context.append(candidate)
                        new_context_ids.add(id(candidate))
                elif self._select_debug:
                    print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

        current_context = new_context

    # The limit is applied only to the final result. Cutting an
    # intermediate context short could throw away the very ancestors
    # whose descendants match the rest of the selector.
    if limit and len(current_context) > limit:
        current_context = current_context[:limit]

    if self._select_debug:
        print("Final verdict:")
        for i in current_context:
            print(" %s %s" % (i.name, i.attrs))
    return current_context
1514
1515 # Old names for backwards compatibility
def childGenerator(self):
    """Old name kept for backwards compatibility; returns ``children``."""
    direct = self.children
    return direct
1518
def recursiveChildGenerator(self):
    """Old name kept for backwards compatibility; returns ``descendants``."""
    everything_below = self.descendants
    return everything_below
1521
def has_key(self, key):
    """Deprecated spelling of has_attr().

    This was kind of misleading because has_key() looked at
    attributes while the ``in`` operator looks at contents. It issues
    a deprecation warning and then returns ``has_attr(key)``.
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
        key)
    warnings.warn(message)
    return self.has_attr(key)
1529
1530# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Store the normalized search criteria.

        :param name: Criterion for the tag name.
        :param attrs: A dict of attribute criteria. A non-dict value
            is treated as a search on the 'class' attribute.
        :param text: Criterion for text.
        :param kwargs: Extra attribute criteria; ``class_`` is
            accepted as a spelling of 'class'.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy first so the caller's dict (and the shared
                # default) is never mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = {}
        for key, value in attrs.items():
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion into a canonical form.

        Strings, callables, objects with a ``match`` attribute,
        booleans and None are returned unchanged. Bytestrings are
        decoded as UTF-8. Other iterables become a list whose items
        are normalized one level deep. Anything else is converted
        with ``str()``.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        # callable() replaces the collections.Callable check, which
        # fails on Python 3.10+ where that alias was removed.
        if (isinstance(value, str) or callable(value)
            or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, str)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        return str(value)

    def __str__(self):
        """Describe the strainer: its text criterion if that is
        truthy, otherwise ``name|attrs``."""
        if self.text:
            # The text criterion may be a regular expression, list,
            # callable or True; __str__ must always return a string.
            return str(self.text)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Test a tag, or a tag name plus attributes, against the
        criteria.

        :param markup_name: Either a Tag, or a tag name.
        :param markup_attrs: The attributes that go with a tag name:
            an object with a ``get`` method, or an iterable of
            (key, value) pairs. Replaced by the tag itself when
            ``markup_name`` is a Tag.
        :return: The matched tag or tag name, or None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name criterion is called with the raw name and
        # attributes only when no Tag object is available.
        call_function_with_tag_data = (
            callable(self.name)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Test a piece of markup against the criteria.

        :param markup: A Tag, a string, or an iterable of elements.
        :return: The matching element, or None.
        :raise Exception: If ``markup`` is none of the supported kinds.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, str)):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Test one value against one criterion.

        :param markup: A tag, a string, None, or a list/tuple of
            values from a multi-valued attribute.
        :param match_against: True, a callable, a string, an object
            with ``match``/``search`` methods, or an iterable.
        :return: A truthy value on a match and a falsy one otherwise;
            for a regular expression this is the result of its
            ``search()``, and None when no rule applies.
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            if (isinstance(match_against, str)
                and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # XXX This is going to be pretty slow because we keep
                # splitting match_against. But it shouldn't come up
                # too often.
                return (whitespace_re.split(match_against) == markup)
            else:
                for item in markup:
                    if self._matches(item, match_against):
                        return True
                return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if callable(match_against):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if isinstance(match_against, str):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against
1719
class ResultSet(list):
    """A list of results that also records where the results came from.

    The object passed as ``source`` is kept on the ``source``
    attribute.
    """

    def __init__(self, source, result=()):
        """Fill the list from ``result`` and remember ``source``."""
        super(ResultSet, self).__init__(result)
        self.source = source