blob: da9afdf48ec0b05cf8e970cd906425ce80b343cb [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001import collections
2import re
3import sys
4import warnings
5from bs4.dammit import EntitySubstitution
6
# Encoding used when a tree is rendered to bytes and the caller names none.
DEFAULT_OUTPUT_ENCODING = "utf-8"
# True when the running interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches a run of one or more whitespace characters. Written as a raw
# string so that "\s" is a regex escape rather than an unrecognised
# string-literal escape.
whitespace_re = re.compile(r"\s+")
11
def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    Returns a property that, when placed on a class, reads from and
    writes to the attribute called `attr` on the same instance.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is invoked as setter(instance, value), and
        # setattr() needs that value as its third argument. Without the
        # `value` parameter every assignment through the alias raised
        # TypeError.
        setattr(self, attr, value)
    return alias
22
23
class NamespacedAttribute(unicode):
    """A unicode string that also records the parts of a namespaced name.

    The string value is "prefix:name" when both parts are given;
    otherwise it is built from whichever single part was supplied. The
    original `prefix`, `name` and `namespace` arguments are kept as
    attributes of the same names.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            text = prefix
        elif prefix is None:
            # Not really namespaced.
            text = name
        else:
            text = prefix + ":" + name
        instance = unicode.__new__(cls, text)
        instance.prefix = prefix
        instance.name = name
        instance.namespace = namespace
        return instance
38
class AttributeValueWithCharsetSubstitution(unicode):
    """Base class for attribute values that name a character encoding.

    Subclasses in this file override encode() so that the encoding
    mentioned in the markup can be swapped for another one.
    """
41
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """Placeholder for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    'charset' attribute's value is an instance of this class.
    """

    def __new__(cls, original_value):
        instance = unicode.__new__(cls, original_value)
        # Keep the value as written in the markup alongside the string.
        instance.original_value = original_value
        return instance

    def encode(self, encoding):
        """Return `encoding` unchanged; the stored value is not consulted."""
        return encoding
56
57
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses the markup:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the charset name itself. A raw string is used so that "\s" is
    # passed to the regex engine instead of being treated as an
    # (unrecognised) string-literal escape.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        """Wrap `original_value`, or return plain unicode if it names no charset."""
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return unicode.__new__(unicode, original_value)

        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with its charset name replaced by `encoding`."""
        def rewrite(match):
            # Keep the "...charset=" lead-in and swap only the name.
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
83
class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution that respects a couple of HTML special cases.

    Text that sits directly inside a <script> or <style> tag is returned
    untouched instead of being entity-substituted.

    Only NavigableString objects are examined for this; the check looks
    at the name of the string's parent.
    """

    cdata_containing_tags = set(["script", "style"])

    preformatted_tags = set(["pre"])

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return `ns` as-is if its parent is a CDATA-containing tag, else f(ns)."""
        inside_cdata_tag = (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags)
        if inside_cdata_tag:
            # Leave script/style contents alone.
            return ns
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        """Apply EntitySubstitution.substitute_html unless `ns` is exempt."""
        substitute = EntitySubstitution.substitute_html
        return cls._substitute_if_appropriate(ns, substitute)

    @classmethod
    def substitute_xml(cls, ns):
        """Apply EntitySubstitution.substitute_xml unless `ns` is exempt."""
        substitute = EntitySubstitution.substitute_xml
        return cls._substitute_if_appropriate(ns, substitute)
118
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    The links between elements are the attributes assigned in setup():
    parent, previous_element, next_element, previous_sibling and
    next_sibling. Several methods here also use self.contents,
    self.index() and self.get(), which this class does not define.
    """

    # There are four possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
    # A function - This function will be called on every string that
    #  needs to undergo entity substitution.
    #

    # In an HTML document, the default "html" and "minimal" functions
    # will leave the contents of <script> and <style> tags alone. For
    # an XML document, all tags will be given the same treatment.

    HTML_FORMATTERS = {
        "html" : HTMLAwareEntitySubstitution.substitute_html,
        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
        None : None
        }

    XML_FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }

    def format_string(self, s, formatter='minimal'):
        """Format the given string using the given formatter.

        `formatter` may be a callable, or a name that is resolved with
        _formatter_for_name(). If the resolved formatter is None the
        string is returned unchanged.
        """
        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)
        if formatter is None:
            output = s
        else:
            output = formatter(s)
        return output

    @property
    def _is_xml(self):
        """Is this element part of an XML tree or an HTML tree?

        This is used when mapping a formatter name ("minimal") to an
        appropriate function (one that performs entity-substitution on
        the contents of <script> and <style> tags, or not). It's
        inefficient, but it should be called very rarely.
        """
        if self.parent is None:
            # This is the top-level object. It should have .is_xml set
            # from tree creation. If not, take a guess--BS is usually
            # used on HTML markup.
            return getattr(self, 'is_xml', False)
        return self.parent._is_xml

    def _formatter_for_name(self, name):
        "Look up a formatter function based on its name and the tree."
        # Unknown names fall back to the "minimal" behaviour for the
        # kind of tree this element belongs to.
        if self._is_xml:
            return self.XML_FORMATTERS.get(
                name, EntitySubstitution.substitute_xml)
        else:
            return self.HTML_FORMATTERS.get(
                name, HTMLAwareEntitySubstitution.substitute_xml)

    def setup(self, parent=None, previous_element=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
        self.next_element = None
        self.previous_sibling = None
        self.next_sibling = None
        if self.parent is not None and self.parent.contents:
            # The parent's current last child becomes our previous sibling.
            self.previous_sibling = self.parent.contents[-1]
            self.previous_sibling.next_sibling = self

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        """Put `replace_with` where this element is and return this element.

        Returns None without doing anything when asked to replace an
        element with itself.

        Raises ValueError if this element has no parent, or if
        `replace_with` is this element's parent.
        """
        if replace_with is self:
            return
        if self.parent is None:
            # Previously this surfaced as an AttributeError on None.
            raise ValueError(
                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
        if replace_with is self.parent:
            raise ValueError("Cannot replace a Tag with its parent.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        old_parent.insert(my_index, replace_with)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this element with its own children and return it.

        Raises ValueError if this element has no parent.
        """
        my_parent = self.parent
        if my_parent is None:
            # Previously this surfaced as an AttributeError on None.
            raise ValueError(
                "Cannot replace an element with its contents when that "
                "element is not part of a tree.")
        my_index = my_parent.index(self)
        self.extract()
        # Inserting the children in reverse at one fixed index keeps
        # them in their original order.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Put `wrap_inside` in this element's place, append this element
        to it, and return `wrap_inside`."""
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if self.previous_element is not None:
            self.previous_element.next_element = next_element
        if next_element is not None:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if self.previous_sibling is not None:
            self.previous_sibling.next_sibling = self.next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self, is_initialized=True, accept_self=True):
        """Finds the last element beneath this object to be parsed.

        When `is_initialized` is true and there is a next sibling, the
        answer is read from that sibling's previous_element; otherwise
        it is found by following the last child downwards. With
        `accept_self` false, None is returned when the result would be
        this element itself.
        """
        if is_initialized and self.next_sibling:
            last_child = self.next_sibling.previous_element
        else:
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
        # Identity is what matters here; "==" would run a structural
        # comparison between elements.
        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
        """Insert `new_child` into self.contents at `position` and relink
        the sibling and element chains around it.

        A plain string is wrapped in a NavigableString first, and an
        element that already has a parent is extracted from it. A
        position beyond the end is treated as an append.

        Raises ValueError if `new_child` is this element.
        """
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, basestring)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Walk up the ancestors until one of them has a next
            # sibling; that sibling follows the new child's last
            # element in document order.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def insert_before(self, predecessor):
        """Makes the given element the immediate predecessor of this one.

        The two elements will have the same parent, and the given element
        will be immediately before this one.

        Raises ValueError if `predecessor` is this element or if this
        element has no parent.
        """
        if self is predecessor:
            raise ValueError("Can't insert an element before itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        index = parent.index(self)
        parent.insert(index, predecessor)

    def insert_after(self, successor):
        """Makes the given element the immediate successor of this one.

        The two elements will have the same parent, and the given element
        will be immediately after this one.

        Raises ValueError if `successor` is this element or if this
        element has no parent.
        """
        if self is successor:
            raise ValueError("Can't insert an element after itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        parent.insert(index+1, successor)

    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.next_elements,
                             **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._find_one(self.find_next_siblings, name, attrs, text,
                             **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                           **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                             **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
        l = self.find_parents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._find_all(name, attrs, None, limit, self.parents,
                             **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

    @property
    def next(self):
        """The same object as next_element."""
        return self.next_element

    @property
    def previous(self):
        """The same object as previous_element."""
        return self.previous_element

    #These methods do the real heavy lifting.

    def _find_one(self, method, name, attrs, text, **kwargs):
        """Call `method` with a limit of 1 and return its first result,
        or None if it found nothing."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)

        if text is None and not limit and not attrs and not kwargs:
            if name is True or name is None:
                # Optimization to find all tags.
                result = (element for element in generator
                          if isinstance(element, Tag))
                return ResultSet(strainer, result)
            elif isinstance(name, basestring):
                # Optimization to find all tags with a given name.
                result = (element for element in generator
                          if isinstance(element, Tag)
                            and element.name == name)
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    @property
    def next_elements(self):
        """Yield each element reached by following next_element."""
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element

    @property
    def next_siblings(self):
        """Yield each element reached by following next_sibling."""
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling

    @property
    def previous_elements(self):
        """Yield each element reached by following previous_element."""
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element

    @property
    def previous_siblings(self):
        """Yield each element reached by following previous_sibling."""
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling

    @property
    def parents(self):
        """Yield each element reached by following parent."""
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    # Methods for supporting CSS selectors.

    tag_name_re = re.compile('^[a-z0-9]+$')

    # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
    #   \---/  \---/\-------------/    \-------/
    #     |      |         |               |
    #     |      |         |           The value
    #     |      |    ~,|,^,$,* or =
    #     |   Attribute
    #    Tag
    attribselect_re = re.compile(
        r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )

    def _attr_value_as_string(self, value, default=None):
        """Force an attribute value into a string representation.

        A multi-valued attribute will be converted into a
        space-separated string.
        """
        # `value` is the attribute name on the way in and is rebound
        # to the attribute's value.
        value = self.get(value, default)
        if isinstance(value, list) or isinstance(value, tuple):
            value =" ".join(value)
        return value

    def _tag_name_matches_and(self, function, tag_name):
        """Return `function` itself if `tag_name` is empty, otherwise a
        function that also requires tag.name to equal `tag_name`."""
        if not tag_name:
            return function
        else:
            def _match(tag):
                return tag.name == tag_name and function(tag)
            return _match

    def _attribute_checker(self, operator, attribute, value=''):
        """Create a function that performs a CSS selector operation.

        Takes an operator, attribute and optional value. Returns a
        function that will return True for elements that match that
        combination.
        """
        if operator == '=':
            # string representation of `attribute` is equal to `value`
            return lambda el: el._attr_value_as_string(attribute) == value
        elif operator == '~':
            # space-separated list representation of `attribute`
            # contains `value`
            def _includes_value(element):
                attribute_value = element.get(attribute, [])
                if not isinstance(attribute_value, list):
                    attribute_value = attribute_value.split()
                return value in attribute_value
            return _includes_value
        elif operator == '^':
            # string representation of `attribute` starts with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').startswith(value)
        elif operator == '$':
            # string representation of `attribute` ends with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').endswith(value)
        elif operator == '*':
            # string representation of `attribute` contains `value`
            return lambda el: value in el._attr_value_as_string(attribute, '')
        elif operator == '|':
            # string representation of `attribute` is either exactly
            # `value` or starts with `value` and then a dash.
            def _is_or_starts_with_dash(element):
                attribute_value = element._attr_value_as_string(attribute, '')
                return (attribute_value == value or attribute_value.startswith(
                        value + '-'))
            return _is_or_starts_with_dash
        else:
            # Any other operator only checks that the attribute exists.
            return lambda el: el.has_attr(attribute)

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        return self.next_elements

    def nextSiblingGenerator(self):
        return self.next_siblings

    def previousGenerator(self):
        return self.previous_elements

    def previousSiblingGenerator(self):
        return self.previous_siblings

    def parentGenerator(self):
        return self.parents
641
642
class NavigableString(unicode, PageElement):
    """A unicode string that is also a PageElement.

    PREFIX and SUFFIX are placed around the formatted text by
    output_ready(); both are empty for this class.
    """

    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if not isinstance(value, unicode):
            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return unicode.__new__(cls, value)

    def __copy__(self):
        """Copying returns this same object."""
        return self

    def __getnewargs__(self):
        """Arguments for __new__: a 1-tuple holding a plain unicode copy."""
        plain = unicode(self)
        return (plain,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))
        return self

    def output_ready(self, formatter="minimal"):
        """Return the formatted string wrapped in PREFIX and SUFFIX."""
        formatted = self.format_string(self, formatter)
        return self.PREFIX + formatted + self.SUFFIX

    @property
    def name(self):
        """Always None for a string."""
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")
688
class PreformattedString(NavigableString):
    """A NavigableString whose text is emitted without being reformatted.

    The formatter is still called with the string (so any side effects
    happen), but whatever it returns is discarded.
    """

    def output_ready(self, formatter="minimal"):
        """Run the formatter for its side effects only, then return the
        unmodified string between PREFIX and SUFFIX."""
        # The result is deliberately dropped.
        self.format_string(self, formatter)
        wrapped = self.PREFIX + self + self.SUFFIX
        return wrapped
701
class CData(PreformattedString):
    """A preformatted string that is output between '<![CDATA[' and ']]>'."""

    PREFIX = u"<![CDATA["
    SUFFIX = u"]]>"
706
class ProcessingInstruction(PreformattedString):
    """A preformatted string that is output between '<?' and '?>'."""

    PREFIX = u"<?"
    SUFFIX = u"?>"
711
class Comment(PreformattedString):
    """A preformatted string that is output between '<!--' and '-->'."""

    PREFIX = u"<!--"
    SUFFIX = u"-->"
716
717
class Declaration(PreformattedString):
    """A preformatted string that is output between '<!' and '!>'."""

    PREFIX = u"<!"
    # NOTE(review): the suffix is '!>' rather than '>' -- confirm this
    # is the intended closing delimiter.
    SUFFIX = u"!>"
721
722
class Doctype(PreformattedString):
    """A preformatted string that is output as '<!DOCTYPE ...>' followed
    by a newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional public/system IDs.

        With a public ID the text is 'name PUBLIC "pub"', followed by
        ' "sys"' when a system ID is also given. With only a system ID
        it is 'name SYSTEM "sys"'. A falsy name contributes nothing.
        """
        pieces = [name or '']
        if pub_id is not None:
            pieces.append(' PUBLIC "%s"' % pub_id)
            if system_id is not None:
                pieces.append(' "%s"' % system_id)
        elif system_id is not None:
            pieces.append(' SYSTEM "%s"' % system_id)

        return Doctype(''.join(pieces))

    PREFIX = u'<!DOCTYPE '
    SUFFIX = u'>\n'
739
740
741class Tag(PageElement):
742
743 """Represents a found HTML tag with its attributes and contents."""
744
745 def __init__(self, parser=None, builder=None, name=None, namespace=None,
746 prefix=None, attrs=None, parent=None, previous=None):
747 "Basic constructor."
748
749 if parser is None:
750 self.parser_class = None
751 else:
752 # We don't actually store the parser object: that lets extracted
753 # chunks be garbage-collected.
754 self.parser_class = parser.__class__
755 if name is None:
756 raise ValueError("No value provided for new tag's name.")
757 self.name = name
758 self.namespace = namespace
759 self.prefix = prefix
760 if attrs is None:
761 attrs = {}
762 elif attrs and builder.cdata_list_attributes:
763 attrs = builder._replace_cdata_list_attribute_values(
764 self.name, attrs)
765 else:
766 attrs = dict(attrs)
767 self.attrs = attrs
768 self.contents = []
769 self.setup(parent, previous)
770 self.hidden = False
771
772 # Set up any substitutions, such as the charset in a META tag.
773 if builder is not None:
774 builder.set_up_substitutions(self)
775 self.can_be_empty_element = builder.can_be_empty_element(name)
776 else:
777 self.can_be_empty_element = False
778
779 parserClass = _alias("parser_class") # BS3
780
781 @property
782 def is_empty_element(self):
783 """Is this tag an empty-element tag? (aka a self-closing tag)
784
785 A tag that has contents is never an empty-element tag.
786
787 A tag that has no contents may or may not be an empty-element
788 tag. It depends on the builder used to create the tag. If the
789 builder has a designated list of empty-element tags, then only
790 a tag whose name shows up in that list is considered an
791 empty-element tag.
792
793 If the builder has no designated list of empty-element tags,
794 then any tag with no contents is an empty-element tag.
795 """
796 return len(self.contents) == 0 and self.can_be_empty_element
797 isSelfClosing = is_empty_element # BS3
798
799 @property
800 def string(self):
801 """Convenience property to get the single string within this tag.
802
803 :Return: If this tag has a single string child, return value
804 is that string. If this tag has no children, or more than one
805 child, return value is None. If this tag has one child tag,
806 return value is the 'string' attribute of the child tag,
807 recursively.
808 """
809 if len(self.contents) != 1:
810 return None
811 child = self.contents[0]
812 if isinstance(child, NavigableString):
813 return child
814 return child.string
815
816 @string.setter
817 def string(self, string):
818 self.clear()
819 self.append(string.__class__(string))
820
821 def _all_strings(self, strip=False, types=(NavigableString, CData)):
822 """Yield all strings of certain classes, possibly stripping them.
823
824 By default, yields only NavigableString and CData objects. So
825 no comments, processing instructions, etc.
826 """
827 for descendant in self.descendants:
828 if (
829 (types is None and not isinstance(descendant, NavigableString))
830 or
831 (types is not None and type(descendant) not in types)):
832 continue
833 if strip:
834 descendant = descendant.strip()
835 if len(descendant) == 0:
836 continue
837 yield descendant
838
839 strings = property(_all_strings)
840
841 @property
842 def stripped_strings(self):
843 for string in self._all_strings(True):
844 yield string
845
846 def get_text(self, separator=u"", strip=False,
847 types=(NavigableString, CData)):
848 """
849 Get all child strings, concatenated using the given separator.
850 """
851 return separator.join([s for s in self._all_strings(
852 strip, types=types)])
853 getText = get_text
854 text = property(get_text)
855
856 def decompose(self):
857 """Recursively destroys the contents of this tree."""
858 self.extract()
859 i = self
860 while i is not None:
861 next = i.next_element
862 i.__dict__.clear()
863 i.contents = []
864 i = next
865
866 def clear(self, decompose=False):
867 """
868 Extract all children. If decompose is True, decompose instead.
869 """
870 if decompose:
871 for element in self.contents[:]:
872 if isinstance(element, Tag):
873 element.decompose()
874 else:
875 element.extract()
876 else:
877 for element in self.contents[:]:
878 element.extract()
879
880 def index(self, element):
881 """
882 Find the index of a child by identity, not value. Avoids issues with
883 tag.contents.index(element) getting the index of equal elements.
884 """
885 for i, child in enumerate(self.contents):
886 if child is element:
887 return i
888 raise ValueError("Tag.index: element not in tag")
889
890 def get(self, key, default=None):
891 """Returns the value of the 'key' attribute for the tag, or
892 the value given for 'default' if it doesn't have that
893 attribute."""
894 return self.attrs.get(key, default)
895
896 def has_attr(self, key):
897 return key in self.attrs
898
899 def __hash__(self):
900 return str(self).__hash__()
901
902 def __getitem__(self, key):
903 """tag[key] returns the value of the 'key' attribute for the tag,
904 and throws an exception if it's not there."""
905 return self.attrs[key]
906
907 def __iter__(self):
908 "Iterating over a tag iterates over its contents."
909 return iter(self.contents)
910
911 def __len__(self):
912 "The length of a tag is the length of its list of contents."
913 return len(self.contents)
914
915 def __contains__(self, x):
916 return x in self.contents
917
918 def __nonzero__(self):
919 "A tag is non-None even if it has no contents."
920 return True
921
922 def __setitem__(self, key, value):
923 """Setting tag[key] sets the value of the 'key' attribute for the
924 tag."""
925 self.attrs[key] = value
926
927 def __delitem__(self, key):
928 "Deleting tag[key] deletes all 'key' attributes for the tag."
929 self.attrs.pop(key, None)
930
931 def __call__(self, *args, **kwargs):
932 """Calling a tag like a function is the same as calling its
933 find_all() method. Eg. tag('a') returns a list of all the A tags
934 found within this tag."""
935 return self.find_all(*args, **kwargs)
936
937 def __getattr__(self, tag):
938 #print "Getattr %s.%s" % (self.__class__, tag)
939 if len(tag) > 3 and tag.endswith('Tag'):
940 # BS3: soup.aTag -> "soup.find("a")
941 tag_name = tag[:-3]
942 warnings.warn(
943 '.%sTag is deprecated, use .find("%s") instead.' % (
944 tag_name, tag_name))
945 return self.find(tag_name)
946 # We special case contents to avoid recursion.
947 elif not tag.startswith("__") and not tag=="contents":
948 return self.find(tag)
949 raise AttributeError(
950 "'%s' object has no attribute '%s'" % (self.__class__, tag))
951
952 def __eq__(self, other):
953 """Returns true iff this tag has the same name, the same attributes,
954 and the same contents (recursively) as the given tag."""
955 if self is other:
956 return True
957 if (not hasattr(other, 'name') or
958 not hasattr(other, 'attrs') or
959 not hasattr(other, 'contents') or
960 self.name != other.name or
961 self.attrs != other.attrs or
962 len(self) != len(other)):
963 return False
964 for i, my_child in enumerate(self.contents):
965 if my_child != other.contents[i]:
966 return False
967 return True
968
969 def __ne__(self, other):
970 """Returns true iff this tag is not identical to the other tag,
971 as defined in __eq__."""
972 return not self == other
973
974 def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
975 """Renders this tag as a string."""
976 return self.encode(encoding)
977
978 def __unicode__(self):
979 return self.decode()
980
981 def __str__(self):
982 return self.encode()
983
984 if PY3K:
985 __str__ = __repr__ = __unicode__
986
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render this tag with decode(), then encode the result.

    :param encoding: Target encoding. It is also handed to decode()
     as the eventual encoding.
    :param indent_level: Passed through to decode().
    :param formatter: Passed through to decode().
    :param errors: Error handler given to the final encode() call.
    """
    # Build the Unicode form first; encoding is the very last step.
    as_unicode = self.decode(indent_level, encoding, formatter)
    return as_unicode.encode(encoding, errors)
994
def _should_pretty_print(self, indent_level):
    """Decide whether this tag should be pretty-printed.

    Never when `indent_level` is None. Otherwise yes, unless the tag
    name is one of the HTML preformatted tags and this is not XML.
    """
    if indent_level is None:
        return False
    preformatted = HTMLAwareEntitySubstitution.preformatted_tags
    return self.name not in preformatted or self._is_xml
1001
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render this tag and everything inside it as a Unicode string.

    :param indent_level: None to render without pretty-printing;
     otherwise the value used to compute indentation.
    :param eventual_encoding: The encoding the result is destined to
     be encoded into. No encoding is performed here; the value is
     only substituted into attribute values that stand in for a
     character encoding (as in a <META> tag that mentions the
     document's encoding).
    :param formatter: A callable, or a name that is resolved once
     through _formatter_for_name().
    """
    # Resolve a named formatter up front so the lookup does not
    # happen again for every string rendered below.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    rendered_attrs = []
    if self.attrs:
        for attr_name, attr_value in sorted(self.attrs.items()):
            if attr_value is None:
                # An attribute without a value is emitted as its bare name.
                rendered_attrs.append(attr_name)
                continue
            if isinstance(attr_value, (list, tuple)):
                attr_value = ' '.join(attr_value)
            elif not isinstance(attr_value, basestring):
                attr_value = unicode(attr_value)
            elif (isinstance(attr_value,
                             AttributeValueWithCharsetSubstitution)
                  and eventual_encoding is not None):
                attr_value = attr_value.encode(eventual_encoding)

            text = self.format_string(attr_value, formatter)
            rendered_attrs.append(
                unicode(attr_name) + '='
                + EntitySubstitution.quoted_attribute_value(text))

    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    # An empty-element tag gets a trailing slash and no closing tag.
    if self.is_empty_element:
        close, closeTag = '/', ''
    else:
        close, closeTag = '', '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    if indent_level is None:
        indent_space = ''
    else:
        indent_space = ' ' * (indent_level - 1)
    if pretty_print:
        space = indent_space
        indent_contents = indent_level + 1
    else:
        space = ''
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object: only its contents show.
        return contents

    pieces = []
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed, indent
        # up to the start of the tag.
        pieces.append(indent_space)
    if rendered_attrs:
        attribute_string = ' ' + ' '.join(rendered_attrs)
    else:
        attribute_string = ''
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print:
        if contents and contents[-1] != "\n":
            pieces.append("\n")
        if closeTag:
            pieces.append(space)
    pieces.append(closeTag)
    if indent_level is not None and closeTag and self.next_sibling:
        # Even if this particular tag is not pretty-printed, the tag
        # is finished, so end the line when something follows it.
        pieces.append("\n")
    return ''.join(pieces)
1094
def prettify(self, encoding=None, formatter="minimal"):
    """Render this tag with an indent level of True.

    :param encoding: When None the result comes from decode();
     otherwise it comes from encode() with this encoding.
    :param formatter: Passed through to decode() or encode().
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)
1100
def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Render the contents of this tag as a Unicode string.

    :param indent_level: None to render without pretty-printing;
     otherwise the value used to indent each string child.
    :param eventual_encoding: The encoding the result is destined to
     be encoded into. No encoding is performed here; the value is
     handed down to each child tag's decode().
    :param formatter: A callable, or a name that is resolved once
     through _formatter_for_name().
    """
    # Resolve a named formatter up front so the lookup does not
    # happen again for every child.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    # Strings directly inside a <pre> are never stripped or indented.
    outside_pre = not self.name == 'pre'
    indent_lines = (indent_level is not None) and outside_pre

    pieces = []
    for child in self:
        if not isinstance(child, NavigableString):
            if isinstance(child, Tag):
                pieces.append(child.decode(indent_level, eventual_encoding,
                                           formatter))
            continue
        text = child.output_ready(formatter)
        if text and indent_level and outside_pre:
            text = text.strip()
        if not text:
            continue
        if indent_lines:
            pieces.append(" " * (indent_level - 1))
        pieces.append(text)
        if indent_lines:
            pieces.append("\n")
    return ''.join(pieces)
1136
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Render the contents of this tag as a bytestring.

    The contents are rendered by decode_contents() and the result is
    then encoded into `encoding`.
    """
    as_unicode = self.decode_contents(indent_level, encoding, formatter)
    return as_unicode.encode(encoding)
1143
1144 # Old method for BS3 compatibility
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """BS3-era wrapper around encode_contents().

    `indentLevel` is only honoured when `prettyPrint` is true;
    otherwise no indent level is passed on.
    """
    effective_indent = indentLevel if prettyPrint else None
    return self.encode_contents(
        indent_level=effective_indent, encoding=encoding)
1151
1152 #Soup methods
1153
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return the first result find_all() gives for these criteria,
    or None when there is no result."""
    # Ask find_all() for at most one result.
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    return matches[0] if matches else None
findChild = find
1164
def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Extract a list of Tag objects that match the given criteria.

    You can specify the name of the Tag and any attributes you want
    the Tag to have.

    A value in the 'attrs' map can be a string, a list of strings, a
    regular expression object, or a callable that takes a string and
    returns whether or not it matches for some custom definition of
    'matches'. The same is true of the tag name.

    :param recursive: When true, candidates come from
     self.descendants; otherwise from self.children.
    """
    if recursive:
        candidates = self.descendants
    else:
        candidates = self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2
1183
1184 #Generator methods
@property
def children(self):
    """An iterator over this tag's contents list."""
    # Wrapped in iter() so callers get an iterator, not the list itself.
    child_nodes = self.contents
    return iter(child_nodes)  # XXX This seems to be untested.
1189
@property
def descendants(self):
    """Yield every element from this tag's first child onward,
    following next_element, up to and including the last descendant."""
    if len(self.contents) == 0:
        return
    # The element right after the last descendant marks where to stop.
    boundary = self._last_descendant().next_element
    node = self.contents[0]
    while node is not boundary:
        yield node
        node = node.next_element
1199
1200 # CSS selector code
1201
_selector_combinators = ['>', '+', '~']
_select_debug = False

def select(self, selector, _candidate_generator=None):
    """Perform a CSS selection operation on the current element.

    The selector is split on whitespace and processed one token at a
    time. Each token narrows the current context, which starts as
    this element alone. A combinator token ('>', '~', '+') consumes
    the token after it by calling select() recursively with a
    different candidate generator.

    :param selector: A string of whitespace-separated selector tokens.
    :param _candidate_generator: Internal. A callable that takes a tag
     and yields the candidates to test against the token; used by the
     recursive calls made for combinators. When None, candidates are a
     tag's descendants.
    :return: A list of the matching tags, each appearing once.
    :raises ValueError: if the selector is empty, ends with a
     combinator, uses a pseudo-class without a tag name, gives
     nth-of-type a value below 1, or contains an unrecognized token.
    :raises NotImplementedError: for any pseudo-class other than
     nth-of-type, and for a non-numeric nth-of-type argument.
    """
    tokens = selector.split()
    if not tokens:
        # Without this check tokens[-1] below fails with an
        # unhelpful IndexError.
        raise ValueError(
            'Unsupported or invalid CSS selector: "%s"' % selector)
    current_context = [self]

    if tokens[-1] in self._selector_combinators:
        raise ValueError(
            'Final combinator "%s" is missing an argument.' % tokens[-1])
    if self._select_debug:
        print('Running CSS selector "%s"' % selector)
    for index, token in enumerate(tokens):
        if self._select_debug:
            print(' Considering token "%s"' % token)
        recursive_candidate_generator = None
        tag_name = None
        if index > 0 and tokens[index-1] in self._selector_combinators:
            # This token was consumed by the previous combinator. Skip it.
            if self._select_debug:
                print(' Token was consumed by the previous combinator.')
            continue
        # Each operation corresponds to a checker function, a rule
        # for determining whether a candidate matches the
        # selector. Candidates are generated by the active
        # iterator.
        checker = None

        m = self.attribselect_re.match(token)
        if m is not None:
            # Attribute selector
            tag_name, attribute, operator, value = m.groups()
            checker = self._attribute_checker(operator, attribute, value)

        elif '#' in token:
            # ID selector
            tag_name, tag_id = token.split('#', 1)
            def id_matches(tag):
                return tag.get('id', None) == tag_id
            checker = id_matches

        elif '.' in token:
            # Class selector
            tag_name, klass = token.split('.', 1)
            classes = set(klass.split('.'))
            def classes_match(candidate):
                return classes.issubset(candidate.get('class', []))
            checker = classes_match

        elif ':' in token:
            # Pseudo-class
            tag_name, pseudo = token.split(':', 1)
            if tag_name == '':
                raise ValueError(
                    "A pseudo-class must be prefixed with a tag name.")
            pseudo_attributes = re.match(
                r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
            if pseudo_attributes is None:
                # No "(argument)" part, e.g. "p:first-child". Treat the
                # whole thing as the pseudo-class name so it is
                # rejected below instead of being silently ignored.
                pseudo_type, pseudo_value = pseudo, None
            else:
                pseudo_type, pseudo_value = pseudo_attributes.groups()
            if pseudo_type == 'nth-of-type':
                try:
                    pseudo_value = int(pseudo_value)
                except (TypeError, ValueError):
                    raise NotImplementedError(
                        'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                if pseudo_value < 1:
                    raise ValueError(
                        'nth-of-type pseudo-class value must be at least 1.')
                class Counter(object):
                    def __init__(self, destination):
                        self.count = 0
                        self.destination = destination

                    def nth_child_of_type(self, tag):
                        self.count += 1
                        if self.count == self.destination:
                            return True
                        if self.count > self.destination:
                            # Stop the generator that's sending us
                            # these things.
                            raise StopIteration()
                        return False
                checker = Counter(pseudo_value).nth_child_of_type
            else:
                raise NotImplementedError(
                    'Only the following pseudo-classes are implemented: nth-of-type.')

        elif token == '*':
            # Star selector -- matches everything
            pass
        elif token == '>':
            # Run the next token as a CSS selector against the
            # direct children of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.children
        elif token == '~':
            # Run the next token as a CSS selector against the
            # siblings of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.next_siblings
        elif token == '+':
            # For each tag in the current context, run the next
            # token as a CSS selector against the tag's next
            # sibling that's a tag.
            def next_tag_sibling(tag):
                yield tag.find_next_sibling(True)
            recursive_candidate_generator = next_tag_sibling

        elif self.tag_name_re.match(token):
            # Just a tag name.
            tag_name = token
        else:
            raise ValueError(
                'Unsupported or invalid CSS selector: "%s"' % token)

        if recursive_candidate_generator:
            # This happens when the selector looks like "> foo".
            #
            # The generator calls select() recursively on every
            # member of the current context, passing in a different
            # candidate generator and a different selector.
            #
            # In the case of "> foo", the candidate generator is
            # one that yields a tag's direct children (">"), and
            # the selector is "foo".
            next_token = tokens[index+1]
            def recursive_select(tag):
                if self._select_debug:
                    print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
                    print('-' * 40)
                for i in tag.select(next_token, recursive_candidate_generator):
                    if self._select_debug:
                        print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                    yield i
                if self._select_debug:
                    print('-' * 40)
            _use_candidate_generator = recursive_select
        elif _candidate_generator is None:
            # By default, a tag's candidates are all of its
            # descendants. If tag_name is defined, only yield tags
            # with that name.
            if self._select_debug:
                if tag_name:
                    check = tag_name
                else:
                    check = "[any]"
                print(' Default candidate generator, tag name="%s"' % check)
                # This is redundant with later code, but it stops
                # a bunch of bogus tags from cluttering up the
                # debug log.
                def default_candidate_generator(tag):
                    for child in tag.descendants:
                        if not isinstance(child, Tag):
                            continue
                        if tag_name and not child.name == tag_name:
                            continue
                        yield child
                _use_candidate_generator = default_candidate_generator
            else:
                _use_candidate_generator = lambda tag: tag.descendants
        else:
            _use_candidate_generator = _candidate_generator

        new_context = []
        new_context_ids = set()
        for tag in current_context:
            if self._select_debug:
                print(" Running candidate generator on %s %s" % (
                    tag.name, repr(tag.attrs)))
            for candidate in _use_candidate_generator(tag):
                if not isinstance(candidate, Tag):
                    continue
                if tag_name and candidate.name != tag_name:
                    continue
                if checker is not None:
                    try:
                        result = checker(candidate)
                    except StopIteration:
                        # The checker has decided we should no longer
                        # run the generator.
                        break
                if checker is None or result:
                    if self._select_debug:
                        print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                    if id(candidate) not in new_context_ids:
                        # If a tag matches a selector more than once,
                        # don't include it in the context more than once.
                        new_context.append(candidate)
                        new_context_ids.add(id(candidate))
                elif self._select_debug:
                    print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

        current_context = new_context

    if self._select_debug:
        print("Final verdict:")
        for i in current_context:
            print(" %s %s" % (i.name, i.attrs))
    return current_context
1400
1401 # Old names for backwards compatibility
def childGenerator(self):
    """Old name kept for backwards compatibility; returns
    self.children."""
    direct_children = self.children
    return direct_children
1404
def recursiveChildGenerator(self):
    """Old name kept for backwards compatibility; returns
    self.descendants."""
    all_descendants = self.descendants
    return all_descendants
1407
def has_key(self, key):
    """Deprecated: warn, then return self.has_attr(key).

    The old name was misleading because has_key() looked at
    attributes while the `in` operator looks at contents, and
    has_key() is gone in Python 3 anyway.
    """
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
        key)
    warnings.warn(message)
    return self.has_attr(key)
1415
1416# Next, a couple classes to represent queries and their results.
class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Store normalized search criteria.

        :param name: Criterion for the tag name.
        :param attrs: A dict mapping attribute names to criteria. Any
         non-dict value is treated as a search on the 'class'
         attribute.
        :param text: Criterion for the string content.
        :param kwargs: Further attribute criteria. 'class_' is
         accepted as a spelling of 'class'.
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs.pop('class_')

        if kwargs:
            if attrs:
                # Copy first so the caller's dict (or the shared
                # default) is never modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = {}
        for key, value in attrs.items():
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion into a form _matches() understands.

        Unicode strings, callables, objects with a 'match' attribute,
        booleans and None are returned unchanged. Bytestrings are
        decoded as UTF-8. Iterables become a list whose items are
        normalized one level deep. Anything else is converted to a
        Unicode string.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, unicode) or callable(value)
            or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, unicode)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        # The unicode(str()) thing is so this will do the same thing on
        # Python 2 and Python 3.
        return unicode(str(value))

    def __str__(self):
        """Describe the criteria: the text criterion when there is
        one, otherwise "name|attrs"."""
        if self.text:
            # self.text may be a regular expression, a callable, True
            # or a list; formatting it guarantees a string is returned.
            return "%s" % (self.text,)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check a tag, or a tag name plus attributes, against the
        criteria.

        :param markup_name: A Tag, or a tag name.
        :param markup_attrs: The attributes that go with the name:
         either an object with a get() method or an iterable of
         (key, value) pairs. Replaced by the tag itself when
         `markup_name` is a Tag.
        :return: The tag (or `markup_name`) on a match, otherwise None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name criterion is called with the raw name and
        # attributes only when no Tag object is available yet.
        call_function_with_tag_data = (
            callable(self.name)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if markup_attr_map is None:
                        # Built on the first pass only, so nothing is
                        # done when there are no attribute criteria.
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Check a piece of markup against the criteria.

        :param markup: A Tag, a string, or an iterable of elements.
        :return: The matching element, or None.
        :raises Exception: if `markup` is none of the supported kinds.
        """
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, (NavigableString, basestring)):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Decide whether `markup` satisfies one criterion.

        :param markup: A Tag, a string, None, or a list/tuple of
         strings (the value of a multi-valued attribute).
        :param match_against: A normalized criterion.
        :return: A true value on a match, a false value otherwise.
        """
        if isinstance(markup, (list, tuple)):
            # This should only happen when searching a multi-valued
            # attribute like 'class'.
            if (isinstance(match_against, unicode)
                and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # XXX This is going to be pretty slow because we keep
                # splitting match_against. But it shouldn't come up
                # too often.
                #
                # list() so that a tuple value compares equal too.
                return (whitespace_re.split(match_against) == list(markup))
            else:
                for item in markup:
                    if self._matches(item, match_against):
                        return True
                return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if callable(match_against):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list,
            # and so on.
            return not match_against

        if isinstance(match_against, unicode):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against

        # No rule applies to this kind of criterion.
        return False
1604
1605
class ResultSet(list):
    """A plain list of results that also records, in `source`, the
    SoupStrainer that created it."""

    def __init__(self, source, result=()):
        # The two steps are independent: record where the results
        # came from, then fill the list.
        self.source = source
        super(ResultSet, self).__init__(result)