blob: f6fdfd50b125bede34b1f31312f1ea95ee444761 [file] [log] [blame]
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.4.1"
__copyright__ = "Copyright (c) 2004-2015 Leonard Richardson"
__license__ = "MIT"

# Only the main class is exported by 'from bs4 import *'.
__all__ = ['BeautifulSoup']
25
26import os
27import re
28import warnings
29
30from .builder import builder_registry, ParserRejectedMarkup
31from .dammit import UnicodeDammit
32from .element import (
33 CData,
34 Comment,
35 DEFAULT_OUTPUT_ENCODING,
36 Declaration,
37 Doctype,
38 NavigableString,
39 PageElement,
40 ProcessingInstruction,
41 ResultSet,
42 SoupStrainer,
43 Tag,
44 )
45
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# NOTE(review): in this converted copy the statement below only compares
# two string literals and discards the result, so it has no runtime
# effect. Presumably the unconverted source used an operator that
# Python 3 rejects, so that the message would show up in the resulting
# syntax error -- confirm against the upstream Python 2 source.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
Patrick Williamsc124f4f2015-09-15 14:41:29 -050049
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    # Name given to the root pseudo-tag, i.e. the soup object itself.
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters treated as collapsible whitespace by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nTo get rid of this warning, change this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, a bytestring, or an object with a
            read() method whose result is used as the markup.
        :param features: A feature name or list of feature names used
            to look up a tree builder when `builder` is not given.
        :param builder: A tree builder instance to use instead of
            looking one up by features.
        :param parse_only: An object consulted by endData() and
            handle_starttag() to decide which top-level strings and
            tags are kept (see the imported SoupStrainer).
        :param from_encoding: Passed on to the builder's
            prepare_markup().
        :param exclude_encodings: Passed on to the builder's
            prepare_markup().
        :param kwargs: Only the obsolete BS3 arguments are accepted
            (each triggers a warning); anything else raises TypeError.

        :raises TypeError: On an unexpected keyword argument.
        :raises FeatureNotFound: If no tree builder offers the
            requested features.
        """

        if 'convertEntities' in kwargs:
            # Discard the obsolete argument like its siblings below, so
            # that it causes a warning rather than also falling through
            # to the unexpected-keyword TypeError.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. Suggest you use "
                "features='lxml' for HTML and features='lxml-xml' for "
                "XML.")

        def deprecated_argument(old_name, new_name):
            # Pop a renamed keyword argument, warning about the rename.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            # Mimic the error Python itself raises for an unknown keyword.
            arg = list(kwargs).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            original_features = features
            if isinstance(features, str):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            # Warn unless the caller named this exact parser: a lookup
            # by generic features may pick a different parser elsewhere.
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            self._warn_if_markup_is_filename(markup)
            self._warn_if_markup_is_url(markup)

        # The builder may offer several decodings of the markup; use
        # the first one the parser does not reject.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding,
                     exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    @staticmethod
    def _markup_for_display(markup):
        """Return `markup` in a form that reads well inside a warning.

        Bytestrings are decoded as UTF-8 (undecodable bytes replaced) so
        the message does not show a b'...' repr; anything else is
        returned unchanged.
        """
        if isinstance(markup, bytes):
            return markup.decode("utf8", "replace")
        return markup

    @classmethod
    def _warn_if_markup_is_filename(cls, markup):
        """Warn if `markup` is the path of an existing file.

        The markup itself is never modified; it is still parsed as
        markup by the caller.
        """
        if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go.
            is_file = False
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % cls._markup_for_display(markup))

    @classmethod
    def _warn_if_markup_is_url(cls, markup):
        """Warn if `markup` looks like a bare HTTP(S) URL.

        Text and bytes are checked against prefixes of their own type,
        because on Python 3 a bytestring never compares equal to a str.
        """
        if isinstance(markup, bytes):
            space = b' '
            url_prefixes = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            url_prefixes = ("http:", "https:")
        else:
            return
        if markup.startswith(url_prefixes) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % cls._markup_for_display(markup))

    def __copy__(self):
        """Return a new object of the same type, built by parsing this
        object's encode() output with the same builder."""
        return type(self)(self.encode(), builder=self.builder)

    def __getstate__(self):
        """Return the instance dict for pickling, leaving out the
        builder when it reports itself as not picklable."""
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            del d['builder']
        return d

    def _feed(self):
        """Run self.markup through the builder, then close anything
        the document left open."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Return this object to its pristine 'empty root tag' state,
        ready for a (new) parse attempt."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # Forget the last element of any previous parse attempt, so
        # that a retry after ParserRejectedMarkup does not get linked
        # to an element of the discarded tree.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        """Not supported on the soup object itself."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Not supported on the soup object itself."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the innermost open tag off the stack and return the tag
        that is current afterwards."""
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents (if there is a
        current tag) and make it the new current tag."""
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the data collected by handle_data() into one
        `containerClass` object and add it to the tree.

        Does nothing if no data has been collected. The string is
        dropped instead of added when parse_only rejects it.
        """
        if self.current_data:
            current_data = ''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = all(
                    char in self.ASCII_SPACES for char in current_data)
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        `parent` defaults to the current tag and `most_recent_element`
        to the element most recently added by this soup.
        """
        parent = parent or self.currentTag
        previous_element = most_recent_element or self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry links; start from those.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if not previous_element:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
            index = parent.contents.index(o)
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            if index == len(parent.contents)-1:
                next_element = parent.next_sibling
                next_sibling = None
            else:
                next_element = next_sibling = parent.contents[index+1]

            o.previous_element = previous_element
            if previous_element:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call made, or None if
        nothing was popped.
        """
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        stack_size = len(self.tagStack)
        # Walk from the innermost tag outwards; index 0 is the root
        # and is never examined or popped.
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        # NOTE(review): a constructor call does not normally evaluate
        # to None; this guard is kept from the original for safety.
        if tag is None:
            return tag
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending data, then pop the stack up to and including
        the most recent open tag with this name and prefix."""
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Collect a piece of text; endData() joins the pieces."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        For an XML document the result starts with an XML declaration,
        which names `eventual_encoding` unless that is None.
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
441
# Short aliases for the main class, to save typing on import:
# 'from bs4 import _soup'
_s = _soup = BeautifulSoup
445
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser.

    Equivalent to BeautifulSoup with features forced to 'xml'; every
    instantiation issues a warning about the deprecation.
    """

    def __init__(self, *args, **kwargs):
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        # Any 'features' keyword supplied by the caller is overridden.
        kwargs['features'] = 'xml'
        warnings.warn(deprecation_notice)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
455
456
class StopParsing(Exception):
    """Exception type defined for parsing code.

    NOTE(review): nothing in this module raises or catches it;
    presumably tree builders use it to abort a parse -- confirm.
    """
459
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no registered tree
    builder offers the requested features."""
462
463
464#By default, act as an HTML pretty-printer.
465if __name__ == '__main__':
466 import sys
467 soup = BeautifulSoup(sys.stdin)
Patrick Williamsc0f7c042017-02-23 20:41:17 -0600468 print(soup.prettify())