blob: 7ba34269af71fbfbb2b53272866275684f08170f [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001"""Beautiful Soup
2Elixir and Tonic
3"The Screen-Scraper's Friend"
4http://www.crummy.com/software/BeautifulSoup/
5
6Beautiful Soup uses a pluggable XML or HTML parser to parse a
7(possibly invalid) document into a tree representation. Beautiful Soup
8provides methods and Pythonic idioms that make it easy to
9navigate, search, and modify the parse tree.
10
11Beautiful Soup works with Python 2.6 and up. It works better if lxml
12and/or html5lib is installed.
13
14For more than you ever wanted to know about Beautiful Soup, see the
15documentation:
16http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17"""
18
# Package metadata.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.3.2"
__copyright__ = "Copyright (c) 2004-2013 Leonard Richardson"
__license__ = "MIT"

# Names exported by a star-import of this module.
__all__ = ['BeautifulSoup']
25
26import os
27import re
28import warnings
29
30from .builder import builder_registry, ParserRejectedMarkup
31from .dammit import UnicodeDammit
32from .element import (
33 CData,
34 Comment,
35 DEFAULT_OUTPUT_ENCODING,
36 Declaration,
37 Doctype,
38 NavigableString,
39 PageElement,
40 ProcessingInstruction,
41 ResultSet,
42 SoupStrainer,
43 Tag,
44 )
45
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# NOTE(review): the mechanism appears to be the u'' prefix itself: Python
# 3.0-3.2 reject it as a syntax error, so the interpreter would echo this
# line (and therefore the message) in the traceback -- confirm. Nothing in
# this file reads the variable afterwards.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
49
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    # Name given to the soup object itself when it acts as the root tag.
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters treated as collapsible whitespace by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        markup -- a string, or an object with a read() method whose
            result is parsed instead.
        features -- a feature name or a sequence of feature names used
            to look up a tree builder when `builder` is None; when
            empty or None, DEFAULT_BUILDER_FEATURES is used.
        builder -- a tree builder instance to use as-is, skipping the
            registry lookup.
        parse_only -- stored on the soup and consulted by endData() and
            handle_starttag() to decide which top-level strings and
            tags are kept.
        from_encoding -- passed through to the builder's
            prepare_markup().
        kwargs -- only legacy argument names are accepted: the ignored
            ones (convertEntities, markupMassage, smartQuotesTo,
            selfClosingTags, isHTML) produce a warning, and the renamed
            ones (parseOnlyThese, fromEncoding) produce a warning and
            are used as the new argument's value.

        Raises TypeError for any other keyword argument, and
        FeatureNotFound when no registered tree builder matches the
        requested features.
        """

        if 'convertEntities' in kwargs:
            # Remove the argument like the other ignored legacy
            # arguments below; otherwise the "unexpected keyword
            # argument" check further down would raise TypeError right
            # after this warning said the argument is merely ignored.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            # Pop a renamed legacy keyword out of kwargs, warning about
            # the rename; returns None when the keyword was not given.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        # The legacy name is only looked at (and removed from kwargs)
        # when the new-style argument is falsy.
        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        # Anything still left in kwargs is an argument we don't know.
        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # A single feature name is treated as a one-element list.
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        # Circular reference; cleared again at the end of __init__.
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                    and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and b' ' not in markup)
                        or (isinstance(markup, unicode)
                            and u' ' not in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)

        # The builder offers one or more candidate interpretations of the
        # markup; use the first one the parser does not reject.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                self.builder.prepare_markup(markup, from_encoding)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run self.markup through the builder and close what is left open."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Re-initialize this object as an empty root tag and clear all
        parser bookkeeping, so a parse can start from scratch."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # Start every parse attempt without a "previous element".
        # __init__ may call reset() several times (once per candidate
        # from prepare_markup), and an element left over from a rejected
        # attempt must not be linked into the new tree.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        navigable = subclass(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Always raises NotImplementedError for the soup object."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Always raises NotImplementedError for the soup object."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the top of the tag stack and return the new current tag.

        If the stack becomes empty, currentTag is left unchanged.
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents (if there is a
        current tag) and make it the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the data collected by handle_data() into one string
        object of type `containerClass` and add it to the tree.

        Does nothing when no data has been collected. The string is
        dropped (but the collector is still reset) when parse_only is
        set, we are at the top level, and parse_only does not match it.
        """
        if self.current_data:
            current_data = u''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if not self.preserve_whitespace_tag_stack:
                strippable = all(
                    char in self.ASCII_SPACES for char in current_data)
                if strippable:
                    if '\n' in current_data:
                        current_data = '\n'
                    else:
                        current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        `parent` defaults to the current tag and `most_recent_element`
        to the element most recently added by this soup. `o` is linked
        after that element, recorded as the new most recent element,
        and appended to the parent's contents.
        """
        # Compare against None explicitly so that a falsy but real
        # object (e.g. an empty string node) is not replaced by the
        # default.
        if parent is None:
            parent = self.currentTag
        if most_recent_element is None:
            most_recent_element = self._most_recent_element
        o.setup(parent, most_recent_element)

        if most_recent_element is not None:
            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the result of the last popTag() call, or None if
        nothing was popped.
        """
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the top of the stack down to, but excluding, index 0
        # (the soup object itself).
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
                and (self.parse_only.text
                     or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        # NOTE(review): a plain constructor call cannot normally produce
        # None; this guard is kept unchanged from the original.
        if tag is None:
            return tag
        # Same explicit None test as object_was_parsed(), so a falsy
        # previous element still gets its next_element link.
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending data, then pop the stack through the matching
        open tag."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Collect a chunk of text; endData() joins the chunks later."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        For XML documents the output starts with an XML declaration,
        which names `eventual_encoding` unless that is None.
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        # The parent class takes an indent level: None when not
        # pretty-printing, 0 to start pretty-printing at the left margin.
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
379
# Alias to make it easier to type import: 'from bs4 import _soup'
# Both names below are bound to the BeautifulSoup class itself.
_s = BeautifulSoup
_soup = BeautifulSoup
383
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Force features='xml', warn that this class is deprecated, and
        hand all arguments to the BeautifulSoup constructor."""
        # Any 'features' keyword supplied by the caller is overwritten.
        kwargs['features'] = 'xml'
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_notice)
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
393
394
class StopParsing(Exception):
    """Exception type declared by this module.

    NOTE(review): nothing in this file raises or catches it; presumably
    it is a signal for aborting a parse -- confirm against its users.
    """
397
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when the builder registry
    has no tree builder matching the requested features."""
400
401
# By default, act as an HTML pretty-printer: parse standard input and
# print the prettified document.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    # A single parenthesised argument prints the same value as the bare
    # Python 2 print statement.
    print(soup.prettify())
406 print soup.prettify()