Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | """Beautiful Soup |
| 2 | Elixir and Tonic |
| 3 | "The Screen-Scraper's Friend" |
| 4 | http://www.crummy.com/software/BeautifulSoup/ |
| 5 | |
| 6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a |
| 7 | (possibly invalid) document into a tree representation. Beautiful Soup |
| 8 | provides methods and Pythonic idioms that make it easy to |
| 9 | navigate, search, and modify the parse tree. |
| 10 | |
| 11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml |
| 12 | and/or html5lib is installed. |
| 13 | |
| 14 | For more than you ever wanted to know about Beautiful Soup, see the |
| 15 | documentation: |
| 16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ |
| 17 | """ |
| 18 | |
# Package metadata.
__author__ = 'Leonard Richardson (leonardr@segfault.org)'
__version__ = '4.3.2'
__copyright__ = 'Copyright (c) 2004-2013 Leonard Richardson'
__license__ = 'MIT'

# The only name exported by a star-import of this module.
__all__ = ["BeautifulSoup"]
| 25 | |
| 26 | import os |
| 27 | import re |
| 28 | import warnings |
| 29 | |
| 30 | from .builder import builder_registry, ParserRejectedMarkup |
| 31 | from .dammit import UnicodeDammit |
| 32 | from .element import ( |
| 33 | CData, |
| 34 | Comment, |
| 35 | DEFAULT_OUTPUT_ENCODING, |
| 36 | Declaration, |
| 37 | Doctype, |
| 38 | NavigableString, |
| 39 | PageElement, |
| 40 | ProcessingInstruction, |
| 41 | ResultSet, |
| 42 | SoupStrainer, |
| 43 | Tag, |
| 44 | ) |
| 45 | |
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
# NOTE(review): presumably this works because the u'' prefix is rejected
# by the earliest Python 3 releases, so the interpreter's SyntaxError
# echoes this line and the user reads the message. Keep the literal on a
# single line for that reason -- confirm before reformatting.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
| 49 | |
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    # Name given to the soup object itself when it acts as the root tag.
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Characters treated as collapsible whitespace by endData().
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string of markup, or any object with a read()
            method whose result is the markup.
        :param features: A feature name, or a sequence of feature names,
            used to look up a tree builder when `builder` is None.
            Falls back to DEFAULT_BUILDER_FEATURES when empty or None.
        :param builder: A tree builder instance to use as-is. When
            given, `features` is ignored.
        :param parse_only: Optional filter consulted by endData() and
            handle_starttag() for top-level strings and tags.
        :param from_encoding: Passed through to the builder's
            prepare_markup().
        :param kwargs: Only Beautiful Soup 3 argument names are
            tolerated here. Ignored ones (convertEntities,
            markupMassage, smartQuotesTo, selfClosingTags, isHTML)
            produce a warning; renamed ones (parseOnlyThese,
            fromEncoding) produce a warning and are honoured.
        :raises TypeError: If any other keyword argument is passed.
        :raises FeatureNotFound: If no registered tree builder offers
            the requested features.
        """

        if 'convertEntities' in kwargs:
            # Drop the ignored argument, exactly like the other BS3
            # leftovers below. Leaving it in kwargs made the
            # unexpected-keyword check further down raise TypeError
            # immediately after this warning was issued.
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            # Pop a renamed BS3 keyword out of kwargs (warning about the
            # rename) and return its value, or None if it was not given.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        # Anything still left in kwargs is a genuinely unknown argument.
        if kwargs:
            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            if (isinstance(markup, unicode)
                and not os.path.supports_unicode_filenames):
                possible_filename = markup.encode("utf8")
            else:
                possible_filename = markup
            is_file = False
            try:
                is_file = os.path.exists(possible_filename)
            except Exception:
                # This is almost certainly a problem involving
                # characters not valid in filenames on this
                # system. Just let it go.
                pass
            if is_file:
                warnings.warn(
                    '"%s" looks like a filename, not markup. You should '
                    'probably open this file and pass the filehandle into '
                    'Beautiful Soup.' % markup)
            if markup[:5] == "http:" or markup[:6] == "https:":
                # TODO: This is ugly but I couldn't get it to work in
                # Python 3 otherwise.
                if ((isinstance(markup, bytes) and b' ' not in markup)
                    or (isinstance(markup, unicode) and u' ' not in markup)):
                    warnings.warn(
                        '"%s" looks like a URL. Beautiful Soup is not an '
                        'HTTP client. You should probably use an HTTP '
                        'client to get the document behind the URL, and '
                        'feed that document to Beautiful Soup.' % markup)

        # Try each candidate the builder proposes and stop at the first
        # one the parser does not reject. The loop targets are
        # attributes, so the accepted candidate's values stay on self.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
            self.builder.prepare_markup(markup, from_encoding)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        """Run self.markup through the builder and close the tree."""
        # Start the builder from a clean state for this attempt.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reinitialize this object as an empty root tag, ready to parse."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        # Forget the last element of any previous parse attempt.
        # Without this, a retry after ParserRejectedMarkup would chain
        # the new tree's first element onto the rejected tree.
        self._most_recent_element = None
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup.

        The tag is built with this soup's builder but is not attached
        to the tree.
        """
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup.

        :param s: The text of the string.
        :param subclass: The class to instantiate; defaults to
            NavigableString.
        """
        navigable = subclass(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        """Unsupported on the root object; always raises NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Unsupported on the root object; always raises NotImplementedError."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the innermost open tag off the tag stack.

        Returns the tag that is current *after* the pop (the new top of
        the stack), not the tag that was removed.
        """
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it current."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Flush the text collected by handle_data() into the tree.

        Does nothing if no data is pending. Otherwise the pending pieces
        are joined, whitespace-only text is collapsed (unless inside a
        whitespace-preserving tag), and the result is wrapped in
        `containerClass` and added via object_was_parsed() -- unless
        parse_only filters it out at the top level.
        """
        if self.current_data:
            current_data = u''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
            # nothing but ASCII spaces, replace it with a single space
            # or newline.
            if (not self.preserve_whitespace_tag_stack
                and all(c in self.ASCII_SPACES for c in current_data)):
                if '\n' in current_data:
                    current_data = '\n'
                else:
                    current_data = ' '

            # Reset the data collector.
            self.current_data = []

            # Should we add this string to the tree at all?
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(current_data)):
                return

            o = containerClass(current_data)
            self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The element to add.
        :param parent: The tag to append it to; defaults to the current
            tag.
        :param most_recent_element: The element that precedes `o` in
            document order; defaults to the last element added.
        """
        parent = parent or self.currentTag
        most_recent_element = most_recent_element or self._most_recent_element
        o.setup(parent, most_recent_element)

        if most_recent_element is not None:
            most_recent_element.next_element = o
        self._most_recent_element = o
        parent.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the value of the last popTag() call made, or None if
        nothing was popped.
        """
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk from the innermost open tag outwards. Index 0 is the
        # root object and is never considered.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        # Compare against None, as object_was_parsed() does, instead of
        # relying on truthiness. NOTE(review): an empty string element
        # would presumably be falsy and be skipped here -- confirm
        # against NavigableString.
        if self._most_recent_element is not None:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending text, then close the most recent matching open tag."""
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Queue a piece of text; endData() joins and flushes the queue."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding.

        For XML documents an XML declaration is prepended; it names
        `eventual_encoding` unless that is None.

        :param pretty_print: If true, the tree is rendered starting at
            indent level 0; otherwise no indent level is passed.
        :param eventual_encoding: Named in the XML declaration and
            forwarded to Tag.decode().
        :param formatter: Forwarded to Tag.decode().
        """

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
# Short aliases that make the class quicker to import by hand,
# e.g. 'from bs4 import _soup'.
_soup = BeautifulSoup
_s = _soup
| 383 | |
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build an XML soup.

        The 'features' keyword is always forced to 'xml', replacing any
        value the caller supplied. Everything else is passed through to
        the BeautifulSoup constructor unchanged.
        """
        deprecation_notice = (
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        warnings.warn(deprecation_notice)
        kwargs.update(features='xml')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
| 393 | |
| 394 | |
class StopParsing(Exception):
    """Exception type declared here; nothing in this module raises it.

    NOTE(review): presumably meant for tree builders that need to abort
    a parse early -- confirm against the builder modules.
    """
| 397 | |
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no registered tree
    builder provides the requested features."""
| 400 | |
| 401 | |
# By default, act as an HTML pretty-printer: read markup from standard
# input and write the prettified document to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print(soup.prettify())