Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | """Beautiful Soup |
| 2 | Elixir and Tonic |
| 3 | "The Screen-Scraper's Friend" |
| 4 | http://www.crummy.com/software/BeautifulSoup/ |
| 5 | |
| 6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a |
| 7 | (possibly invalid) document into a tree representation. Beautiful Soup |
| 8 | provides methods and Pythonic idioms that make it easy to |
| 9 | navigate, search, and modify the parse tree. |
| 10 | |
| 11 | Beautiful Soup works with Python 2.6 and up. It works better if lxml |
| 12 | and/or html5lib is installed. |
| 13 | |
| 14 | For more than you ever wanted to know about Beautiful Soup, see the |
| 15 | documentation: |
| 16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/ |
| 17 | """ |
| 18 | |
| 19 | __author__ = "Leonard Richardson (leonardr@segfault.org)" |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 20 | __version__ = "4.4.1" |
| 21 | __copyright__ = "Copyright (c) 2004-2015 Leonard Richardson" |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 22 | __license__ = "MIT" |
| 23 | |
| 24 | __all__ = ['BeautifulSoup'] |
| 25 | |
| 26 | import os |
| 27 | import re |
| 28 | import warnings |
| 29 | |
| 30 | from .builder import builder_registry, ParserRejectedMarkup |
| 31 | from .dammit import UnicodeDammit |
| 32 | from .element import ( |
| 33 | CData, |
| 34 | Comment, |
| 35 | DEFAULT_OUTPUT_ENCODING, |
| 36 | Declaration, |
| 37 | Doctype, |
| 38 | NavigableString, |
| 39 | PageElement, |
| 40 | ProcessingInstruction, |
| 41 | ResultSet, |
| 42 | SoupStrainer, |
| 43 | Tag, |
| 44 | ) |
| 45 | |
| 46 | # The very first thing we do is give a useful error if someone is |
| 47 | # running this code under Python 3 without converting it. |
Patrick Williams | c0f7c04 | 2017-02-23 20:41:17 -0600 | [diff] [blame] | 48 | 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' |
Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 49 | |
class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, namespace, nsprefix, attrs) # See note about return value
      handle_endtag(name, nsprefix=None)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = '[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = (
        "No parser was explicitly specified, so I'm using the best "
        "available %(markup_type)s parser for this system "
        "(\"%(parser)s\"). This usually isn't a problem, but if you run "
        "this code on another system, or in a different virtual "
        "environment, it may use a different parser and behave "
        "differently.\n\n"
        "To get rid of this warning, change this:\n\n"
        " BeautifulSoup([your markup])\n\n"
        "to this:\n\n"
        " BeautifulSoup([your markup], \"%(parser)s\")\n")

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
                 **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        :param markup: A string, bytestring, or object with a read()
            method holding the document to parse.
        :param features: A feature name or list of feature names used to
            look up a tree builder in the builder registry. Ignored when
            `builder` is given.
        :param builder: A tree builder instance to use instead of
            looking one up by features.
        :param parse_only: An object consulted (via its `text`,
            `search` and `search_tag` members) to decide which top-level
            strings and tags are kept.
        :param from_encoding: Passed to the builder's prepare_markup().
        :param exclude_encodings: Passed to the builder's
            prepare_markup().
        :param kwargs: Only a handful of legacy argument names are
            accepted here; anything else raises TypeError.
        :raises TypeError: on an unexpected keyword argument.
        :raises FeatureNotFound: when no tree builder offers the
            requested features.
        """
        # Legacy constructor arguments that are accepted but have no
        # effect. Each one is removed from kwargs so that it does not
        # trip the "unexpected keyword argument" check below.
        ignored_arguments = (
            ('convertEntities',
             "BS4 does not respect the convertEntities argument to the "
             "BeautifulSoup constructor. Entities are always converted "
             "to Unicode characters."),
            ('markupMassage',
             "BS4 does not respect the markupMassage argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for any necessary markup massage."),
            ('smartQuotesTo',
             "BS4 does not respect the smartQuotesTo argument to the "
             "BeautifulSoup constructor. Smart quotes are always converted "
             "to Unicode characters."),
            ('selfClosingTags',
             "BS4 does not respect the selfClosingTags argument to the "
             "BeautifulSoup constructor. The tree builder is responsible "
             "for understanding self-closing tags."),
            ('isHTML',
             "BS4 does not respect the isHTML argument to the "
             "BeautifulSoup constructor. Suggest you use "
             "features='lxml' for HTML and features='lxml-xml' for "
             "XML."),
        )
        for ignored_name, message in ignored_arguments:
            if ignored_name in kwargs:
                del kwargs[ignored_name]
                warnings.warn(message)

        def deprecated_argument(old_name, new_name):
            # Pull a renamed argument out of kwargs, warning about the
            # rename. Returns None when the old name was not used.
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                return kwargs.pop(old_name)
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if kwargs:
            arg = list(kwargs).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            # Remember what the caller literally asked for, so we can
            # tell whether a parser was named explicitly.
            original_features = features
            if isinstance(features, str):
                features = [features]
            if not features:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise FeatureNotFound(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
            if not (original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES):
                if builder.is_xml:
                    markup_type = "XML"
                else:
                    markup_type = "HTML"
                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
                    parser=builder.NAME,
                    markup_type=markup_type))

        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        elif len(markup) <= 256:
            # Print out warnings for a couple beginner problems
            # involving passing non-markup to Beautiful Soup.
            # Beautiful Soup will still parse the input as markup,
            # just in case that's what the user really wants.
            self._warn_if_markup_is_filename_or_url(markup)

        # The builder may offer several candidate renderings of the
        # markup; use the first one the parser does not reject.
        for (self.markup, self.original_encoding, self.declared_html_encoding,
             self.contains_replacement_characters) in (
                 self.builder.prepare_markup(
                     markup, from_encoding,
                     exclude_encodings=exclude_encodings)):
            self.reset()
            try:
                self._feed()
                break
            except ParserRejectedMarkup:
                pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    @staticmethod
    def _warn_if_markup_is_filename_or_url(markup):
        """Warn when short markup is really a filename or a URL.

        The markup itself is never modified: it is only inspected, and
        a text form of it is used in the warning messages.
        """
        if isinstance(markup, bytes):
            displayed = markup.decode("utf8", "replace")
        else:
            displayed = markup

        if (isinstance(markup, str)
                and not os.path.supports_unicode_filenames):
            possible_filename = markup.encode("utf8")
        else:
            possible_filename = markup
        try:
            is_file = os.path.exists(possible_filename)
        except Exception:
            # This is almost certainly a problem involving
            # characters not valid in filenames on this
            # system. Just let it go.
            is_file = False
        if is_file:
            warnings.warn(
                '"%s" looks like a filename, not markup. You should '
                'probably open this file and pass the filehandle into '
                'Beautiful Soup.' % displayed)

        # Compare like with like: bytes markup needs bytes prefixes.
        if isinstance(markup, bytes):
            space = b' '
            url_prefixes = (b"http:", b"https:")
        elif isinstance(markup, str):
            space = ' '
            url_prefixes = ("http:", "https:")
        else:
            return
        if markup.startswith(url_prefixes) and space not in markup:
            warnings.warn(
                '"%s" looks like a URL. Beautiful Soup is not an HTTP '
                'client. You should probably use an HTTP client to get '
                'the document behind the URL, and feed that document to '
                'Beautiful Soup.' % displayed)

    def __copy__(self):
        """Return a new soup built by re-parsing this one's encoded
        output with the same builder."""
        return type(self)(self.encode(), builder=self.builder)

    def __getstate__(self):
        """Return the instance dict for pickling, leaving out the
        builder when it says it is not picklable."""
        # Frequently a tree builder can't be pickled.
        d = dict(self.__dict__)
        if 'builder' in d and not self.builder.picklable:
            del d['builder']
        return d

    def _feed(self):
        """Run self.markup through the builder, then close everything
        that is still open."""
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        """Reinitialize this object as an empty root tag and clear all
        parsing state."""
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.current_data = []
        self.currentTag = None
        self.tagStack = []
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s, subclass=NavigableString):
        """Create a new NavigableString associated with this soup."""
        return subclass(s)

    def insert_before(self, successor):
        """Not supported on the soup object itself."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        """Not supported on the soup object itself."""
        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        """Pop the top of the tag stack and return the new current tag."""
        tag = self.tagStack.pop()
        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
            self.preserve_whitespace_tag_stack.pop()
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Append `tag` to the current tag's contents and make it the
        current tag."""
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

    def endData(self, containerClass=NavigableString):
        """Turn the accumulated data chunks into one string object and
        add it to the tree.

        Does nothing when no data has been collected.

        :param containerClass: Class used to wrap the joined string.
        """
        if not self.current_data:
            return

        current_data = ''.join(self.current_data)
        # If whitespace is not preserved, and this string contains
        # nothing but ASCII spaces, replace it with a single space
        # or newline.
        if (not self.preserve_whitespace_tag_stack
                and all(ch in self.ASCII_SPACES for ch in current_data)):
            current_data = '\n' if '\n' in current_data else ' '

        # Reset the data collector.
        self.current_data = []

        # Should we add this string to the tree at all?
        if (self.parse_only and len(self.tagStack) <= 1
                and (not self.parse_only.text
                     or not self.parse_only.search(current_data))):
            return

        o = containerClass(current_data)
        self.object_was_parsed(o)

    def object_was_parsed(self, o, parent=None, most_recent_element=None):
        """Add an object to the parse tree.

        :param o: The object to add.
        :param parent: Tag to append to; defaults to the current tag.
        :param most_recent_element: Element to link as the previous
            element; defaults to self._most_recent_element.
        """
        parent = parent or self.currentTag
        previous_element = most_recent_element or self._most_recent_element

        next_element = previous_sibling = next_sibling = None
        if isinstance(o, Tag):
            # A Tag may already carry linkage; start from that.
            next_element = o.next_element
            next_sibling = o.next_sibling
            previous_sibling = o.previous_sibling
            if not previous_element:
                previous_element = o.previous_element

        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

        self._most_recent_element = o
        parent.contents.append(o)

        if parent.next_sibling:
            # This node is being inserted into an element that has
            # already been parsed. Deal with any dangling references.
            index = parent.contents.index(o)
            if index == 0:
                previous_element = parent
                previous_sibling = None
            else:
                previous_element = previous_sibling = parent.contents[index-1]
            if index == len(parent.contents)-1:
                next_element = parent.next_sibling
                next_sibling = None
            else:
                next_element = next_sibling = parent.contents[index+1]

            o.previous_element = previous_element
            if previous_element:
                previous_element.next_element = o
            o.next_element = next_element
            if next_element:
                next_element.previous_element = o
            o.next_sibling = next_sibling
            if next_sibling:
                next_sibling.previous_sibling = o
            o.previous_sibling = previous_sibling
            if previous_sibling:
                previous_sibling.next_sibling = o

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag.

        Returns the result of the last popTag() call, or None if
        nothing was popped."""
        if name == self.ROOT_TAG_NAME:
            # The BeautifulSoup object itself can never be popped.
            return

        most_recently_popped = None

        # Walk down from the top of the stack; index 0 (the root) is
        # never examined.
        stack_size = len(self.tagStack)
        for i in range(stack_size - 1, 0, -1):
            t = self.tagStack[i]
            if (name == t.name and nsprefix == t.prefix):
                if inclusivePop:
                    most_recently_popped = self.popTag()
                break
            most_recently_popped = self.popTag()

        return most_recently_popped

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self._most_recent_element)
        if tag is None:
            return tag
        if self._most_recent_element:
            self._most_recent_element.next_element = tag
        self._most_recent_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        """Flush pending data, then pop the stack through the matching
        open tag."""
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        """Append a chunk of text to the current data node."""
        self.current_data.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = '<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = ''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)
| 441 | |
# Short aliases for the main class, so that 'from bs4 import _soup'
# (or '_s') works.
_s = _soup = BeautifulSoup
| 445 | |
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Warn about the deprecation, then build a BeautifulSoup with
        features forced to 'xml' (overriding any features keyword)."""
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        kwargs.update(features='xml')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
| 455 | |
| 456 | |
class StopParsing(Exception):
    """Exception type defined by this module; nothing in this file
    raises it."""
| 459 | |
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no tree builder
    provides the requested features."""
| 462 | |
| 463 | |
# When run as a script, act as an HTML pretty-printer: parse standard
# input and print the prettified document.
if __name__ == '__main__':
    import sys
    print(BeautifulSoup(sys.stdin).prettify())