blob: 6ccd4d23d6742f33a051d483d5e65a1f13805034 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001from collections import defaultdict
2import itertools
3import sys
4from bs4.element import (
5 CharsetMetaAttributeValue,
6 ContentMetaAttributeValue,
7 whitespace_re
8 )
9
# Public names exported by this module. This must remain a mutable
# list: register_treebuilders_from() appends the name of every builder
# class it copies into this module.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
#
# TreeBuilderRegistry indexes builder classes by the entries of their
# `features` list and looks them up by those same values; these
# constants are presumably the strings builders are expected to
# advertise there (no builder in this part of the file uses them).
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
24
25
class TreeBuilderRegistry(object):
    """An index of tree builder classes, searchable by feature.

    Builders are kept most-recently-registered first, both in the
    overall list and in each per-feature list, so a later registration
    always outranks an earlier one.
    """

    def __init__(self):
        # Maps a feature to the builder classes advertising it, newest
        # registration first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, newest registration first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features.

        The class is put at the front of the list for each entry in
        its `features` attribute and at the front of the overall list.
        """
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the best registered builder for the requested features.

        With no features, the most recently registered builder is
        returned. Otherwise a builder must advertise every requested
        feature that at least one registered builder supports; features
        that no registered builder supports are ignored. Among the
        builders that qualify, the one ranked first for the earliest
        supported feature wins.

        Returns None when nothing is registered, when none of the
        requested features is supported by any builder, or when no
        single builder supports all of the supported ones.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # No preference was expressed: hand back the newest builder.
            return self.builders[0]

        ranked = None      # Builders for the first supported feature.
        surviving = None   # Builders that match every feature so far.
        for feature in features:
            supporters = self.builders_for_feature.get(feature, [])
            if not supporters:
                # Nobody offers this feature, so it cannot narrow
                # the field.
                continue
            if ranked is None:
                ranked = supporters
                surviving = set(supporters)
            else:
                # Drop any builder that lacks this feature.
                surviving = surviving & set(supporters)

        if surviving is None:
            return None

        # Walk the ranked list so that priority order, not set order,
        # decides which surviving builder is returned.
        return next(
            (builder for builder in ranked if builder in surviving), None)
75
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
#
# This is the single module-level registry instance;
# register_treebuilders_from() registers builder classes with it.
builder_registry = TreeBuilderRegistry()
79
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree.

    This base class supplies neutral defaults; `feed` is left for
    subclasses to implement.
    """

    NAME = "[Unknown tree builder]"
    ALTERNATE_NAMES = []
    features = []

    is_xml = False
    picklable = False
    preserve_whitespace_tags = set()

    # None means "no opinion": a tag will be considered an
    # empty-element tag when and only when it has no contents.
    empty_element_tags = None

    # Maps a tag name (or '*' for every tag) to the attributes whose
    # value is a space- or comma-separated list of CDATA, rather than
    # a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        """Do nothing; present so subclasses can override it."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present the tag as
        self-closing.

        When `empty_element_tags` is None the builder has no opinion
        and every tag name qualifies, so a tag is presented as an
        empty-element tag if and only if it has no contents:
        "<foo></foo>" becomes "<foo />" while "<foo>bar</foo>" is left
        alone. When `empty_element_tags` is set, only the names it
        contains qualify; HTMLTreeBuilder does not list 'p', for
        example, so an empty <p> tag is presented as "<p></p>" rather
        than "<p />".
        """
        known_empty = self.empty_element_tags
        return known_empty is None or tag_name in known_empty

    def feed(self, markup):
        """Parse `markup`; must be implemented by subclasses."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return `markup` untouched as `(markup, None, None, False)`.

        Both encoding arguments are ignored by this default
        implementation.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Parsers differ here: lxml introduces an empty <head> tag, for
        instance, and html5lib doesn't. Hiding that difference behind
        this method allows simple tests that run HTML fragments through
        the parser and compare the results against other HTML
        fragments. The default implementation returns the fragment
        unchanged.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Set up no substitutions for `tag`; always return False."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place and also returns it. An empty
        `attrs`, or an empty `cdata_list_attributes`, means there is
        nothing to do.
        """
        list_valued = self.cdata_list_attributes
        if not attrs or not list_valued:
            return attrs

        for_any_tag = list_valued.get('*', [])
        for_this_tag = list_valued.get(tag_name.lower(), None)

        # Iterate over a snapshot of the keys, since values are
        # reassigned along the way.
        for name in list(attrs.keys()):
            is_list_attribute = name in for_any_tag or (
                for_this_tag and name in for_this_tag)
            if not is_list_attribute:
                continue

            current = attrs[name]
            if isinstance(current, str):
                # A "class"-type attribute whose string value is a
                # whitespace-separated list of values: split it.
                attrs[name] = whitespace_re.split(current)
            else:
                # html5lib sometimes calls setAttributes twice for the
                # same tag when rearranging the parse tree, so on the
                # second call the value is already a list. Store it
                # back as-is rather than trying to split it again.
                attrs[name] = current
        return attrs
177
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events.

    Start-tag, end-tag and character events are forwarded to
    `self.soup`; namespace and document-boundary events are accepted
    and ignored.
    """

    def feed(self, markup):
        """Parse `markup`; not implemented by this class."""
        raise NotImplementedError()

    def close(self):
        """Do nothing."""
        pass

    def startElement(self, name, attrs):
        """Forward a start tag to the soup.

        Each key of `attrs` is indexed with `[1]` to obtain the plain
        attribute name (presumably the keys are (namespace, name)
        pairs; confirm against the SAX driver in use).
        """
        plain_attrs = {key[1]: value for key, value in attrs.items()}
        self.soup.handle_starttag(name, plain_attrs)

    def endElement(self, name):
        """Forward an end tag to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        """Handle a namespaced start tag like a plain one."""
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        """Handle a namespaced end tag like a plain one."""
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        """Ignore the start of a namespace prefix mapping."""
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        """Ignore the end of a namespace prefix mapping."""
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        """Do nothing at the start of the document."""
        pass

    def endDocument(self):
        """Do nothing at the end of the document."""
        pass
222
223
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags, which tags preserve
    whitespace, and which attributes hold a list of values.
    """

    preserve_whitespace_tags = {'pre', 'textarea'}
    empty_element_tags = {'br', 'hr', 'input', 'img', 'meta',
                          'spacer', 'link', 'frame', 'base'}

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    #
    # The "*" entry applies to every tag. Each tag name appears once:
    # a repeated key in a dict literal silently overrides the earlier
    # entry.
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    def set_up_substitutions(self, tag):
        """Replace a <meta> tag's encoding declaration with a stand-in.

        We are interested in <meta> tags that say what encoding the
        document was originally in. This means HTML 5-style <meta>
        tags that provide the "charset" attribute. It also means
        HTML 4-style <meta> tags that provide the "content" attribute
        and have "http-equiv" set to "content-type". In both cases the
        value of the relevant attribute is replaced with a stand-in
        object that can take on any encoding.

        Returns True only when a "charset" attribute was replaced.
        NOTE(review): the HTML 4 branch performs a substitution but
        leaves the return value False; this is kept as-is because it
        is unclear whether callers rely on it.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)
290
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name listed in `module.__all__` that refers to a TreeBuilder
    subclass is set as an attribute of this module, appended to this
    module's `__all__`, and registered with `builder_registry`. Names
    that refer to anything else are skipped.
    """
    # Look this module up under whatever name it was actually imported
    # as, so the function keeps working when the package is vendored
    # or renamed instead of being importable as 'bs4.builder'.
    this_module = sys.modules[__name__]
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError for anything that is not a
        # class, so make sure we have a class before asking.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
303
class ParserRejectedMarkup(Exception):
    """Exception type exported for the tree builders.

    Presumably raised by a builder whose underlying parser cannot
    handle the markup it was given; nothing in this module raises it.
    """
306
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# TreeBuilderRegistry.register() puts each new builder at the front of
# its lists, which is why the lowest-priority builder is registered
# first and the highest-priority one last.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass