blob: 740f5f29cd72f82820b67eb19e9a2178129733e1 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001from collections import defaultdict
2import itertools
3import sys
4from bs4.element import (
5 CharsetMetaAttributeValue,
6 ContentMetaAttributeValue,
7 whitespace_re
8 )
9
# Names exported by ``from <this module> import *``.  This must remain a
# mutable list: register_treebuilders_from() appends the name of every
# builder class it copies into this module.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
]
16
# Feature names a TreeBuilder may list in its ``features`` attribute and
# that can be passed to TreeBuilderRegistry.lookup().

# How the builder behaves.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'

# What kind of markup the builder handles.
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
24
25
class TreeBuilderRegistry(object):
    """Registry of tree builder classes, indexed by advertised feature.

    Builders are stored newest-first, so a builder registered later
    takes priority over one registered earlier.
    """

    def __init__(self):
        # Feature name -> builder classes advertising it, newest first.
        self.builders_for_feature = defaultdict(list)
        # Every registered builder class, newest first.
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        """Find the highest-priority builder offering the given features.

        With no features, the most recently registered builder is
        returned.  Otherwise the features are examined in the order
        given.  A feature that no registered builder advertises is
        skipped rather than treated as unsatisfiable; every other
        feature narrows the set of acceptable builders.

        Returns a builder class, or None when nothing is registered,
        when none of the requested features is advertised by any
        builder, or when no single builder advertises all of the
        advertised features.
        """
        if not self.builders:
            # There are no builders at all.
            return None

        if not features:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        ranked = None     # Priority-ordered builders for the first known feature.
        surviving = None  # Builders that have every known feature seen so far.
        for feature in features:
            # Use .get() so that asking about an unknown feature does not
            # create an empty entry in the defaultdict.
            matching = self.builders_for_feature.get(feature, [])
            if not matching:
                continue
            if ranked is None:
                ranked = matching
                surviving = set(matching)
            else:
                # Eliminate any candidates that don't have this feature.
                surviving = surviving.intersection(matching)

        if surviving is None:
            return None

        # Walk the priority-ordered list and pick the first builder that
        # survived every elimination round.
        for candidate in ranked:
            if candidate in surviving:
                return candidate
        return None


# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
79
class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    # Text type used to recognise attribute values that still need
    # splitting.  ``basestring`` only exists on Python 2; on Python 3 the
    # lookup raises NameError and ``str`` is used instead.
    try:
        _string_types = basestring  # noqa: F821
    except NameError:
        _string_types = str

    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    empty_element_tags = None  # A tag will be considered an empty-element
                               # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}

    def __init__(self):
        self.soup = None

    def reset(self):
        """Do nothing; subclasses may override."""
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        """Not implemented here; always raises NotImplementedError."""
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """Return ``(markup, None, None, False)``.

        The default implementation ignores both encoding arguments and
        hands the markup back untouched as the first item of a 4-tuple.
        """
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        """Do nothing to the tag and return False."""
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place and also returns it.  Only
        attributes listed in ``cdata_list_attributes`` under ``'*'`` or
        under the lower-cased tag name are touched.
        """
        if not attrs or not self.cdata_list_attributes:
            return attrs

        universal = self.cdata_list_attributes.get('*', [])
        tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)
        # Snapshot the keys: values are reassigned while looping.
        for attr in list(attrs.keys()):
            if attr in universal or (tag_specific and attr in tag_specific):
                # We have a "class"-type attribute whose string
                # value is a whitespace-separated list of
                # values. Split it into a list.
                value = attrs[attr]
                if isinstance(value, self._string_types):
                    values = whitespace_re.split(value)
                else:
                    # html5lib sometimes calls setAttributes twice
                    # for the same tag when rearranging the parse
                    # tree. On the second call the attribute value
                    # here is already a list. If this happens,
                    # leave the value alone rather than trying to
                    # split it again.
                    values = value
                attrs[attr] = values
        return attrs
174
class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        """Forward a start tag to the soup, re-keying its attributes.

        Each attribute key is replaced by its element at index 1.
        NOTE(review): this presumably expects (namespace, local name)
        pairs as keys -- confirm against the callers.
        """
        renamed = {}
        for key, value in attrs.items():
            renamed[key[1]] = value
        self.soup.handle_starttag(name, renamed)

    def endElement(self, name):
        """Forward an end tag to the soup."""
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        pass

    def characters(self, content):
        """Forward character data to the soup."""
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass
219
220
class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'.  When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*": ['class', 'accesskey', 'dropzone'],
        "a": ['rel', 'rev'],
        "link": ['rel', 'rev'],
        "td": ["headers"],
        "th": ["headers"],
        "form": ["accept-charset"],
        "object": ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area": ["rel"],
        "icon": ["sizes"],
        "iframe": ["sandbox"],
        "output": ["for"],
    }

    def set_up_substitutions(self, tag):
        """Swap a <meta> tag's encoding declaration for a stand-in object.

        We are interested in <meta> tags that say what encoding the
        document was originally in. This means HTML 5-style <meta>
        tags that provide the "charset" attribute. It also means
        HTML 4-style <meta> tags that provide the "content"
        attribute and have "http-equiv" set to "content-type".

        In both cases the value of the appropriate attribute is
        replaced with a stand-in object that can take on any encoding.

        Returns True only when a "charset" attribute was replaced.
        NOTE(review): the HTML 4 branch replaces "content" but still
        returns False; confirm callers expect that.
        """
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            tag['charset'] = CharsetMetaAttributeValue(charset)
            return True

        if (content is not None and http_equiv is not None
                and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return False
287
def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module.

    Every name in ``module.__all__`` that refers to a TreeBuilder
    subclass is set as an attribute of the ``bs4.builder`` module,
    appended to that module's ``__all__`` and registered with its
    ``builder_registry``.  Anything else the module exports is ignored.
    """
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        # issubclass() raises TypeError when its first argument is not a
        # class, so rule out exported functions and constants first.
        if not (isinstance(obj, type) and issubclass(obj, TreeBuilder)):
            continue

        setattr(this_module, name, obj)
        this_module.__all__.append(name)
        # Register the builder while we're at it.
        this_module.builder_registry.register(obj)
300
class ParserRejectedMarkup(Exception):
    """Exception type with no behaviour beyond ``Exception``.

    Judging by the name, it is meant to signal that a parser refused
    the markup it was given; nothing in this module raises it.
    """
303
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence.  In general, we want lxml
# to take precedence over html5lib, because it's faster.  And we only
# want to use HTMLParser as a last resort.

# Lowest priority: this builder is imported unconditionally.
from . import _htmlparser
register_treebuilders_from(_htmlparser)

# Next: html5lib, when it can be imported.
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass

# Highest priority: lxml, when it can be imported.
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass