3 | #
4 | # See 'LICENSE' for copying
5 | #
6 |
7 |
8 |
--------------------------------------------------------------------------------
/weeman/lib/bs4/COPYING.txt:
--------------------------------------------------------------------------------
1 | Beautiful Soup is made available under the MIT license:
2 |
3 | Copyright (c) 2004-2016 Leonard Richardson
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining
6 | a copy of this software and associated documentation files (the
7 | "Software"), to deal in the Software without restriction, including
8 | without limitation the rights to use, copy, modify, merge, publish,
9 | distribute, sublicense, and/or sell copies of the Software, and to
10 | permit persons to whom the Software is furnished to do so, subject to
11 | the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be
14 | included in all copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 |
25 | Beautiful Soup incorporates code from the html5lib library, which is
26 | also made available under the MIT license. Copyright (c) 2006-2013
27 | James Graham and other contributors
28 |
--------------------------------------------------------------------------------
/weeman/lib/bs4/__init__.py:
--------------------------------------------------------------------------------
1 | """Beautiful Soup
2 | Elixir and Tonic
3 | "The Screen-Scraper's Friend"
4 | http://www.crummy.com/software/BeautifulSoup/
5 |
6 | Beautiful Soup uses a pluggable XML or HTML parser to parse a
7 | (possibly invalid) document into a tree representation. Beautiful Soup
8 | provides methods and Pythonic idioms that make it easy to navigate,
9 | search, and modify the parse tree.
10 |
11 | Beautiful Soup works with Python 2.7 and up. It works better if lxml
12 | and/or html5lib is installed.
13 |
14 | For more than you ever wanted to know about Beautiful Soup, see the
15 | documentation:
16 | http://www.crummy.com/software/BeautifulSoup/bs4/doc/
17 |
18 | """
19 |
20 | # Use of this source code is governed by a BSD-style license that can be
21 | # found in the LICENSE file.
22 |
23 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
24 | __version__ = "4.5.1"
25 | __copyright__ = "Copyright (c) 2004-2016 Leonard Richardson"
26 | __license__ = "MIT"
27 |
28 | __all__ = ['BeautifulSoup']
29 |
30 | import os
31 | import re
32 | import traceback
33 | import warnings
34 |
35 | from .builder import builder_registry, ParserRejectedMarkup
36 | from .dammit import UnicodeDammit
37 | from .element import (
38 | CData,
39 | Comment,
40 | DEFAULT_OUTPUT_ENCODING,
41 | Declaration,
42 | Doctype,
43 | NavigableString,
44 | PageElement,
45 | ProcessingInstruction,
46 | ResultSet,
47 | SoupStrainer,
48 | Tag,
49 | )
50 |
51 | # The very first thing we do is give a useful error if someone is
52 | # running this code under Python 3 without converting it.
53 | 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
54 |
55 | class BeautifulSoup(Tag):
56 | """
57 | This class defines the basic interface called by the tree builders.
58 |
59 | These methods will be called by the parser:
60 | reset()
61 | feed(markup)
62 |
63 | The tree builder may call these methods from its feed() implementation:
64 | handle_starttag(name, attrs) # See note about return value
65 | handle_endtag(name)
66 | handle_data(data) # Appends to the current data node
67 | endData(containerClass=NavigableString) # Ends the current data node
68 |
69 | No matter how complicated the underlying parser is, you should be
70 | able to build a tree using 'start tag' events, 'end tag' events,
71 | 'data' events, and "done with data" events.
72 |
73 | If you encounter an empty-element tag (aka a self-closing tag,
74 | like HTML's <br> tag), call handle_starttag and then
75 | handle_endtag.
76 | """
77 | ROOT_TAG_NAME = u'[document]'
78 |
79 | # If the end-user gives no indication which tree builder they
80 | # want, look for one with these features.
81 | DEFAULT_BUILDER_FEATURES = ['html', 'fast']
82 |
83 | ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
84 |
85 | NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
86 |
87 | def __init__(self, markup="", features=None, builder=None,
88 | parse_only=None, from_encoding=None, exclude_encodings=None,
89 | **kwargs):
90 | """The Soup object is initialized as the 'root tag', and the
91 | provided markup (which can be a string or a file-like object)
92 | is fed into the underlying parser."""
93 |
94 | if 'convertEntities' in kwargs:
95 | warnings.warn(
96 | "BS4 does not respect the convertEntities argument to the "
97 | "BeautifulSoup constructor. Entities are always converted "
98 | "to Unicode characters.")
99 |
100 | if 'markupMassage' in kwargs:
101 | del kwargs['markupMassage']
102 | warnings.warn(
103 | "BS4 does not respect the markupMassage argument to the "
104 | "BeautifulSoup constructor. The tree builder is responsible "
105 | "for any necessary markup massage.")
106 |
107 | if 'smartQuotesTo' in kwargs:
108 | del kwargs['smartQuotesTo']
109 | warnings.warn(
110 | "BS4 does not respect the smartQuotesTo argument to the "
111 | "BeautifulSoup constructor. Smart quotes are always converted "
112 | "to Unicode characters.")
113 |
114 | if 'selfClosingTags' in kwargs:
115 | del kwargs['selfClosingTags']
116 | warnings.warn(
117 | "BS4 does not respect the selfClosingTags argument to the "
118 | "BeautifulSoup constructor. The tree builder is responsible "
119 | "for understanding self-closing tags.")
120 |
121 | if 'isHTML' in kwargs:
122 | del kwargs['isHTML']
123 | warnings.warn(
124 | "BS4 does not respect the isHTML argument to the "
125 | "BeautifulSoup constructor. Suggest you use "
126 | "features='lxml' for HTML and features='lxml-xml' for "
127 | "XML.")
128 |
129 | def deprecated_argument(old_name, new_name):
130 | if old_name in kwargs:
131 | warnings.warn(
132 | 'The "%s" argument to the BeautifulSoup constructor '
133 | 'has been renamed to "%s."' % (old_name, new_name))
134 | value = kwargs[old_name]
135 | del kwargs[old_name]
136 | return value
137 | return None
138 |
139 | parse_only = parse_only or deprecated_argument(
140 | "parseOnlyThese", "parse_only")
141 |
142 | from_encoding = from_encoding or deprecated_argument(
143 | "fromEncoding", "from_encoding")
144 |
145 | if from_encoding and isinstance(markup, unicode):
146 | warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
147 | from_encoding = None
148 |
149 | if len(kwargs) > 0:
150 | arg = kwargs.keys().pop()
151 | raise TypeError(
152 | "__init__() got an unexpected keyword argument '%s'" % arg)
153 |
154 | if builder is None:
155 | original_features = features
156 | if isinstance(features, basestring):
157 | features = [features]
158 | if features is None or len(features) == 0:
159 | features = self.DEFAULT_BUILDER_FEATURES
160 | builder_class = builder_registry.lookup(*features)
161 | if builder_class is None:
162 | raise FeatureNotFound(
163 | "Couldn't find a tree builder with the features you "
164 | "requested: %s. Do you need to install a parser library?"
165 | % ",".join(features))
166 | builder = builder_class()
167 | if not (original_features == builder.NAME or
168 | original_features in builder.ALTERNATE_NAMES):
169 | if builder.is_xml:
170 | markup_type = "XML"
171 | else:
172 | markup_type = "HTML"
173 |
174 | caller = traceback.extract_stack()[0]
175 | filename = caller[0]
176 | line_number = caller[1]
177 | warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
178 | filename=filename,
179 | line_number=line_number,
180 | parser=builder.NAME,
181 | markup_type=markup_type))
182 |
183 | self.builder = builder
184 | self.is_xml = builder.is_xml
185 | self.known_xml = self.is_xml
186 | self.builder.soup = self
187 |
188 | self.parse_only = parse_only
189 |
190 | if hasattr(markup, 'read'): # It's a file-type object.
191 | markup = markup.read()
192 | elif len(markup) <= 256 and (
193 | (isinstance(markup, bytes) and not b'<' in markup)
194 | or (isinstance(markup, unicode) and not u'<' in markup)
195 | ):
196 | # Print out warnings for a couple beginner problems
197 | # involving passing non-markup to Beautiful Soup.
198 | # Beautiful Soup will still parse the input as markup,
199 | # just in case that's what the user really wants.
200 | if (isinstance(markup, unicode)
201 | and not os.path.supports_unicode_filenames):
202 | possible_filename = markup.encode("utf8")
203 | else:
204 | possible_filename = markup
205 | is_file = False
206 | try:
207 | is_file = os.path.exists(possible_filename)
208 | except Exception, e:
209 | # This is almost certainly a problem involving
210 | # characters not valid in filenames on this
211 | # system. Just let it go.
212 | pass
213 | if is_file:
214 | if isinstance(markup, unicode):
215 | markup = markup.encode("utf8")
216 | warnings.warn(
217 | '"%s" looks like a filename, not markup. You should'
218 | 'probably open this file and pass the filehandle into'
219 | 'Beautiful Soup.' % markup)
220 | self._check_markup_is_url(markup)
221 |
222 | for (self.markup, self.original_encoding, self.declared_html_encoding,
223 | self.contains_replacement_characters) in (
224 | self.builder.prepare_markup(
225 | markup, from_encoding, exclude_encodings=exclude_encodings)):
226 | self.reset()
227 | try:
228 | self._feed()
229 | break
230 | except ParserRejectedMarkup:
231 | pass
232 |
233 | # Clear out the markup and remove the builder's circular
234 | # reference to this object.
235 | self.markup = None
236 | self.builder.soup = None
237 |
238 | def __copy__(self):
239 | copy = type(self)(
240 | self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
241 | )
242 |
243 | # Although we encoded the tree to UTF-8, that may not have
244 | # been the encoding of the original markup. Set the copy's
245 | # .original_encoding to reflect the original object's
246 | # .original_encoding.
247 | copy.original_encoding = self.original_encoding
248 | return copy
249 |
250 | def __getstate__(self):
251 | # Frequently a tree builder can't be pickled.
252 | d = dict(self.__dict__)
253 | if 'builder' in d and not self.builder.picklable:
254 | d['builder'] = None
255 | return d
256 |
257 | @staticmethod
258 | def _check_markup_is_url(markup):
259 | """
260 | Check if markup looks like it's actually a url and raise a warning
261 | if so. Markup can be unicode or str (py2) / bytes (py3).
262 | """
263 | if isinstance(markup, bytes):
264 | space = b' '
265 | cant_start_with = (b"http:", b"https:")
266 | elif isinstance(markup, unicode):
267 | space = u' '
268 | cant_start_with = (u"http:", u"https:")
269 | else:
270 | return
271 |
272 | if any(markup.startswith(prefix) for prefix in cant_start_with):
273 | if not space in markup:
274 | if isinstance(markup, bytes):
275 | decoded_markup = markup.decode('utf-8', 'replace')
276 | else:
277 | decoded_markup = markup
278 | warnings.warn(
279 | '"%s" looks like a URL. Beautiful Soup is not an'
280 | ' HTTP client. You should probably use an HTTP client like'
281 | ' requests to get the document behind the URL, and feed'
282 | ' that document to Beautiful Soup.' % decoded_markup
283 | )
284 |
285 | def _feed(self):
286 | # Convert the document to Unicode.
287 | self.builder.reset()
288 |
289 | self.builder.feed(self.markup)
290 | # Close out any unfinished strings and close all the open tags.
291 | self.endData()
292 | while self.currentTag.name != self.ROOT_TAG_NAME:
293 | self.popTag()
294 |
295 | def reset(self):
296 | Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
297 | self.hidden = 1
298 | self.builder.reset()
299 | self.current_data = []
300 | self.currentTag = None
301 | self.tagStack = []
302 | self.preserve_whitespace_tag_stack = []
303 | self.pushTag(self)
304 |
305 | def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
306 | """Create a new tag associated with this soup."""
307 | return Tag(None, self.builder, name, namespace, nsprefix, attrs)
308 |
309 | def new_string(self, s, subclass=NavigableString):
310 | """Create a new NavigableString associated with this soup."""
311 | return subclass(s)
312 |
313 | def insert_before(self, successor):
314 | raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
315 |
316 | def insert_after(self, successor):
317 | raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
318 |
319 | def popTag(self):
320 | tag = self.tagStack.pop()
321 | if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
322 | self.preserve_whitespace_tag_stack.pop()
323 | #print "Pop", tag.name
324 | if self.tagStack:
325 | self.currentTag = self.tagStack[-1]
326 | return self.currentTag
327 |
328 | def pushTag(self, tag):
329 | #print "Push", tag.name
330 | if self.currentTag:
331 | self.currentTag.contents.append(tag)
332 | self.tagStack.append(tag)
333 | self.currentTag = self.tagStack[-1]
334 | if tag.name in self.builder.preserve_whitespace_tags:
335 | self.preserve_whitespace_tag_stack.append(tag)
336 |
337 | def endData(self, containerClass=NavigableString):
338 | if self.current_data:
339 | current_data = u''.join(self.current_data)
340 | # If whitespace is not preserved, and this string contains
341 | # nothing but ASCII spaces, replace it with a single space
342 | # or newline.
343 | if not self.preserve_whitespace_tag_stack:
344 | strippable = True
345 | for i in current_data:
346 | if i not in self.ASCII_SPACES:
347 | strippable = False
348 | break
349 | if strippable:
350 | if '\n' in current_data:
351 | current_data = '\n'
352 | else:
353 | current_data = ' '
354 |
355 | # Reset the data collector.
356 | self.current_data = []
357 |
358 | # Should we add this string to the tree at all?
359 | if self.parse_only and len(self.tagStack) <= 1 and \
360 | (not self.parse_only.text or \
361 | not self.parse_only.search(current_data)):
362 | return
363 |
364 | o = containerClass(current_data)
365 | self.object_was_parsed(o)
366 |
367 | def object_was_parsed(self, o, parent=None, most_recent_element=None):
368 | """Add an object to the parse tree."""
369 | parent = parent or self.currentTag
370 | previous_element = most_recent_element or self._most_recent_element
371 |
372 | next_element = previous_sibling = next_sibling = None
373 | if isinstance(o, Tag):
374 | next_element = o.next_element
375 | next_sibling = o.next_sibling
376 | previous_sibling = o.previous_sibling
377 | if not previous_element:
378 | previous_element = o.previous_element
379 |
380 | o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
381 |
382 | self._most_recent_element = o
383 | parent.contents.append(o)
384 |
385 | if parent.next_sibling:
386 | # This node is being inserted into an element that has
387 | # already been parsed. Deal with any dangling references.
388 | index = len(parent.contents)-1
389 | while index >= 0:
390 | if parent.contents[index] is o:
391 | break
392 | index -= 1
393 | else:
394 | raise ValueError(
395 | "Error building tree: supposedly %r was inserted "
396 | "into %r after the fact, but I don't see it!" % (
397 | o, parent
398 | )
399 | )
400 | if index == 0:
401 | previous_element = parent
402 | previous_sibling = None
403 | else:
404 | previous_element = previous_sibling = parent.contents[index-1]
405 | if index == len(parent.contents)-1:
406 | next_element = parent.next_sibling
407 | next_sibling = None
408 | else:
409 | next_element = next_sibling = parent.contents[index+1]
410 |
411 | o.previous_element = previous_element
412 | if previous_element:
413 | previous_element.next_element = o
414 | o.next_element = next_element
415 | if next_element:
416 | next_element.previous_element = o
417 | o.next_sibling = next_sibling
418 | if next_sibling:
419 | next_sibling.previous_sibling = o
420 | o.previous_sibling = previous_sibling
421 | if previous_sibling:
422 | previous_sibling.next_sibling = o
423 |
424 | def _popToTag(self, name, nsprefix=None, inclusivePop=True):
425 | """Pops the tag stack up to and including the most recent
426 | instance of the given tag. If inclusivePop is false, pops the tag
427 | stack up to but *not* including the most recent instance of
428 | the given tag."""
429 | #print "Popping to %s" % name
430 | if name == self.ROOT_TAG_NAME:
431 | # The BeautifulSoup object itself can never be popped.
432 | return
433 |
434 | most_recently_popped = None
435 |
436 | stack_size = len(self.tagStack)
437 | for i in range(stack_size - 1, 0, -1):
438 | t = self.tagStack[i]
439 | if (name == t.name and nsprefix == t.prefix):
440 | if inclusivePop:
441 | most_recently_popped = self.popTag()
442 | break
443 | most_recently_popped = self.popTag()
444 |
445 | return most_recently_popped
446 |
447 | def handle_starttag(self, name, namespace, nsprefix, attrs):
448 | """Push a start tag on to the stack.
449 |
450 | If this method returns None, the tag was rejected by the
451 | SoupStrainer. You should proceed as if the tag had not occurred
452 | in the document. For instance, if this was a self-closing tag,
453 | don't call handle_endtag.
454 | """
455 |
456 | # print "Start tag %s: %s" % (name, attrs)
457 | self.endData()
458 |
459 | if (self.parse_only and len(self.tagStack) <= 1
460 | and (self.parse_only.text
461 | or not self.parse_only.search_tag(name, attrs))):
462 | return None
463 |
464 | tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
465 | self.currentTag, self._most_recent_element)
466 | if tag is None:
467 | return tag
468 | if self._most_recent_element:
469 | self._most_recent_element.next_element = tag
470 | self._most_recent_element = tag
471 | self.pushTag(tag)
472 | return tag
473 |
474 | def handle_endtag(self, name, nsprefix=None):
475 | #print "End tag: " + name
476 | self.endData()
477 | self._popToTag(name, nsprefix)
478 |
479 | def handle_data(self, data):
480 | self.current_data.append(data)
481 |
482 | def decode(self, pretty_print=False,
483 | eventual_encoding=DEFAULT_OUTPUT_ENCODING,
484 | formatter="minimal"):
485 | """Returns a string or Unicode representation of this document.
486 | To get Unicode, pass None for encoding."""
487 |
488 | if self.is_xml:
489 | # Print the XML declaration
490 | encoding_part = ''
491 | if eventual_encoding != None:
492 | encoding_part = ' encoding="%s"' % eventual_encoding
493 | prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
494 | else:
495 | prefix = u''
496 | if not pretty_print:
497 | indent_level = None
498 | else:
499 | indent_level = 0
500 | return prefix + super(BeautifulSoup, self).decode(
501 | indent_level, eventual_encoding, formatter)
502 |
503 | # Alias to make it easier to type import: 'from bs4 import _soup'
504 | _s = BeautifulSoup
505 | _soup = BeautifulSoup
506 |
507 | class BeautifulStoneSoup(BeautifulSoup):
508 | """Deprecated interface to an XML parser."""
509 |
510 | def __init__(self, *args, **kwargs):
511 | kwargs['features'] = 'xml'
512 | warnings.warn(
513 | 'The BeautifulStoneSoup class is deprecated. Instead of using '
514 | 'it, pass features="xml" into the BeautifulSoup constructor.')
515 | super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
516 |
517 |
518 | class StopParsing(Exception):
519 | pass
520 |
521 | class FeatureNotFound(ValueError):
522 | pass
523 |
524 |
525 | #By default, act as an HTML pretty-printer.
526 | if __name__ == '__main__':
527 | import sys
528 | soup = BeautifulSoup(sys.stdin)
529 | print soup.prettify()
530 |
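
A minimal usage sketch for the constructor above, assuming this bundled
copy is importable as lib.bs4 (the import path used by the other modules
in this tree). Naming the parser explicitly avoids the
NO_PARSER_SPECIFIED_WARNING path:

    from lib.bs4 import BeautifulSoup

    # An explicit parser name ("html.parser" ships with the stdlib)
    # suppresses the "no parser was explicitly specified" warning.
    soup = BeautifulSoup("<p class='a b'>hello</p>", "html.parser")
    p = soup.find("p")
    print(p["class"])      # ['a', 'b'] -- 'class' is a cdata-list attribute
    print(soup.prettify())
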
--------------------------------------------------------------------------------
/weeman/lib/bs4/builder/0.txt:
--------------------------------------------------------------------------------
1 | https://github.com/GottModusTermux/
2 |
--------------------------------------------------------------------------------
/weeman/lib/bs4/builder/__init__.py:
--------------------------------------------------------------------------------
1 | # Use of this source code is governed by a BSD-style license that can be
2 | # found in the LICENSE file.
3 |
4 | from collections import defaultdict
5 | import itertools
6 | import sys
7 | from lib.bs4.element import (
8 | CharsetMetaAttributeValue,
9 | ContentMetaAttributeValue,
10 | HTMLAwareEntitySubstitution,
11 | whitespace_re
12 | )
13 |
14 | __all__ = [
15 | 'HTMLTreeBuilder',
16 | 'SAXTreeBuilder',
17 | 'TreeBuilder',
18 | 'TreeBuilderRegistry',
19 | ]
20 |
21 | # Some useful features for a TreeBuilder to have.
22 | FAST = 'fast'
23 | PERMISSIVE = 'permissive'
24 | STRICT = 'strict'
25 | XML = 'xml'
26 | HTML = 'html'
27 | HTML_5 = 'html5'
28 |
29 |
30 | class TreeBuilderRegistry(object):
31 |
32 | def __init__(self):
33 | self.builders_for_feature = defaultdict(list)
34 | self.builders = []
35 |
36 | def register(self, treebuilder_class):
37 | """Register a treebuilder based on its advertised features."""
38 | for feature in treebuilder_class.features:
39 | self.builders_for_feature[feature].insert(0, treebuilder_class)
40 | self.builders.insert(0, treebuilder_class)
41 |
42 | def lookup(self, *features):
43 | if len(self.builders) == 0:
44 | # There are no builders at all.
45 | return None
46 |
47 | if len(features) == 0:
48 | # They didn't ask for any features. Give them the most
49 | # recently registered builder.
50 | return self.builders[0]
51 |
52 | # Go down the list of features in order, and eliminate any builders
53 | # that don't match every feature.
54 | features = list(features)
55 | features.reverse()
56 | candidates = None
57 | candidate_set = None
58 | while len(features) > 0:
59 | feature = features.pop()
60 | we_have_the_feature = self.builders_for_feature.get(feature, [])
61 | if len(we_have_the_feature) > 0:
62 | if candidates is None:
63 | candidates = we_have_the_feature
64 | candidate_set = set(candidates)
65 | else:
66 | # Eliminate any candidates that don't have this feature.
67 | candidate_set = candidate_set.intersection(
68 | set(we_have_the_feature))
69 |
70 | # The only valid candidates are the ones in candidate_set.
71 | # Go through the original list of candidates and pick the first one
72 | # that's in candidate_set.
73 | if candidate_set is None:
74 | return None
75 | for candidate in candidates:
76 | if candidate in candidate_set:
77 | return candidate
78 | return None
79 |
80 | # The BeautifulSoup class will take feature lists from developers and use them
81 | # to look up builders in this registry.
82 | builder_registry = TreeBuilderRegistry()
83 |
84 | class TreeBuilder(object):
85 | """Turn a document into a Beautiful Soup object tree."""
86 |
87 | NAME = "[Unknown tree builder]"
88 | ALTERNATE_NAMES = []
89 | features = []
90 |
91 | is_xml = False
92 | picklable = False
93 | preserve_whitespace_tags = set()
94 | empty_element_tags = None # A tag will be considered an empty-element
95 | # tag when and only when it has no contents.
96 |
97 | # A value for these tag/attribute combinations is a space- or
98 | # comma-separated list of CDATA, rather than a single CDATA.
99 | cdata_list_attributes = {}
100 |
101 |
102 | def __init__(self):
103 | self.soup = None
104 |
105 | def reset(self):
106 | pass
107 |
108 | def can_be_empty_element(self, tag_name):
109 | """Might a tag with this name be an empty-element tag?
110 |
111 | The final markup may or may not actually present this tag as
112 | self-closing.
113 |
114 | For instance: an HTMLBuilder does not consider a <p> tag to be
115 | an empty-element tag (it's not in
116 | HTMLBuilder.empty_element_tags). This means an empty <p> tag
117 | will be presented as "<p></p>", not "<p/>".
118 |
119 | The default implementation has no opinion about which tags are
120 | empty-element tags, so a tag will be presented as an
121 | empty-element tag if and only if it has no contents.
122 | "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
123 | be left alone.
124 | """
125 | if self.empty_element_tags is None:
126 | return True
127 | return tag_name in self.empty_element_tags
128 |
129 | def feed(self, markup):
130 | raise NotImplementedError()
131 |
132 | def prepare_markup(self, markup, user_specified_encoding=None,
133 | document_declared_encoding=None):
134 | return markup, None, None, False
135 |
136 | def test_fragment_to_document(self, fragment):
137 | """Wrap an HTML fragment to make it look like a document.
138 |
139 | Different parsers do this differently. For instance, lxml
140 | introduces an empty <head> tag, and html5lib
141 | doesn't. Abstracting this away lets us write simple tests
142 | which run HTML fragments through the parser and compare the
143 | results against other HTML fragments.
144 |
145 | This method should not be used outside of tests.
146 | """
147 | return fragment
148 |
149 | def set_up_substitutions(self, tag):
150 | return False
151 |
152 | def _replace_cdata_list_attribute_values(self, tag_name, attrs):
153 | """Replaces class="foo bar" with class=["foo", "bar"]
154 |
155 | Modifies its input in place.
156 | """
157 | if not attrs:
158 | return attrs
159 | if self.cdata_list_attributes:
160 | universal = self.cdata_list_attributes.get('*', [])
161 | tag_specific = self.cdata_list_attributes.get(
162 | tag_name.lower(), None)
163 | for attr in attrs.keys():
164 | if attr in universal or (tag_specific and attr in tag_specific):
165 | # We have a "class"-type attribute whose string
166 | # value is a whitespace-separated list of
167 | # values. Split it into a list.
168 | value = attrs[attr]
169 | if isinstance(value, basestring):
170 | values = whitespace_re.split(value)
171 | else:
172 | # html5lib sometimes calls setAttributes twice
173 | # for the same tag when rearranging the parse
174 | # tree. On the second call the attribute value
175 | # here is already a list. If this happens,
176 | # leave the value alone rather than trying to
177 | # split it again.
178 | values = value
179 | attrs[attr] = values
180 | return attrs
181 |
182 | class SAXTreeBuilder(TreeBuilder):
183 | """A Beautiful Soup treebuilder that listens for SAX events."""
184 |
185 | def feed(self, markup):
186 | raise NotImplementedError()
187 |
188 | def close(self):
189 | pass
190 |
191 | def startElement(self, name, attrs):
192 | attrs = dict((key[1], value) for key, value in list(attrs.items()))
193 | #print "Start %s, %r" % (name, attrs)
194 | self.soup.handle_starttag(name, attrs)
195 |
196 | def endElement(self, name):
197 | #print "End %s" % name
198 | self.soup.handle_endtag(name)
199 |
200 | def startElementNS(self, nsTuple, nodeName, attrs):
201 | # Throw away (ns, nodeName) for now.
202 | self.startElement(nodeName, attrs)
203 |
204 | def endElementNS(self, nsTuple, nodeName):
205 | # Throw away (ns, nodeName) for now.
206 | self.endElement(nodeName)
207 | #handler.endElementNS((ns, node.nodeName), node.nodeName)
208 |
209 | def startPrefixMapping(self, prefix, nodeValue):
210 | # Ignore the prefix for now.
211 | pass
212 |
213 | def endPrefixMapping(self, prefix):
214 | # Ignore the prefix for now.
215 | # handler.endPrefixMapping(prefix)
216 | pass
217 |
218 | def characters(self, content):
219 | self.soup.handle_data(content)
220 |
221 | def startDocument(self):
222 | pass
223 |
224 | def endDocument(self):
225 | pass
226 |
227 |
228 | class HTMLTreeBuilder(TreeBuilder):
229 | """This TreeBuilder knows facts about HTML.
230 |
231 | Such as which tags are empty-element tags.
232 | """
233 |
234 | preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
235 | empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
236 | 'spacer', 'link', 'frame', 'base'])
237 |
238 | # The HTML standard defines these attributes as containing a
239 | # space-separated list of values, not a single value. That is,
240 | # class="foo bar" means that the 'class' attribute has two values,
241 | # 'foo' and 'bar', not the single value 'foo bar'. When we
242 | # encounter one of these attributes, we will parse its value into
243 | # a list of values if possible. Upon output, the list will be
244 | # converted back into a string.
245 | cdata_list_attributes = {
246 | "*" : ['class', 'accesskey', 'dropzone'],
247 | "a" : ['rel', 'rev'],
248 | "link" : ['rel', 'rev'],
249 | "td" : ["headers"],
250 | "th" : ["headers"],
251 | "td" : ["headers"],
252 | "form" : ["accept-charset"],
253 | "object" : ["archive"],
254 |
255 | # These are HTML5 specific, as are *.accesskey and *.dropzone above.
256 | "area" : ["rel"],
257 | "icon" : ["sizes"],
258 | "iframe" : ["sandbox"],
259 | "output" : ["for"],
260 | }
261 |
262 | def set_up_substitutions(self, tag):
263 | # We are only interested in <meta> tags
264 | if tag.name != 'meta':
265 | return False
266 |
267 | http_equiv = tag.get('http-equiv')
268 | content = tag.get('content')
269 | charset = tag.get('charset')
270 |
271 | # We are interested in <meta> tags that say what encoding the
272 | # document was originally in. This means HTML 5-style <meta>
273 | # tags that provide the "charset" attribute. It also means
274 | # HTML 4-style <meta> tags that provide the "content"
275 | # attribute and have "http-equiv" set to "content-type".
276 | #
277 | # In both cases we will replace the value of the appropriate
278 | # attribute with a standin object that can take on any
279 | # encoding.
280 | meta_encoding = None
281 | if charset is not None:
282 | # HTML 5 style:
283 | # <meta charset="utf8">
284 | meta_encoding = charset
285 | tag['charset'] = CharsetMetaAttributeValue(charset)
286 |
287 | elif (content is not None and http_equiv is not None
288 | and http_equiv.lower() == 'content-type'):
289 | # HTML 4 style:
290 | # <meta http-equiv="content-type" content="text/html; charset=utf8">
291 | tag['content'] = ContentMetaAttributeValue(content)
292 |
293 | return (meta_encoding is not None)
294 |
295 | def register_treebuilders_from(module):
296 | """Copy TreeBuilders from the given module into this module."""
297 | # I'm fairly sure this is not the best way to do this.
298 | this_module = sys.modules['lib.bs4.builder']
299 | for name in module.__all__:
300 | obj = getattr(module, name)
301 |
302 | if issubclass(obj, TreeBuilder):
303 | setattr(this_module, name, obj)
304 | this_module.__all__.append(name)
305 | # Register the builder while we're at it.
306 | this_module.builder_registry.register(obj)
307 |
308 | class ParserRejectedMarkup(Exception):
309 | pass
310 |
311 | # Builders are registered in reverse order of priority, so that custom
312 | # builder registrations will take precedence. In general, we want lxml
313 | # to take precedence over html5lib, because it's faster. And we only
314 | # want to use HTMLParser as a last result.
315 | from . import _htmlparser
316 | register_treebuilders_from(_htmlparser)
317 | try:
318 | from . import _html5lib
319 | register_treebuilders_from(_html5lib)
320 | except ImportError:
321 | # They don't have html5lib installed.
322 | pass
323 | try:
324 | from . import _lxml
325 | register_treebuilders_from(_lxml)
326 | except ImportError:
327 | # They don't have lxml installed.
328 | pass
329 |
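
A short sketch of how lookup() above resolves a feature list, assuming at
least the bundled _htmlparser builder registered at import time:

    from lib.bs4.builder import builder_registry

    # Features that no registered builder advertises are silently
    # skipped, so ('html', 'fast') still finds a builder even when lxml
    # (the only builder advertising 'fast') is missing. lookup() returns
    # None only if no requested feature matched any builder at all.
    cls = builder_registry.lookup('html', 'fast')
    print(cls.NAME if cls else "no matching builder")
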
--------------------------------------------------------------------------------
/weeman/lib/bs4/builder/_html5lib.py:
--------------------------------------------------------------------------------
1 | # Use of this source code is governed by a BSD-style license that can be
2 | # found in the LICENSE file.
3 |
4 | __all__ = [
5 | 'HTML5TreeBuilder',
6 | ]
7 |
8 | import warnings
9 | from lib.bs4.builder import (
10 | PERMISSIVE,
11 | HTML,
12 | HTML_5,
13 | HTMLTreeBuilder,
14 | )
15 | from lib.bs4.element import (
16 | NamespacedAttribute,
17 | whitespace_re,
18 | )
19 | import lib.html5lib as html5lib
20 | from lib.html5lib.constants import namespaces
21 | from lib.bs4.element import (
22 | Comment,
23 | Doctype,
24 | NavigableString,
25 | Tag,
26 | )
27 |
28 | try:
29 | # Pre-0.99999999
30 | from lib.html5lib.treebuilders import _base as treebuilder_base
31 | new_html5lib = False
32 | except ImportError, e:
33 | # 0.99999999 and up
34 | from lib.html5lib.treebuilders import base as treebuilder_base
35 | new_html5lib = True
36 |
37 | class HTML5TreeBuilder(HTMLTreeBuilder):
38 | """Use html5lib to build a tree."""
39 |
40 | NAME = "html5lib"
41 |
42 | features = [NAME, PERMISSIVE, HTML_5, HTML]
43 |
44 | def prepare_markup(self, markup, user_specified_encoding,
45 | document_declared_encoding=None, exclude_encodings=None):
46 | # Store the user-specified encoding for use later on.
47 | self.user_specified_encoding = user_specified_encoding
48 |
49 | # document_declared_encoding and exclude_encodings aren't used
50 | # ATM because the html5lib TreeBuilder doesn't use
51 | # UnicodeDammit.
52 | if exclude_encodings:
53 | warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
54 | yield (markup, None, None, False)
55 |
56 | # These methods are defined by Beautiful Soup.
57 | def feed(self, markup):
58 | if self.soup.parse_only is not None:
59 | warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
60 | parser = html5lib.HTMLParser(tree=self.create_treebuilder)
61 |
62 | extra_kwargs = dict()
63 | if not isinstance(markup, unicode):
64 | if new_html5lib:
65 | extra_kwargs['override_encoding'] = self.user_specified_encoding
66 | else:
67 | extra_kwargs['encoding'] = self.user_specified_encoding
68 | doc = parser.parse(markup, **extra_kwargs)
69 |
70 | # Set the character encoding detected by the tokenizer.
71 | if isinstance(markup, unicode):
72 | # We need to special-case this because html5lib sets
73 | # charEncoding to UTF-8 if it gets Unicode input.
74 | doc.original_encoding = None
75 | else:
76 | original_encoding = parser.tokenizer.stream.charEncoding[0]
77 | if not isinstance(original_encoding, basestring):
78 | # In 0.99999999 and up, the encoding is an html5lib
79 | # Encoding object. We want to use a string for compatibility
80 | # with other tree builders.
81 | original_encoding = original_encoding.name
82 | doc.original_encoding = original_encoding
83 |
84 | def create_treebuilder(self, namespaceHTMLElements):
85 | self.underlying_builder = TreeBuilderForHtml5lib(
86 | self.soup, namespaceHTMLElements)
87 | return self.underlying_builder
88 |
89 | def test_fragment_to_document(self, fragment):
90 | """See `TreeBuilder`."""
91 | return u'<html><head></head><body>%s</body></html>' % fragment
92 |
93 |
94 | class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
95 |
96 | def __init__(self, soup, namespaceHTMLElements):
97 | self.soup = soup
98 | super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
99 |
100 | def documentClass(self):
101 | self.soup.reset()
102 | return Element(self.soup, self.soup, None)
103 |
104 | def insertDoctype(self, token):
105 | name = token["name"]
106 | publicId = token["publicId"]
107 | systemId = token["systemId"]
108 |
109 | doctype = Doctype.for_name_and_ids(name, publicId, systemId)
110 | self.soup.object_was_parsed(doctype)
111 |
112 | def elementClass(self, name, namespace):
113 | tag = self.soup.new_tag(name, namespace)
114 | return Element(tag, self.soup, namespace)
115 |
116 | def commentClass(self, data):
117 | return TextNode(Comment(data), self.soup)
118 |
119 | def fragmentClass(self):
120 | from lib.bs4 import BeautifulSoup; self.soup = BeautifulSoup("")  # local import avoids a circular import at module load
121 | self.soup.name = "[document_fragment]"
122 | return Element(self.soup, self.soup, None)
123 |
124 | def appendChild(self, node):
125 | # XXX This code is not covered by the BS4 tests.
126 | self.soup.append(node.element)
127 |
128 | def getDocument(self):
129 | return self.soup
130 |
131 | def getFragment(self):
132 | return treebuilder_base.TreeBuilder.getFragment(self).element
133 |
134 | class AttrList(object):
135 | def __init__(self, element):
136 | self.element = element
137 | self.attrs = dict(self.element.attrs)
138 | def __iter__(self):
139 | return list(self.attrs.items()).__iter__()
140 | def __setitem__(self, name, value):
141 | # If this attribute is a multi-valued attribute for this element,
142 | # turn its value into a list.
143 | list_attr = HTML5TreeBuilder.cdata_list_attributes
144 | if (name in list_attr['*']
145 | or (self.element.name in list_attr
146 | and name in list_attr[self.element.name])):
147 | # A node that is being cloned may have already undergone
148 | # this procedure.
149 | if not isinstance(value, list):
150 | value = whitespace_re.split(value)
151 | self.element[name] = value
152 | def items(self):
153 | return list(self.attrs.items())
154 | def keys(self):
155 | return list(self.attrs.keys())
156 | def __len__(self):
157 | return len(self.attrs)
158 | def __getitem__(self, name):
159 | return self.attrs[name]
160 | def __contains__(self, name):
161 | return name in list(self.attrs.keys())
162 |
163 |
164 | class Element(treebuilder_base.Node):
165 | def __init__(self, element, soup, namespace):
166 | treebuilder_base.Node.__init__(self, element.name)
167 | self.element = element
168 | self.soup = soup
169 | self.namespace = namespace
170 |
171 | def appendChild(self, node):
172 | string_child = child = None
173 | if isinstance(node, basestring):
174 | # Some other piece of code decided to pass in a string
175 | # instead of creating a TextElement object to contain the
176 | # string.
177 | string_child = child = node
178 | elif isinstance(node, Tag):
179 | # Some other piece of code decided to pass in a Tag
180 | # instead of creating an Element object to contain the
181 | # Tag.
182 | child = node
183 | elif node.element.__class__ == NavigableString:
184 | string_child = child = node.element
185 | else:
186 | child = node.element
187 |
188 | if not isinstance(child, basestring) and child.parent is not None:
189 | node.element.extract()
190 |
191 | if (string_child and self.element.contents
192 | and self.element.contents[-1].__class__ == NavigableString):
193 | # We are appending a string onto another string.
194 | # TODO This has O(n^2) performance, for input like
195 | # "aaa..."
196 | old_element = self.element.contents[-1]
197 | new_element = self.soup.new_string(old_element + string_child)
198 | old_element.replace_with(new_element)
199 | self.soup._most_recent_element = new_element
200 | else:
201 | if isinstance(node, basestring):
202 | # Create a brand new NavigableString from this string.
203 | child = self.soup.new_string(node)
204 |
205 | # Tell Beautiful Soup to act as if it parsed this element
206 | # immediately after the parent's last descendant. (Or
207 | # immediately after the parent, if it has no children.)
208 | if self.element.contents:
209 | most_recent_element = self.element._last_descendant(False)
210 | elif self.element.next_element is not None:
211 | # Something from further ahead in the parse tree is
212 | # being inserted into this earlier element. This is
213 | # very annoying because it means an expensive search
214 | # for the last element in the tree.
215 | most_recent_element = self.soup._last_descendant()
216 | else:
217 | most_recent_element = self.element
218 |
219 | self.soup.object_was_parsed(
220 | child, parent=self.element,
221 | most_recent_element=most_recent_element)
222 |
223 | def getAttributes(self):
224 | return AttrList(self.element)
225 |
226 | def setAttributes(self, attributes):
227 |
228 | if attributes is not None and len(attributes) > 0:
229 |
230 | converted_attributes = []
231 | for name, value in list(attributes.items()):
232 | if isinstance(name, tuple):
233 | new_name = NamespacedAttribute(*name)
234 | del attributes[name]
235 | attributes[new_name] = value
236 |
237 | self.soup.builder._replace_cdata_list_attribute_values(
238 | self.name, attributes)
239 | for name, value in attributes.items():
240 | self.element[name] = value
241 |
242 | # The attributes may contain variables that need substitution.
243 | # Call set_up_substitutions manually.
244 | #
245 | # The Tag constructor called this method when the Tag was created,
246 | # but we just set/changed the attributes, so call it again.
247 | self.soup.builder.set_up_substitutions(self.element)
248 | attributes = property(getAttributes, setAttributes)
249 |
250 | def insertText(self, data, insertBefore=None):
251 | if insertBefore:
252 | text = TextNode(self.soup.new_string(data), self.soup)
253 | self.insertBefore(text, insertBefore)
254 | else:
255 | self.appendChild(data)
256 |
257 | def insertBefore(self, node, refNode):
258 | index = self.element.index(refNode.element)
259 | if (node.element.__class__ == NavigableString and self.element.contents
260 | and self.element.contents[index-1].__class__ == NavigableString):
261 | # (See comments in appendChild)
262 | old_node = self.element.contents[index-1]
263 | new_str = self.soup.new_string(old_node + node.element)
264 | old_node.replace_with(new_str)
265 | else:
266 | self.element.insert(index, node.element)
267 | node.parent = self
268 |
269 | def removeChild(self, node):
270 | node.element.extract()
271 |
272 | def reparentChildren(self, new_parent):
273 | """Move all of this tag's children into another tag."""
274 | # print "MOVE", self.element.contents
275 | # print "FROM", self.element
276 | # print "TO", new_parent.element
277 | element = self.element
278 | new_parent_element = new_parent.element
279 | # Determine what this tag's next_element will be once all the children
280 | # are removed.
281 | final_next_element = element.next_sibling
282 |
283 | new_parents_last_descendant = new_parent_element._last_descendant(False, False)
284 | if len(new_parent_element.contents) > 0:
285 | # The new parent already contains children. We will be
286 | # appending this tag's children to the end.
287 | new_parents_last_child = new_parent_element.contents[-1]
288 | new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
289 | else:
290 | # The new parent contains no children.
291 | new_parents_last_child = None
292 | new_parents_last_descendant_next_element = new_parent_element.next_element
293 |
294 | to_append = element.contents
295 | append_after = new_parent_element.contents
296 | if len(to_append) > 0:
297 | # Set the first child's previous_element and previous_sibling
298 | # to elements within the new parent
299 | first_child = to_append[0]
300 | if new_parents_last_descendant:
301 | first_child.previous_element = new_parents_last_descendant
302 | else:
303 | first_child.previous_element = new_parent_element
304 | first_child.previous_sibling = new_parents_last_child
305 | if new_parents_last_descendant:
306 | new_parents_last_descendant.next_element = first_child
307 | else:
308 | new_parent_element.next_element = first_child
309 | if new_parents_last_child:
310 | new_parents_last_child.next_sibling = first_child
311 |
312 | # Fix the last child's next_element and next_sibling
313 | last_child = to_append[-1]
314 | last_child.next_element = new_parents_last_descendant_next_element
315 | if new_parents_last_descendant_next_element:
316 | new_parents_last_descendant_next_element.previous_element = last_child
317 | last_child.next_sibling = None
318 |
319 | for child in to_append:
320 | child.parent = new_parent_element
321 | new_parent_element.contents.append(child)
322 |
323 | # Now that this element has no children, change its .next_element.
324 | element.contents = []
325 | element.next_element = final_next_element
326 |
327 | # print "DONE WITH MOVE"
328 | # print "FROM", self.element
329 | # print "TO", new_parent_element
330 |
331 | def cloneNode(self):
332 | tag = self.soup.new_tag(self.element.name, self.namespace)
333 | node = Element(tag, self.soup, self.namespace)
334 | for key,value in self.attributes:
335 | node.attributes[key] = value
336 | return node
337 |
338 | def hasContent(self):
339 | return self.element.contents
340 |
341 | def getNameTuple(self):
342 | if self.namespace == None:
343 | return namespaces["html"], self.name
344 | else:
345 | return self.namespace, self.name
346 |
347 | nameTuple = property(getNameTuple)
348 |
349 | class TextNode(Element):
350 | def __init__(self, element, soup):
351 | treebuilder_base.Node.__init__(self, None)
352 | self.element = element
353 | self.soup = soup
354 |
355 | def cloneNode(self):
356 | raise NotImplementedError
357 |
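
A sketch of selecting this builder by its feature name, assuming html5lib
is actually installed so the registration in builder/__init__.py succeeded:

    from lib.bs4 import BeautifulSoup

    # html5lib repairs markup the way browsers do: it supplies the
    # missing <html>/<head>/<body> scaffolding and closes the open <p>.
    soup = BeautifulSoup("<p>unclosed", "html5lib")
    print(soup.body.p.string)  # u'unclosed'
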
--------------------------------------------------------------------------------
/weeman/lib/bs4/builder/_htmlparser.py:
--------------------------------------------------------------------------------
1 | """Use the HTMLParser library to parse HTML files that aren't too bad."""
2 |
3 | # Use of this source code is governed by a BSD-style license that can be
4 | # found in the LICENSE file.
5 |
6 | __all__ = [
7 | 'HTMLParserTreeBuilder',
8 | ]
9 |
10 | from HTMLParser import HTMLParser
11 |
12 | try:
13 | from HTMLParser import HTMLParseError
14 | except ImportError, e:
15 | # HTMLParseError is removed in Python 3.5. Since it can never be
16 | # thrown in 3.5, we can just define our own class as a placeholder.
17 | class HTMLParseError(Exception):
18 | pass
19 |
20 | import sys
21 | import warnings
22 |
23 | # Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
24 | # argument, which we'd like to set to False. Unfortunately,
25 | # http://bugs.python.org/issue13273 makes strict=True a better bet
26 | # before Python 3.2.3.
27 | #
28 | # At the end of this file, we monkeypatch HTMLParser so that
29 | # strict=True works well on Python 3.2.2.
30 | major, minor, release = sys.version_info[:3]
31 | CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
32 | CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
33 | CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
34 |
35 |
36 | from lib.bs4.element import (
37 | CData,
38 | Comment,
39 | Declaration,
40 | Doctype,
41 | ProcessingInstruction,
42 | )
43 | from lib.bs4.dammit import EntitySubstitution, UnicodeDammit
44 |
45 | from lib.bs4.builder import (
46 | HTML,
47 | HTMLTreeBuilder,
48 | STRICT,
49 | )
50 |
51 |
52 | HTMLPARSER = 'html.parser'
53 |
54 | class BeautifulSoupHTMLParser(HTMLParser):
55 | def handle_starttag(self, name, attrs):
56 | # XXX namespace
57 | attr_dict = {}
58 | for key, value in attrs:
59 | # Change None attribute values to the empty string
60 | # for consistency with the other tree builders.
61 | if value is None:
62 | value = ''
63 | attr_dict[key] = value
64 | attrvalue = '""'
65 | self.soup.handle_starttag(name, None, None, attr_dict)
66 |
67 | def handle_endtag(self, name):
68 | self.soup.handle_endtag(name)
69 |
70 | def handle_data(self, data):
71 | self.soup.handle_data(data)
72 |
73 | def handle_charref(self, name):
74 | # XXX workaround for a bug in HTMLParser. Remove this once
75 | # it's fixed in all supported versions.
76 | # http://bugs.python.org/issue13633
77 | if name.startswith('x'):
78 | real_name = int(name.lstrip('x'), 16)
79 | elif name.startswith('X'):
80 | real_name = int(name.lstrip('X'), 16)
81 | else:
82 | real_name = int(name)
83 |
84 | try:
85 | data = unichr(real_name)
86 | except (ValueError, OverflowError), e:
87 | data = u"\N{REPLACEMENT CHARACTER}"
88 |
89 | self.handle_data(data)
90 |
91 | def handle_entityref(self, name):
92 | character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
93 | if character is not None:
94 | data = character
95 | else:
96 | data = "&%s;" % name
97 | self.handle_data(data)
98 |
99 | def handle_comment(self, data):
100 | self.soup.endData()
101 | self.soup.handle_data(data)
102 | self.soup.endData(Comment)
103 |
104 | def handle_decl(self, data):
105 | self.soup.endData()
106 | if data.startswith("DOCTYPE "):
107 | data = data[len("DOCTYPE "):]
108 | elif data == 'DOCTYPE':
109 | # i.e. "<!DOCTYPE>"
110 | data = ''
111 | self.soup.handle_data(data)
112 | self.soup.endData(Doctype)
113 |
114 | def unknown_decl(self, data):
115 | if data.upper().startswith('CDATA['):
116 | cls = CData
117 | data = data[len('CDATA['):]
118 | else:
119 | cls = Declaration
120 | self.soup.endData()
121 | self.soup.handle_data(data)
122 | self.soup.endData(cls)
123 |
124 | def handle_pi(self, data):
125 | self.soup.endData()
126 | self.soup.handle_data(data)
127 | self.soup.endData(ProcessingInstruction)
128 |
129 |
130 | class HTMLParserTreeBuilder(HTMLTreeBuilder):
131 |
132 | is_xml = False
133 | picklable = True
134 | NAME = HTMLPARSER
135 | features = [NAME, HTML, STRICT]
136 |
137 | def __init__(self, *args, **kwargs):
138 | if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
139 | kwargs['strict'] = False
140 | if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
141 | kwargs['convert_charrefs'] = False
142 | self.parser_args = (args, kwargs)
143 |
144 | def prepare_markup(self, markup, user_specified_encoding=None,
145 | document_declared_encoding=None, exclude_encodings=None):
146 | """
147 | :return: A 4-tuple (markup, original encoding, encoding
148 | declared within markup, whether any characters had to be
149 | replaced with REPLACEMENT CHARACTER).
150 | """
151 | if isinstance(markup, unicode):
152 | yield (markup, None, None, False)
153 | return
154 |
155 | try_encodings = [user_specified_encoding, document_declared_encoding]
156 | dammit = UnicodeDammit(markup, try_encodings, is_html=True,
157 | exclude_encodings=exclude_encodings)
158 | yield (dammit.markup, dammit.original_encoding,
159 | dammit.declared_html_encoding,
160 | dammit.contains_replacement_characters)
161 |
162 | def feed(self, markup):
163 | args, kwargs = self.parser_args
164 | parser = BeautifulSoupHTMLParser(*args, **kwargs)
165 | parser.soup = self.soup
166 | try:
167 | parser.feed(markup)
168 | except HTMLParseError, e:
169 | warnings.warn(RuntimeWarning(
170 | "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
171 | raise e
172 |
173 | # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
174 | # 3.2.3 code. This ensures they don't treat markup like
175 | # <a href="http://foo.com/"> as a string.
176 | #
177 | # XXX This code can be removed once most Python 3 users are on 3.2.3.
178 | if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
179 | import re
180 | attrfind_tolerant = re.compile(
181 | r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
182 | r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
183 | HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
184 |
185 | locatestarttagend = re.compile(r"""
186 | <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
187 | (?:\s+ # whitespace before attribute name
188 | (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
189 | (?:\s*=\s* # value indicator
190 | (?:'[^']*' # LITA-enclosed value
191 | |\"[^\"]*\" # LIT-enclosed value
192 | |[^'\">\s]+ # bare value
193 | )
194 | )?
195 | )
196 | )*
197 | \s* # trailing whitespace
198 | """, re.VERBOSE)
199 | BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
200 |
201 | from html.parser import tagfind, attrfind
202 |
203 | def parse_starttag(self, i):
204 | self.__starttag_text = None
205 | endpos = self.check_for_whole_start_tag(i)
206 | if endpos < 0:
207 | return endpos
208 | rawdata = self.rawdata
209 | self.__starttag_text = rawdata[i:endpos]
210 |
211 | # Now parse the data between i+1 and j into a tag and attrs
212 | attrs = []
213 | match = tagfind.match(rawdata, i+1)
214 | assert match, 'unexpected call to parse_starttag()'
215 | k = match.end()
216 | self.lasttag = tag = rawdata[i+1:k].lower()
217 | while k < endpos:
218 | if self.strict:
219 | m = attrfind.match(rawdata, k)
220 | else:
221 | m = attrfind_tolerant.match(rawdata, k)
222 | if not m:
223 | break
224 | attrname, rest, attrvalue = m.group(1, 2, 3)
225 | if not rest:
226 | attrvalue = None
227 | elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
228 | attrvalue[:1] == '"' == attrvalue[-1:]:
229 | attrvalue = attrvalue[1:-1]
230 | if attrvalue:
231 | attrvalue = self.unescape(attrvalue)
232 | attrs.append((attrname.lower(), attrvalue))
233 | k = m.end()
234 |
235 | end = rawdata[k:endpos].strip()
236 | if end not in (">", "/>"):
237 | lineno, offset = self.getpos()
238 | if "\n" in self.__starttag_text:
239 | lineno = lineno + self.__starttag_text.count("\n")
240 | offset = len(self.__starttag_text) \
241 | - self.__starttag_text.rfind("\n")
242 | else:
243 | offset = offset + len(self.__starttag_text)
244 | if self.strict:
245 | self.error("junk characters in start tag: %r"
246 | % (rawdata[k:endpos][:20],))
247 | self.handle_data(rawdata[i:endpos])
248 | return endpos
249 | if end.endswith('/>'):
250 | # XHTML-style empty tag:
251 | self.handle_startendtag(tag, attrs)
252 | else:
253 | self.handle_starttag(tag, attrs)
254 | if tag in self.CDATA_CONTENT_ELEMENTS:
255 | self.set_cdata_mode(tag)
256 | return endpos
257 |
258 | def set_cdata_mode(self, elem):
259 | self.cdata_elem = elem.lower()
260 | self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
261 |
262 | BeautifulSoupHTMLParser.parse_starttag = parse_starttag
263 | BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
264 |
265 | CONSTRUCTOR_TAKES_STRICT = True
266 |
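
A sketch of the reference handling above (handle_charref and
handle_entityref), using this stdlib-backed builder:

    from lib.bs4 import BeautifulSoup

    # A decimal reference, a hex reference, and a named entity; all
    # three are converted and joined into a single NavigableString.
    soup = BeautifulSoup("<p>&#65;&#x42;&amp;</p>", "html.parser")
    print(soup.p.string)  # u'AB&'
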
--------------------------------------------------------------------------------
/weeman/lib/bs4/builder/_lxml.py:
--------------------------------------------------------------------------------
1 | # Use of this source code is governed by a BSD-style license that can be
2 | # found in the LICENSE file.
3 | __all__ = [
4 | 'LXMLTreeBuilderForXML',
5 | 'LXMLTreeBuilder',
6 | ]
7 |
8 | from io import BytesIO
9 | from StringIO import StringIO
10 | import collections
11 | from lxml import etree
12 | from lib.bs4.element import (
13 | Comment,
14 | Doctype,
15 | NamespacedAttribute,
16 | ProcessingInstruction,
17 | XMLProcessingInstruction,
18 | )
19 | from lib.bs4.builder import (
20 | FAST,
21 | HTML,
22 | HTMLTreeBuilder,
23 | PERMISSIVE,
24 | ParserRejectedMarkup,
25 | TreeBuilder,
26 | XML)
27 | from lib.bs4.dammit import EncodingDetector
28 |
29 | LXML = 'lxml'
30 |
31 | class LXMLTreeBuilderForXML(TreeBuilder):
32 | DEFAULT_PARSER_CLASS = etree.XMLParser
33 |
34 | is_xml = True
35 | processing_instruction_class = XMLProcessingInstruction
36 |
37 | NAME = "lxml-xml"
38 | ALTERNATE_NAMES = ["xml"]
39 |
40 | # Well, it's permissive by XML parser standards.
41 | features = [NAME, LXML, XML, FAST, PERMISSIVE]
42 |
43 | CHUNK_SIZE = 512
44 |
45 | # This namespace mapping is specified in the XML Namespace
46 | # standard.
47 | DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
48 |
49 | def default_parser(self, encoding):
50 | # This can either return a parser object or a class, which
51 | # will be instantiated with default arguments.
52 | if self._default_parser is not None:
53 | return self._default_parser
54 | return etree.XMLParser(
55 | target=self, strip_cdata=False, recover=True, encoding=encoding)
56 |
57 | def parser_for(self, encoding):
58 | # Use the default parser.
59 | parser = self.default_parser(encoding)
60 |
61 | if isinstance(parser, collections.Callable):
62 | # Instantiate the parser with default arguments
63 | parser = parser(target=self, strip_cdata=False, encoding=encoding)
64 | return parser
65 |
66 | def __init__(self, parser=None, empty_element_tags=None):
67 | # TODO: Issue a warning if parser is present but not a
68 | # callable, since that means there's no way to create new
69 | # parsers for different encodings.
70 | self._default_parser = parser
71 | if empty_element_tags is not None:
72 | self.empty_element_tags = set(empty_element_tags)
73 | self.soup = None
74 | self.nsmaps = [self.DEFAULT_NSMAPS]
75 |
76 | def _getNsTag(self, tag):
77 | # Split the namespace URL out of a fully-qualified lxml tag
78 | # name. Copied from lxml's src/lxml/sax.py.
79 | if tag[0] == '{':
80 | return tuple(tag[1:].split('}', 1))
81 | else:
82 | return (None, tag)
83 |
84 | def prepare_markup(self, markup, user_specified_encoding=None,
85 | exclude_encodings=None,
86 | document_declared_encoding=None):
87 | """
88 | :yield: A series of 4-tuples.
89 | (markup, encoding, declared encoding,
90 | has undergone character replacement)
91 |
92 | Each 4-tuple represents a strategy for parsing the document.
93 | """
94 | # Instead of using UnicodeDammit to convert the bytestring to
95 | # Unicode using different encodings, use EncodingDetector to
96 | # iterate over the encodings, and tell lxml to try to parse
97 | # the document as each one in turn.
98 | is_html = not self.is_xml
99 | if is_html:
100 | self.processing_instruction_class = ProcessingInstruction
101 | else:
102 | self.processing_instruction_class = XMLProcessingInstruction
103 |
104 | if isinstance(markup, unicode):
105 | # We were given Unicode. Maybe lxml can parse Unicode on
106 | # this system?
107 | yield markup, None, document_declared_encoding, False
108 |
109 | if isinstance(markup, unicode):
110 | # No, apparently not. Convert the Unicode to UTF-8 and
111 | # tell lxml to parse it as UTF-8.
112 | yield (markup.encode("utf8"), "utf8",
113 | document_declared_encoding, False)
114 |
115 | try_encodings = [user_specified_encoding, document_declared_encoding]
116 | detector = EncodingDetector(
117 | markup, try_encodings, is_html, exclude_encodings)
118 | for encoding in detector.encodings:
119 | yield (detector.markup, encoding, document_declared_encoding, False)
120 |
121 | def feed(self, markup):
122 | if isinstance(markup, bytes):
123 | markup = BytesIO(markup)
124 | elif isinstance(markup, unicode):
125 | markup = StringIO(markup)
126 |
127 | # Call feed() at least once, even if the markup is empty,
128 | # or the parser won't be initialized.
129 | data = markup.read(self.CHUNK_SIZE)
130 | try:
131 | self.parser = self.parser_for(self.soup.original_encoding)
132 | self.parser.feed(data)
133 | while len(data) != 0:
134 | # Now call feed() on the rest of the data, chunk by chunk.
135 | data = markup.read(self.CHUNK_SIZE)
136 | if len(data) != 0:
137 | self.parser.feed(data)
138 | self.parser.close()
139 | except (UnicodeDecodeError, LookupError, etree.ParserError), e:
140 | raise ParserRejectedMarkup(str(e))
141 |
142 | def close(self):
143 | self.nsmaps = [self.DEFAULT_NSMAPS]
144 |
145 | def start(self, name, attrs, nsmap={}):
146 | # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
147 | attrs = dict(attrs)
148 | nsprefix = None
149 | # Invert each namespace map as it comes in.
150 | if len(self.nsmaps) > 1:
151 | # There are no new namespaces for this tag, but
152 | # non-default namespaces are in play, so we need a
153 | # separate tag stack to know when they end.
154 | self.nsmaps.append(None)
155 | elif len(nsmap) > 0:
156 | # A new namespace mapping has come into play.
157 | inverted_nsmap = dict((value, key) for key, value in nsmap.items())
158 | self.nsmaps.append(inverted_nsmap)
159 | # Also treat the namespace mapping as a set of attributes on the
160 | # tag, so we can recreate it later.
161 | attrs = attrs.copy()
162 | for prefix, namespace in nsmap.items():
163 | attribute = NamespacedAttribute(
164 | "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
165 | attrs[attribute] = namespace
166 |
167 | # Namespaces are in play. Find any attributes that came in
168 | # from lxml with namespaces attached to their names, and
169 | # turn them into NamespacedAttribute objects.
170 | new_attrs = {}
171 | for attr, value in attrs.items():
172 | namespace, attr = self._getNsTag(attr)
173 | if namespace is None:
174 | new_attrs[attr] = value
175 | else:
176 | nsprefix = self._prefix_for_namespace(namespace)
177 | attr = NamespacedAttribute(nsprefix, attr, namespace)
178 | new_attrs[attr] = value
179 | attrs = new_attrs
180 |
181 | namespace, name = self._getNsTag(name)
182 | nsprefix = self._prefix_for_namespace(namespace)
183 | self.soup.handle_starttag(name, namespace, nsprefix, attrs)
184 |
185 | def _prefix_for_namespace(self, namespace):
186 | """Find the currently active prefix for the given namespace."""
187 | if namespace is None:
188 | return None
189 | for inverted_nsmap in reversed(self.nsmaps):
190 | if inverted_nsmap is not None and namespace in inverted_nsmap:
191 | return inverted_nsmap[namespace]
192 | return None
193 |
194 | def end(self, name):
195 | self.soup.endData()
196 | completed_tag = self.soup.tagStack[-1]
197 | namespace, name = self._getNsTag(name)
198 | nsprefix = None
199 | if namespace is not None:
200 | for inverted_nsmap in reversed(self.nsmaps):
201 | if inverted_nsmap is not None and namespace in inverted_nsmap:
202 | nsprefix = inverted_nsmap[namespace]
203 | break
204 | self.soup.handle_endtag(name, nsprefix)
205 | if len(self.nsmaps) > 1:
206 | # This tag, or one of its parents, introduced a namespace
207 | # mapping, so pop it off the stack.
208 | self.nsmaps.pop()
209 |
210 | def pi(self, target, data):
211 | self.soup.endData()
212 | self.soup.handle_data(target + ' ' + data)
213 | self.soup.endData(self.processing_instruction_class)
214 |
215 | def data(self, content):
216 | self.soup.handle_data(content)
217 |
218 | def doctype(self, name, pubid, system):
219 | self.soup.endData()
220 | doctype = Doctype.for_name_and_ids(name, pubid, system)
221 | self.soup.object_was_parsed(doctype)
222 |
223 | def comment(self, content):
224 | "Handle comments as Comment objects."
225 | self.soup.endData()
226 | self.soup.handle_data(content)
227 | self.soup.endData(Comment)
228 |
229 | def test_fragment_to_document(self, fragment):
230 | """See `TreeBuilder`."""
231 | return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
232 |
233 |
234 | class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
235 |
236 | NAME = LXML
237 | ALTERNATE_NAMES = ["lxml-html"]
238 |
239 | features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
240 | is_xml = False
241 | processing_instruction_class = ProcessingInstruction
242 |
243 | def default_parser(self, encoding):
244 | return etree.HTMLParser
245 |
246 | def feed(self, markup):
247 | encoding = self.soup.original_encoding
248 | try:
249 | self.parser = self.parser_for(encoding)
250 | self.parser.feed(markup)
251 | self.parser.close()
252 | except (UnicodeDecodeError, LookupError, etree.ParserError), e:
253 | raise ParserRejectedMarkup(str(e))
254 |
255 |
256 | def test_fragment_to_document(self, fragment):
257 | """See `TreeBuilder`."""
258 | return u'<html><body>%s</body></html>' % fragment
259 |
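# --- Editor's note: illustrative usage sketch, not part of the vendored file. ---
# The two builders register under the feature names declared above, so callers
# select them by feature string rather than by class:
#
#     from lib.bs4 import BeautifulSoup
#     xml_soup  = BeautifulSoup("<a><b/></a>", "lxml-xml")  # LXMLTreeBuilderForXML
#     html_soup = BeautifulSoup("<p>unclosed", "lxml")      # LXMLTreeBuilder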
--------------------------------------------------------------------------------
/weeman/lib/bs4/dammit.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Beautiful Soup bonus library: Unicode, Dammit
3 |
4 | This library converts a bytestream to Unicode through any means
5 | necessary. It is heavily based on code from Mark Pilgrim's Universal
6 | Feed Parser. It works best on XML and HTML, but it does not rewrite the
7 | XML or HTML to reflect a new encoding; that's the tree builder's job.
8 | """
9 | # Use of this source code is governed by a BSD-style license that can be
10 | # found in the LICENSE file.
11 | __license__ = "MIT"
12 |
13 | import codecs
14 | from htmlentitydefs import codepoint2name
15 | import re
16 | import logging
17 | import string
18 |
19 | # Import a library to autodetect character encodings.
20 | chardet_type = None
21 | try:
22 | # First try the fast C implementation.
23 | # PyPI package: cchardet
24 | import cchardet
25 | def chardet_dammit(s):
26 | return cchardet.detect(s)['encoding']
27 | except ImportError:
28 | try:
29 | # Fall back to the pure Python implementation
30 | # Debian package: python-chardet
31 | # PyPI package: chardet
32 | import chardet
33 | def chardet_dammit(s):
34 | return chardet.detect(s)['encoding']
35 | #import chardet.constants
36 | #chardet.constants._debug = 1
37 | except ImportError:
38 | # No chardet available.
39 | def chardet_dammit(s):
40 | return None
41 |
42 | # Available from http://cjkpython.i18n.org/.
43 | try:
44 | import iconv_codec
45 | except ImportError:
46 | pass
47 |
48 | xml_encoding_re = re.compile(
49 | '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
50 | html_meta_re = re.compile(
51 | '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
52 |
53 | class EntitySubstitution(object):
54 |
55 | """Substitute XML or HTML entities for the corresponding characters."""
56 |
57 | def _populate_class_variables():
58 | lookup = {}
59 | reverse_lookup = {}
60 | characters_for_re = []
61 | for codepoint, name in list(codepoint2name.items()):
62 | character = unichr(codepoint)
63 | if codepoint != 34:
64 | # There's no point in turning the quotation mark into
65 | # ", unless it happens within an attribute value, which
66 | # is handled elsewhere.
67 | characters_for_re.append(character)
68 | lookup[character] = name
69 | # But we do want to turn &quot; into the quotation mark.
70 | reverse_lookup[name] = character
71 | re_definition = "[%s]" % "".join(characters_for_re)
72 | return lookup, reverse_lookup, re.compile(re_definition)
73 | (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
74 | CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
75 |
76 | CHARACTER_TO_XML_ENTITY = {
77 | "'": "apos",
78 | '"': "quot",
79 | "&": "amp",
80 | "<": "lt",
81 | ">": "gt",
82 | }
83 |
84 | BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
85 | "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
86 | ")")
87 |
88 | AMPERSAND_OR_BRACKET = re.compile("([<>&])")
89 |
90 | @classmethod
91 | def _substitute_html_entity(cls, matchobj):
92 | entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
93 | return "&%s;" % entity
94 |
95 | @classmethod
96 | def _substitute_xml_entity(cls, matchobj):
97 | """Used with a regular expression to substitute the
98 | appropriate XML entity for an XML special character."""
99 | entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
100 | return "&%s;" % entity
101 |
102 | @classmethod
103 | def quoted_attribute_value(self, value):
104 | """Make a value into a quoted XML attribute, possibly escaping it.
105 |
106 | Most strings will be quoted using double quotes.
107 |
108 | Bob's Bar -> "Bob's Bar"
109 |
110 | If a string contains double quotes, it will be quoted using
111 | single quotes.
112 |
113 | Welcome to "my bar" -> 'Welcome to "my bar"'
114 |
115 | If a string contains both single and double quotes, the
116 | double quotes will be escaped, and the string will be quoted
117 | using double quotes.
118 |
119 | Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"
120 | """
121 | quote_with = '"'
122 | if '"' in value:
123 | if "'" in value:
124 | # The string contains both single and double
125 | # quotes. Turn the double quotes into
126 | # entities. We quote the double quotes rather than
127 | # the single quotes because the entity name is
128 | # """ whether this is HTML or XML. If we
129 | # quoted the single quotes, we'd have to decide
130 | # between ' and &squot;.
131 | replace_with = """
132 | value = value.replace('"', replace_with)
133 | else:
134 | # There are double quotes but no single quotes.
135 | # We can use single quotes to quote the attribute.
136 | quote_with = "'"
137 | return quote_with + value + quote_with
138 |
139 | @classmethod
140 | def substitute_xml(cls, value, make_quoted_attribute=False):
141 | """Substitute XML entities for special XML characters.
142 |
143 | :param value: A string to be substituted. The less-than sign
144 | will become &lt;, the greater-than sign will become &gt;,
145 | and any ampersands will become &amp;. If you want ampersands
146 | that appear to be part of an entity definition to be left
147 | alone, use substitute_xml_containing_entities() instead.
148 |
149 | :param make_quoted_attribute: If True, then the string will be
150 | quoted, as befits an attribute value.
151 | """
152 | # Escape angle brackets and ampersands.
153 | value = cls.AMPERSAND_OR_BRACKET.sub(
154 | cls._substitute_xml_entity, value)
155 |
156 | if make_quoted_attribute:
157 | value = cls.quoted_attribute_value(value)
158 | return value
159 |
160 | @classmethod
161 | def substitute_xml_containing_entities(
162 | cls, value, make_quoted_attribute=False):
163 | """Substitute XML entities for special XML characters.
164 |
165 | :param value: A string to be substituted. The less-than sign will
166 | become &lt;, the greater-than sign will become &gt;, and any
167 | ampersands that are not part of an entity definition will
168 | become &amp;.
169 |
170 | :param make_quoted_attribute: If True, then the string will be
171 | quoted, as befits an attribute value.
172 | """
173 | # Escape angle brackets, and ampersands that aren't part of
174 | # entities.
175 | value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
176 | cls._substitute_xml_entity, value)
177 |
178 | if make_quoted_attribute:
179 | value = cls.quoted_attribute_value(value)
180 | return value
181 |
182 | @classmethod
183 | def substitute_html(cls, s):
184 | """Replace certain Unicode characters with named HTML entities.
185 |
186 | This differs from data.encode(encoding, 'xmlcharrefreplace')
187 | in that the goal is to make the result more readable (to those
188 | with ASCII displays) rather than to recover from
189 | errors. There's absolutely nothing wrong with a UTF-8 string
190 | containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
191 | character with "&eacute;" will make it more readable to some
192 | people.
193 | """
194 | return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
195 | cls._substitute_html_entity, s)
196 |
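# --- Editor's note: illustrative usage sketch, not part of the vendored file. ---
#
#     from lib.bs4.dammit import EntitySubstitution
#     EntitySubstitution.substitute_xml("AT&T", make_quoted_attribute=True)
#     # => '"AT&amp;T"'
#     EntitySubstitution.substitute_html(u"caf\xe9")
#     # => u'caf&eacute;'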
197 |
198 | class EncodingDetector:
199 | """Suggests a number of possible encodings for a bytestring.
200 |
201 | Order of precedence:
202 |
203 | 1. Encodings you specifically tell EncodingDetector to try first
204 | (the override_encodings argument to the constructor).
205 |
206 | 2. An encoding declared within the bytestring itself, either in an
207 | XML declaration (if the bytestring is to be interpreted as an XML
208 | document), or in a tag (if the bytestring is to be
209 | interpreted as an HTML document.)
210 |
211 | 3. An encoding detected through textual analysis by chardet,
212 | cchardet, or a similar external library.
213 |
214 | 4. UTF-8.
215 |
216 | 5. Windows-1252.
217 | """
218 | def __init__(self, markup, override_encodings=None, is_html=False,
219 | exclude_encodings=None):
220 | self.override_encodings = override_encodings or []
221 | exclude_encodings = exclude_encodings or []
222 | self.exclude_encodings = set([x.lower() for x in exclude_encodings])
223 | self.chardet_encoding = None
224 | self.is_html = is_html
225 | self.declared_encoding = None
226 |
227 | # First order of business: strip a byte-order mark.
228 | self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
229 |
230 | def _usable(self, encoding, tried):
231 | if encoding is not None:
232 | encoding = encoding.lower()
233 | if encoding in self.exclude_encodings:
234 | return False
235 | if encoding not in tried:
236 | tried.add(encoding)
237 | return True
238 | return False
239 |
240 | @property
241 | def encodings(self):
242 | """Yield a number of encodings that might work for this markup."""
243 | tried = set()
244 | for e in self.override_encodings:
245 | if self._usable(e, tried):
246 | yield e
247 |
248 | # Did the document originally start with a byte-order mark
249 | # that indicated its encoding?
250 | if self._usable(self.sniffed_encoding, tried):
251 | yield self.sniffed_encoding
252 |
253 | # Look within the document for an XML or HTML encoding
254 | # declaration.
255 | if self.declared_encoding is None:
256 | self.declared_encoding = self.find_declared_encoding(
257 | self.markup, self.is_html)
258 | if self._usable(self.declared_encoding, tried):
259 | yield self.declared_encoding
260 |
261 | # Use third-party character set detection to guess at the
262 | # encoding.
263 | if self.chardet_encoding is None:
264 | self.chardet_encoding = chardet_dammit(self.markup)
265 | if self._usable(self.chardet_encoding, tried):
266 | yield self.chardet_encoding
267 |
268 | # As a last-ditch effort, try utf-8 and windows-1252.
269 | for e in ('utf-8', 'windows-1252'):
270 | if self._usable(e, tried):
271 | yield e
272 |
273 | @classmethod
274 | def strip_byte_order_mark(cls, data):
275 | """If a byte-order mark is present, strip it and return the encoding it implies."""
276 | encoding = None
277 | if isinstance(data, unicode):
278 | # Unicode data cannot have a byte-order mark.
279 | return data, encoding
280 | if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
281 | and (data[2:4] != '\x00\x00'):
282 | encoding = 'utf-16be'
283 | data = data[2:]
284 | elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
285 | and (data[2:4] != '\x00\x00'):
286 | encoding = 'utf-16le'
287 | data = data[2:]
288 | elif data[:3] == b'\xef\xbb\xbf':
289 | encoding = 'utf-8'
290 | data = data[3:]
291 | elif data[:4] == b'\x00\x00\xfe\xff':
292 | encoding = 'utf-32be'
293 | data = data[4:]
294 | elif data[:4] == b'\xff\xfe\x00\x00':
295 | encoding = 'utf-32le'
296 | data = data[4:]
297 | return data, encoding
298 |
299 | @classmethod
300 | def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
301 | """Given a document, tries to find its declared encoding.
302 |
303 | An XML encoding is declared at the beginning of the document.
304 |
305 | An HTML encoding is declared in a <meta> tag, hopefully near the
306 | beginning of the document.
307 | """
308 | if search_entire_document:
309 | xml_endpos = html_endpos = len(markup)
310 | else:
311 | xml_endpos = 1024
312 | html_endpos = max(2048, int(len(markup) * 0.05))
313 |
314 | declared_encoding = None
315 | declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
316 | if not declared_encoding_match and is_html:
317 | declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
318 | if declared_encoding_match is not None:
319 | declared_encoding = declared_encoding_match.groups()[0].decode(
320 | 'ascii', 'replace')
321 | if declared_encoding:
322 | return declared_encoding.lower()
323 | return None
324 |
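# --- Editor's note: illustrative usage sketch, not part of the vendored file. ---
# The encodings property yields candidates in the precedence order documented
# above; here the XML declaration supplies the first usable guess:
#
#     from lib.bs4.dammit import EncodingDetector
#     detector = EncodingDetector(b'<?xml version="1.0" encoding="latin-1"?><a/>')
#     list(detector.encodings)
#     # => ['latin-1', <chardet guess, if installed>, 'utf-8', 'windows-1252']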
325 | class UnicodeDammit:
326 | """A class for detecting the encoding of a *ML document and
327 | converting it to a Unicode string. If the source encoding is
328 | windows-1252, can replace MS smart quotes with their HTML or XML
329 | equivalents."""
330 |
331 | # This dictionary maps commonly seen values for "charset" in HTML
332 | # meta tags to the corresponding Python codec names. It only covers
333 | # values that aren't in Python's aliases and can't be determined
334 | # by the heuristics in find_codec.
335 | CHARSET_ALIASES = {"macintosh": "mac-roman",
336 | "x-sjis": "shift-jis"}
337 |
338 | ENCODINGS_WITH_SMART_QUOTES = [
339 | "windows-1252",
340 | "iso-8859-1",
341 | "iso-8859-2",
342 | ]
343 |
344 | def __init__(self, markup, override_encodings=[],
345 | smart_quotes_to=None, is_html=False, exclude_encodings=[]):
346 | self.smart_quotes_to = smart_quotes_to
347 | self.tried_encodings = []
348 | self.contains_replacement_characters = False
349 | self.is_html = is_html
350 | self.log = logging.getLogger(__name__)
351 | self.detector = EncodingDetector(
352 | markup, override_encodings, is_html, exclude_encodings)
353 |
354 | # Short-circuit if the data is in Unicode to begin with.
355 | if isinstance(markup, unicode) or markup == '':
356 | self.markup = markup
357 | self.unicode_markup = unicode(markup)
358 | self.original_encoding = None
359 | return
360 |
361 | # The encoding detector may have stripped a byte-order mark.
362 | # Use the stripped markup from this point on.
363 | self.markup = self.detector.markup
364 |
365 | u = None
366 | for encoding in self.detector.encodings:
367 | markup = self.detector.markup
368 | u = self._convert_from(encoding)
369 | if u is not None:
370 | break
371 |
372 | if not u:
373 | # None of the encodings worked. As an absolute last resort,
374 | # try them again with character replacement.
375 |
376 | for encoding in self.detector.encodings:
377 | if encoding != "ascii":
378 | u = self._convert_from(encoding, "replace")
379 | if u is not None:
380 | self.log.warning(
381 | "Some characters could not be decoded, and were "
382 | "replaced with REPLACEMENT CHARACTER."
383 | )
384 | self.contains_replacement_characters = True
385 | break
386 |
387 | # If none of that worked, we could at this point force it to
388 | # ASCII, but that would destroy so much data that I think
389 | # giving up is better.
390 | self.unicode_markup = u
391 | if not u:
392 | self.original_encoding = None
393 |
394 | def _sub_ms_char(self, match):
395 | """Changes a MS smart quote character to an XML or HTML
396 | entity, or an ASCII character."""
397 | orig = match.group(1)
398 | if self.smart_quotes_to == 'ascii':
399 | sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
400 | else:
401 | sub = self.MS_CHARS.get(orig)
402 | if type(sub) == tuple:
403 | if self.smart_quotes_to == 'xml':
404 | sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
405 | else:
406 | sub = '&'.encode() + sub[0].encode() + ';'.encode()
407 | else:
408 | sub = sub.encode()
409 | return sub
410 |
411 | def _convert_from(self, proposed, errors="strict"):
412 | proposed = self.find_codec(proposed)
413 | if not proposed or (proposed, errors) in self.tried_encodings:
414 | return None
415 | self.tried_encodings.append((proposed, errors))
416 | markup = self.markup
417 | # Convert smart quotes to HTML if coming from an encoding
418 | # that might have them.
419 | if (self.smart_quotes_to is not None
420 | and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
421 | smart_quotes_re = b"([\x80-\x9f])"
422 | smart_quotes_compiled = re.compile(smart_quotes_re)
423 | markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
424 |
425 | try:
426 | #print "Trying to convert document to %s (errors=%s)" % (
427 | # proposed, errors)
428 | u = self._to_unicode(markup, proposed, errors)
429 | self.markup = u
430 | self.original_encoding = proposed
431 | except Exception as e:
432 | #print "That didn't work!"
433 | #print e
434 | return None
435 | #print "Correct encoding: %s" % proposed
436 | return self.markup
437 |
438 | def _to_unicode(self, data, encoding, errors="strict"):
439 | '''Given a string and its encoding, decodes the string into Unicode.
440 | %encoding is a string recognized by encodings.aliases'''
441 | return unicode(data, encoding, errors)
442 |
443 | @property
444 | def declared_html_encoding(self):
445 | if not self.is_html:
446 | return None
447 | return self.detector.declared_encoding
448 |
449 | def find_codec(self, charset):
450 | value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
451 | or (charset and self._codec(charset.replace("-", "")))
452 | or (charset and self._codec(charset.replace("-", "_")))
453 | or (charset and charset.lower())
454 | or charset
455 | )
456 | if value:
457 | return value.lower()
458 | return None
459 |
460 | def _codec(self, charset):
461 | if not charset:
462 | return charset
463 | codec = None
464 | try:
465 | codecs.lookup(charset)
466 | codec = charset
467 | except (LookupError, ValueError):
468 | pass
469 | return codec
470 |
471 |
472 | # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
473 | MS_CHARS = {b'\x80': ('euro', '20AC'),
474 | b'\x81': ' ',
475 | b'\x82': ('sbquo', '201A'),
476 | b'\x83': ('fnof', '192'),
477 | b'\x84': ('bdquo', '201E'),
478 | b'\x85': ('hellip', '2026'),
479 | b'\x86': ('dagger', '2020'),
480 | b'\x87': ('Dagger', '2021'),
481 | b'\x88': ('circ', '2C6'),
482 | b'\x89': ('permil', '2030'),
483 | b'\x8A': ('Scaron', '160'),
484 | b'\x8B': ('lsaquo', '2039'),
485 | b'\x8C': ('OElig', '152'),
486 | b'\x8D': '?',
487 | b'\x8E': ('#x17D', '17D'),
488 | b'\x8F': '?',
489 | b'\x90': '?',
490 | b'\x91': ('lsquo', '2018'),
491 | b'\x92': ('rsquo', '2019'),
492 | b'\x93': ('ldquo', '201C'),
493 | b'\x94': ('rdquo', '201D'),
494 | b'\x95': ('bull', '2022'),
495 | b'\x96': ('ndash', '2013'),
496 | b'\x97': ('mdash', '2014'),
497 | b'\x98': ('tilde', '2DC'),
498 | b'\x99': ('trade', '2122'),
499 | b'\x9a': ('scaron', '161'),
500 | b'\x9b': ('rsaquo', '203A'),
501 | b'\x9c': ('oelig', '153'),
502 | b'\x9d': '?',
503 | b'\x9e': ('#x17E', '17E'),
504 | b'\x9f': ('Yuml', ''),}
505 |
506 | # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
507 | # horrors like stripping diacritical marks to turn á into a, but also
508 | # contains non-horrors like turning “ into ".
509 | MS_CHARS_TO_ASCII = {
510 | b'\x80' : 'EUR',
511 | b'\x81' : ' ',
512 | b'\x82' : ',',
513 | b'\x83' : 'f',
514 | b'\x84' : ',,',
515 | b'\x85' : '...',
516 | b'\x86' : '+',
517 | b'\x87' : '++',
518 | b'\x88' : '^',
519 | b'\x89' : '%',
520 | b'\x8a' : 'S',
521 | b'\x8b' : '<',
522 | b'\x8c' : 'OE',
523 | b'\x8d' : '?',
524 | b'\x8e' : 'Z',
525 | b'\x8f' : '?',
526 | b'\x90' : '?',
527 | b'\x91' : "'",
528 | b'\x92' : "'",
529 | b'\x93' : '"',
530 | b'\x94' : '"',
531 | b'\x95' : '*',
532 | b'\x96' : '-',
533 | b'\x97' : '--',
534 | b'\x98' : '~',
535 | b'\x99' : '(TM)',
536 | b'\x9a' : 's',
537 | b'\x9b' : '>',
538 | b'\x9c' : 'oe',
539 | b'\x9d' : '?',
540 | b'\x9e' : 'z',
541 | b'\x9f' : 'Y',
542 | b'\xa0' : ' ',
543 | b'\xa1' : '!',
544 | b'\xa2' : 'c',
545 | b'\xa3' : 'GBP',
546 | b'\xa4' : '$', #This approximation is especially parochial--this is the
547 | #generic currency symbol.
548 | b'\xa5' : 'YEN',
549 | b'\xa6' : '|',
550 | b'\xa7' : 'S',
551 | b'\xa8' : '..',
552 | b'\xa9' : '',
553 | b'\xaa' : '(th)',
554 | b'\xab' : '<<',
555 | b'\xac' : '!',
556 | b'\xad' : ' ',
557 | b'\xae' : '(R)',
558 | b'\xaf' : '-',
559 | b'\xb0' : 'o',
560 | b'\xb1' : '+-',
561 | b'\xb2' : '2',
562 | b'\xb3' : '3',
563 | b'\xb4' : ("'", 'acute'),
564 | b'\xb5' : 'u',
565 | b'\xb6' : 'P',
566 | b'\xb7' : '*',
567 | b'\xb8' : ',',
568 | b'\xb9' : '1',
569 | b'\xba' : '(th)',
570 | b'\xbb' : '>>',
571 | b'\xbc' : '1/4',
572 | b'\xbd' : '1/2',
573 | b'\xbe' : '3/4',
574 | b'\xbf' : '?',
575 | b'\xc0' : 'A',
576 | b'\xc1' : 'A',
577 | b'\xc2' : 'A',
578 | b'\xc3' : 'A',
579 | b'\xc4' : 'A',
580 | b'\xc5' : 'A',
581 | b'\xc6' : 'AE',
582 | b'\xc7' : 'C',
583 | b'\xc8' : 'E',
584 | b'\xc9' : 'E',
585 | b'\xca' : 'E',
586 | b'\xcb' : 'E',
587 | b'\xcc' : 'I',
588 | b'\xcd' : 'I',
589 | b'\xce' : 'I',
590 | b'\xcf' : 'I',
591 | b'\xd0' : 'D',
592 | b'\xd1' : 'N',
593 | b'\xd2' : 'O',
594 | b'\xd3' : 'O',
595 | b'\xd4' : 'O',
596 | b'\xd5' : 'O',
597 | b'\xd6' : 'O',
598 | b'\xd7' : '*',
599 | b'\xd8' : 'O',
600 | b'\xd9' : 'U',
601 | b'\xda' : 'U',
602 | b'\xdb' : 'U',
603 | b'\xdc' : 'U',
604 | b'\xdd' : 'Y',
605 | b'\xde' : 'b',
606 | b'\xdf' : 'B',
607 | b'\xe0' : 'a',
608 | b'\xe1' : 'a',
609 | b'\xe2' : 'a',
610 | b'\xe3' : 'a',
611 | b'\xe4' : 'a',
612 | b'\xe5' : 'a',
613 | b'\xe6' : 'ae',
614 | b'\xe7' : 'c',
615 | b'\xe8' : 'e',
616 | b'\xe9' : 'e',
617 | b'\xea' : 'e',
618 | b'\xeb' : 'e',
619 | b'\xec' : 'i',
620 | b'\xed' : 'i',
621 | b'\xee' : 'i',
622 | b'\xef' : 'i',
623 | b'\xf0' : 'o',
624 | b'\xf1' : 'n',
625 | b'\xf2' : 'o',
626 | b'\xf3' : 'o',
627 | b'\xf4' : 'o',
628 | b'\xf5' : 'o',
629 | b'\xf6' : 'o',
630 | b'\xf7' : '/',
631 | b'\xf8' : 'o',
632 | b'\xf9' : 'u',
633 | b'\xfa' : 'u',
634 | b'\xfb' : 'u',
635 | b'\xfc' : 'u',
636 | b'\xfd' : 'y',
637 | b'\xfe' : 'b',
638 | b'\xff' : 'y',
639 | }
640 |
641 | # A map used when removing rogue Windows-1252/ISO-8859-1
642 | # characters in otherwise UTF-8 documents.
643 | #
644 | # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
645 | # Windows-1252.
646 | WINDOWS_1252_TO_UTF8 = {
647 | 0x80 : b'\xe2\x82\xac', # €
648 | 0x82 : b'\xe2\x80\x9a', # ‚
649 | 0x83 : b'\xc6\x92', # ƒ
650 | 0x84 : b'\xe2\x80\x9e', # „
651 | 0x85 : b'\xe2\x80\xa6', # …
652 | 0x86 : b'\xe2\x80\xa0', # †
653 | 0x87 : b'\xe2\x80\xa1', # ‡
654 | 0x88 : b'\xcb\x86', # ˆ
655 | 0x89 : b'\xe2\x80\xb0', # ‰
656 | 0x8a : b'\xc5\xa0', # Š
657 | 0x8b : b'\xe2\x80\xb9', # ‹
658 | 0x8c : b'\xc5\x92', # Œ
659 | 0x8e : b'\xc5\xbd', # Ž
660 | 0x91 : b'\xe2\x80\x98', # ‘
661 | 0x92 : b'\xe2\x80\x99', # ’
662 | 0x93 : b'\xe2\x80\x9c', # “
663 | 0x94 : b'\xe2\x80\x9d', # ”
664 | 0x95 : b'\xe2\x80\xa2', # •
665 | 0x96 : b'\xe2\x80\x93', # –
666 | 0x97 : b'\xe2\x80\x94', # —
667 | 0x98 : b'\xcb\x9c', # ˜
668 | 0x99 : b'\xe2\x84\xa2', # ™
669 | 0x9a : b'\xc5\xa1', # š
670 | 0x9b : b'\xe2\x80\xba', # ›
671 | 0x9c : b'\xc5\x93', # œ
672 | 0x9e : b'\xc5\xbe', # ž
673 | 0x9f : b'\xc5\xb8', # Ÿ
674 | 0xa0 : b'\xc2\xa0', #
675 | 0xa1 : b'\xc2\xa1', # ¡
676 | 0xa2 : b'\xc2\xa2', # ¢
677 | 0xa3 : b'\xc2\xa3', # £
678 | 0xa4 : b'\xc2\xa4', # ¤
679 | 0xa5 : b'\xc2\xa5', # ¥
680 | 0xa6 : b'\xc2\xa6', # ¦
681 | 0xa7 : b'\xc2\xa7', # §
682 | 0xa8 : b'\xc2\xa8', # ¨
683 | 0xa9 : b'\xc2\xa9', # ©
684 | 0xaa : b'\xc2\xaa', # ª
685 | 0xab : b'\xc2\xab', # «
686 | 0xac : b'\xc2\xac', # ¬
687 | 0xad : b'\xc2\xad', #
688 | 0xae : b'\xc2\xae', # ®
689 | 0xaf : b'\xc2\xaf', # ¯
690 | 0xb0 : b'\xc2\xb0', # °
691 | 0xb1 : b'\xc2\xb1', # ±
692 | 0xb2 : b'\xc2\xb2', # ²
693 | 0xb3 : b'\xc2\xb3', # ³
694 | 0xb4 : b'\xc2\xb4', # ´
695 | 0xb5 : b'\xc2\xb5', # µ
696 | 0xb6 : b'\xc2\xb6', # ¶
697 | 0xb7 : b'\xc2\xb7', # ·
698 | 0xb8 : b'\xc2\xb8', # ¸
699 | 0xb9 : b'\xc2\xb9', # ¹
700 | 0xba : b'\xc2\xba', # º
701 | 0xbb : b'\xc2\xbb', # »
702 | 0xbc : b'\xc2\xbc', # ¼
703 | 0xbd : b'\xc2\xbd', # ½
704 | 0xbe : b'\xc2\xbe', # ¾
705 | 0xbf : b'\xc2\xbf', # ¿
706 | 0xc0 : b'\xc3\x80', # À
707 | 0xc1 : b'\xc3\x81', # Á
708 | 0xc2 : b'\xc3\x82', # Â
709 | 0xc3 : b'\xc3\x83', # Ã
710 | 0xc4 : b'\xc3\x84', # Ä
711 | 0xc5 : b'\xc3\x85', # Å
712 | 0xc6 : b'\xc3\x86', # Æ
713 | 0xc7 : b'\xc3\x87', # Ç
714 | 0xc8 : b'\xc3\x88', # È
715 | 0xc9 : b'\xc3\x89', # É
716 | 0xca : b'\xc3\x8a', # Ê
717 | 0xcb : b'\xc3\x8b', # Ë
718 | 0xcc : b'\xc3\x8c', # Ì
719 | 0xcd : b'\xc3\x8d', # Í
720 | 0xce : b'\xc3\x8e', # Î
721 | 0xcf : b'\xc3\x8f', # Ï
722 | 0xd0 : b'\xc3\x90', # Ð
723 | 0xd1 : b'\xc3\x91', # Ñ
724 | 0xd2 : b'\xc3\x92', # Ò
725 | 0xd3 : b'\xc3\x93', # Ó
726 | 0xd4 : b'\xc3\x94', # Ô
727 | 0xd5 : b'\xc3\x95', # Õ
728 | 0xd6 : b'\xc3\x96', # Ö
729 | 0xd7 : b'\xc3\x97', # ×
730 | 0xd8 : b'\xc3\x98', # Ø
731 | 0xd9 : b'\xc3\x99', # Ù
732 | 0xda : b'\xc3\x9a', # Ú
733 | 0xdb : b'\xc3\x9b', # Û
734 | 0xdc : b'\xc3\x9c', # Ü
735 | 0xdd : b'\xc3\x9d', # Ý
736 | 0xde : b'\xc3\x9e', # Þ
737 | 0xdf : b'\xc3\x9f', # ß
738 | 0xe0 : b'\xc3\xa0', # à
739 | 0xe1 : b'\xc3\xa1', # á
740 | 0xe2 : b'\xc3\xa2', # â
741 | 0xe3 : b'\xc3\xa3', # ã
742 | 0xe4 : b'\xc3\xa4', # ä
743 | 0xe5 : b'\xc3\xa5', # å
744 | 0xe6 : b'\xc3\xa6', # æ
745 | 0xe7 : b'\xc3\xa7', # ç
746 | 0xe8 : b'\xc3\xa8', # è
747 | 0xe9 : b'\xc3\xa9', # é
748 | 0xea : b'\xc3\xaa', # ê
749 | 0xeb : b'\xc3\xab', # ë
750 | 0xec : b'\xc3\xac', # ì
751 | 0xed : b'\xc3\xad', # í
752 | 0xee : b'\xc3\xae', # î
753 | 0xef : b'\xc3\xaf', # ï
754 | 0xf0 : b'\xc3\xb0', # ð
755 | 0xf1 : b'\xc3\xb1', # ñ
756 | 0xf2 : b'\xc3\xb2', # ò
757 | 0xf3 : b'\xc3\xb3', # ó
758 | 0xf4 : b'\xc3\xb4', # ô
759 | 0xf5 : b'\xc3\xb5', # õ
760 | 0xf6 : b'\xc3\xb6', # ö
761 | 0xf7 : b'\xc3\xb7', # ÷
762 | 0xf8 : b'\xc3\xb8', # ø
763 | 0xf9 : b'\xc3\xb9', # ù
764 | 0xfa : b'\xc3\xba', # ú
765 | 0xfb : b'\xc3\xbb', # û
766 | 0xfc : b'\xc3\xbc', # ü
767 | 0xfd : b'\xc3\xbd', # ý
768 | 0xfe : b'\xc3\xbe', # þ
769 | }
770 |
771 | MULTIBYTE_MARKERS_AND_SIZES = [
772 | (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
773 | (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
774 | (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
775 | ]
776 |
777 | FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
778 | LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
779 |
780 | @classmethod
781 | def detwingle(cls, in_bytes, main_encoding="utf8",
782 | embedded_encoding="windows-1252"):
783 | """Fix characters from one encoding embedded in some other encoding.
784 |
785 | Currently the only situation supported is Windows-1252 (or its
786 | subset ISO-8859-1), embedded in UTF-8.
787 |
788 | The input must be a bytestring. If you've already converted
789 | the document to Unicode, you're too late.
790 |
791 | The output is a bytestring in which `embedded_encoding`
792 | characters have been converted to their `main_encoding`
793 | equivalents.
794 | """
795 | if embedded_encoding.replace('_', '-').lower() not in (
796 | 'windows-1252', 'windows_1252'):
797 | raise NotImplementedError(
798 | "Windows-1252 and ISO-8859-1 are the only currently supported "
799 | "embedded encodings.")
800 |
801 | if main_encoding.lower() not in ('utf8', 'utf-8'):
802 | raise NotImplementedError(
803 | "UTF-8 is the only currently supported main encoding.")
804 |
805 | byte_chunks = []
806 |
807 | chunk_start = 0
808 | pos = 0
809 | while pos < len(in_bytes):
810 | byte = in_bytes[pos]
811 | if not isinstance(byte, int):
812 | # Python 2.x
813 | byte = ord(byte)
814 | if (byte >= cls.FIRST_MULTIBYTE_MARKER
815 | and byte <= cls.LAST_MULTIBYTE_MARKER):
816 | # This is the start of a UTF-8 multibyte character. Skip
817 | # to the end.
818 | for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
819 | if byte >= start and byte <= end:
820 | pos += size
821 | break
822 | elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
823 | # We found a Windows-1252 character!
824 | # Save the string up to this point as a chunk.
825 | byte_chunks.append(in_bytes[chunk_start:pos])
826 |
827 | # Now translate the Windows-1252 character into UTF-8
828 | # and add it as another, one-byte chunk.
829 | byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
830 | pos += 1
831 | chunk_start = pos
832 | else:
833 | # Go on to the next character.
834 | pos += 1
835 | if chunk_start == 0:
836 | # The string is unchanged.
837 | return in_bytes
838 | else:
839 | # Store the final chunk.
840 | byte_chunks.append(in_bytes[chunk_start:])
841 | return b''.join(byte_chunks)
842 |
843 |
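# --- Editor's note: illustrative usage sketch, not part of the vendored file. ---
#
#     from lib.bs4 import UnicodeDammit
#     dammit = UnicodeDammit(b"Sacr\xe9 bleu!")
#     print dammit.unicode_markup      # u'Sacré bleu!' (if detection succeeds)
#     print dammit.original_encoding   # whichever candidate encoding worked
#
# detwingle() repairs Windows-1252 bytes pasted into otherwise-UTF-8 data
# *before* it is decoded:
#
#     utf8_part = u"\N{SNOWMAN}".encode("utf8")
#     cp1252_part = u"\N{LEFT DOUBLE QUOTATION MARK}Hi!".encode("windows_1252")
#     UnicodeDammit.detwingle(utf8_part + cp1252_part).decode("utf8")  # decodes cleanly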
--------------------------------------------------------------------------------
/weeman/lib/bs4/diagnose.py:
--------------------------------------------------------------------------------
1 | """Diagnostic functions, mainly for use when doing tech support."""
2 |
3 | # Use of this source code is governed by a BSD-style license that can be
4 | # found in the LICENSE file.
5 | __license__ = "MIT"
6 |
7 | import cProfile
8 | from StringIO import StringIO
9 | from HTMLParser import HTMLParser
10 | import lib.bs4
11 | from lib.bs4 import BeautifulSoup, __version__
12 | from lib.bs4.builder import builder_registry
13 |
14 | import os
15 | import pstats
16 | import random
17 | import tempfile
18 | import time
19 | import traceback
20 | import sys
21 | import cProfile
22 |
23 | def diagnose(data):
24 | """Diagnostic suite for isolating common problems."""
25 | print "Diagnostic running on Beautiful Soup %s" % __version__
26 | print "Python version %s" % sys.version
27 |
28 | basic_parsers = ["html.parser", "html5lib", "lxml"]
29 | for name in basic_parsers:
30 | for builder in builder_registry.builders:
31 | if name in builder.features:
32 | break
33 | else:
34 | basic_parsers.remove(name)
35 | print (
36 | "I noticed that %s is not installed. Installing it may help." %
37 | name)
38 |
39 | if 'lxml' in basic_parsers:
40 | basic_parsers.append(["lxml", "xml"])
41 | try:
42 | from lxml import etree
43 | print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
44 | except ImportError, e:
45 | print (
46 | "lxml is not installed or couldn't be imported.")
47 |
48 |
49 | if 'html5lib' in basic_parsers:
50 | try:
51 | import html5lib
52 | print "Found html5lib version %s" % html5lib.__version__
53 | except ImportError, e:
54 | print (
55 | "html5lib is not installed or couldn't be imported.")
56 |
57 | if hasattr(data, 'read'):
58 | data = data.read()
59 | elif os.path.exists(data):
60 | print '"%s" looks like a filename. Reading data from the file.' % data
61 | with open(data) as fp:
62 | data = fp.read()
63 | elif data.startswith("http:") or data.startswith("https:"):
64 | print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
65 | print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
66 | return
67 | print
68 |
69 | for parser in basic_parsers:
70 | print "Trying to parse your markup with %s" % parser
71 | success = False
72 | try:
73 | soup = BeautifulSoup(data, parser)
74 | success = True
75 | except Exception, e:
76 | print "%s could not parse the markup." % parser
77 | traceback.print_exc()
78 | if success:
79 | print "Here's what %s did with the markup:" % parser
80 | print soup.prettify()
81 |
82 | print "-" * 80
83 |
84 | def lxml_trace(data, html=True, **kwargs):
85 | """Print out the lxml events that occur during parsing.
86 |
87 | This lets you see how lxml parses a document when no Beautiful
88 | Soup code is running.
89 | """
90 | from lxml import etree
91 | for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
92 | print("%s, %4s, %s" % (event, element.tag, element.text))
93 |
94 | class AnnouncingParser(HTMLParser):
95 | """Announces HTMLParser parse events, without doing anything else."""
96 |
97 | def _p(self, s):
98 | print(s)
99 |
100 | def handle_starttag(self, name, attrs):
101 | self._p("%s START" % name)
102 |
103 | def handle_endtag(self, name):
104 | self._p("%s END" % name)
105 |
106 | def handle_data(self, data):
107 | self._p("%s DATA" % data)
108 |
109 | def handle_charref(self, name):
110 | self._p("%s CHARREF" % name)
111 |
112 | def handle_entityref(self, name):
113 | self._p("%s ENTITYREF" % name)
114 |
115 | def handle_comment(self, data):
116 | self._p("%s COMMENT" % data)
117 |
118 | def handle_decl(self, data):
119 | self._p("%s DECL" % data)
120 |
121 | def unknown_decl(self, data):
122 | self._p("%s UNKNOWN-DECL" % data)
123 |
124 | def handle_pi(self, data):
125 | self._p("%s PI" % data)
126 |
127 | def htmlparser_trace(data):
128 | """Print out the HTMLParser events that occur during parsing.
129 |
130 | This lets you see how HTMLParser parses a document when no
131 | Beautiful Soup code is running.
132 | """
133 | parser = AnnouncingParser()
134 | parser.feed(data)
135 |
136 | _vowels = "aeiou"
137 | _consonants = "bcdfghjklmnpqrstvwxyz"
138 |
139 | def rword(length=5):
140 | "Generate a random word-like string."
141 | s = ''
142 | for i in range(length):
143 | if i % 2 == 0:
144 | t = _consonants
145 | else:
146 | t = _vowels
147 | s += random.choice(t)
148 | return s
149 |
150 | def rsentence(length=4):
151 | "Generate a random sentence-like string."
152 | return " ".join(rword(random.randint(4,9)) for i in range(length))
153 |
154 | def rdoc(num_elements=1000):
155 | """Randomly generate an invalid HTML document."""
156 | tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
157 | elements = []
158 | for i in range(num_elements):
159 | choice = random.randint(0,3)
160 | if choice == 0:
161 | # New tag.
162 | tag_name = random.choice(tag_names)
163 | elements.append("<%s>" % tag_name)
164 | elif choice == 1:
165 | elements.append(rsentence(random.randint(1,4)))
166 | elif choice == 2:
167 | # Close a tag.
168 | tag_name = random.choice(tag_names)
169 | elements.append("</%s>" % tag_name)
170 | return "<html>" + "\n".join(elements) + "</html>"
171 |
172 | def benchmark_parsers(num_elements=100000):
173 | """Very basic head-to-head performance benchmark."""
174 | print "Comparative parser benchmark on Beautiful Soup %s" % __version__
175 | data = rdoc(num_elements)
176 | print "Generated a large invalid HTML document (%d bytes)." % len(data)
177 |
178 | for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
179 | success = False
180 | try:
181 | a = time.time()
182 | soup = BeautifulSoup(data, parser)
183 | b = time.time()
184 | success = True
185 | except Exception, e:
186 | print "%s could not parse the markup." % parser
187 | traceback.print_exc()
188 | if success:
189 | print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
190 |
191 | from lxml import etree
192 | a = time.time()
193 | etree.HTML(data)
194 | b = time.time()
195 | print "Raw lxml parsed the markup in %.2fs." % (b-a)
196 |
197 | import html5lib
198 | parser = html5lib.HTMLParser()
199 | a = time.time()
200 | parser.parse(data)
201 | b = time.time()
202 | print "Raw html5lib parsed the markup in %.2fs." % (b-a)
203 |
204 | def profile(num_elements=100000, parser="lxml"):
205 |
206 | filehandle = tempfile.NamedTemporaryFile()
207 | filename = filehandle.name
208 |
209 | data = rdoc(num_elements)
210 | vars = dict(bs4=lib.bs4, data=data, parser=parser)
211 | cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
212 |
213 | stats = pstats.Stats(filename)
214 | # stats.strip_dirs()
215 | stats.sort_stats("cumulative")
216 | stats.print_stats('_html5lib|bs4', 50)
217 |
218 | if __name__ == '__main__':
219 | diagnose(sys.stdin.read())
220 |
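# --- Editor's note: illustrative usage sketch, not part of the vendored file. ---
#
#     from lib.bs4.diagnose import diagnose, benchmark_parsers
#     diagnose("<p>Some markup")  # reports installed parsers and what each produced
#     benchmark_parsers(10000)    # rough relative timings on a random invalid document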
--------------------------------------------------------------------------------
/weeman/lib/bs4/testing.py:
--------------------------------------------------------------------------------
1 | """Helper classes for tests."""
2 |
3 | # Use of this source code is governed by a BSD-style license that can be
4 | # found in the LICENSE file.
5 | __license__ = "MIT"
6 |
7 | import pickle
8 | import copy
9 | import functools
10 | import unittest
11 | from unittest import TestCase
12 | from lib.bs4 import BeautifulSoup
13 | from lib.bs4.element import (
14 | CharsetMetaAttributeValue,
15 | Comment,
16 | ContentMetaAttributeValue,
17 | Doctype,
18 | SoupStrainer,
19 | )
20 |
21 | from lib.bs4.builder import HTMLParserTreeBuilder
22 | default_builder = HTMLParserTreeBuilder
23 |
24 |
25 | class SoupTest(unittest.TestCase):
26 |
27 | @property
28 | def default_builder(self):
29 | return default_builder()
30 |
31 | def soup(self, markup, **kwargs):
32 | """Build a Beautiful Soup object from markup."""
33 | builder = kwargs.pop('builder', self.default_builder)
34 | return BeautifulSoup(markup, builder=builder, **kwargs)
35 |
36 | def document_for(self, markup):
37 | """Turn an HTML fragment into a document.
38 |
39 | The details depend on the builder.
40 | """
41 | return self.default_builder.test_fragment_to_document(markup)
42 |
43 | def assertSoupEquals(self, to_parse, compare_parsed_to=None):
44 | builder = self.default_builder
45 | obj = BeautifulSoup(to_parse, builder=builder)
46 | if compare_parsed_to is None:
47 | compare_parsed_to = to_parse
48 |
49 | self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
50 |
51 | def assertConnectedness(self, element):
52 | """Ensure that next_element and previous_element are properly
53 | set for all descendants of the given element.
54 | """
55 | earlier = None
56 | for e in element.descendants:
57 | if earlier:
58 | self.assertEqual(e, earlier.next_element)
59 | self.assertEqual(earlier, e.previous_element)
60 | earlier = e
61 |
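# --- Editor's note: illustrative usage sketch, not part of the vendored file. ---
# A concrete test case combines SoupTest's helpers with the smoke-test mixin
# defined just below and picks its builder (the class name here is hypothetical):
#
#     class HTMLParserSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
#         @property
#         def default_builder(self):
#             return HTMLParserTreeBuilder()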
62 | class HTMLTreeBuilderSmokeTest(object):
63 |
64 | """A basic test of a treebuilder's competence.
65 |
66 | Any HTML treebuilder, present or future, should be able to pass
67 | these tests. With invalid markup, there's room for interpretation,
68 | and different parsers can handle it differently. But with the
69 | markup in these tests, there's not much room for interpretation.
70 | """
71 |
72 | def test_pickle_and_unpickle_identity(self):
73 | # Pickling a tree, then unpickling it, yields a tree identical
74 | # to the original.
75 | tree = self.soup("foo")
76 | dumped = pickle.dumps(tree, 2)
77 | loaded = pickle.loads(dumped)
78 | self.assertEqual(loaded.__class__, BeautifulSoup)
79 | self.assertEqual(loaded.decode(), tree.decode())
80 |
81 | def assertDoctypeHandled(self, doctype_fragment):
82 | """Assert that a given doctype string is handled correctly."""
83 | doctype_str, soup = self._document_with_doctype(doctype_fragment)
84 |
85 | # Make sure a Doctype object was created.
86 | doctype = soup.contents[0]
87 | self.assertEqual(doctype.__class__, Doctype)
88 | self.assertEqual(doctype, doctype_fragment)
89 | self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
90 |
91 | # Make sure that the doctype was correctly associated with the
92 | # parse tree and that the rest of the document parsed.
93 | self.assertEqual(soup.p.contents[0], 'foo')
94 |
95 | def _document_with_doctype(self, doctype_fragment):
96 | """Generate and parse a document with the given doctype."""
97 | doctype = '<!DOCTYPE %s>' % doctype_fragment
98 | markup = doctype + '\n<p>foo</p>'
99 | soup = self.soup(markup)
100 | return doctype, soup
101 |
102 | def test_normal_doctypes(self):
103 | """Make sure normal, everyday HTML doctypes are handled correctly."""
104 | self.assertDoctypeHandled("html")
105 | self.assertDoctypeHandled(
106 | 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
107 |
108 | def test_empty_doctype(self):
109 | soup = self.soup("")
110 | doctype = soup.contents[0]
111 | self.assertEqual("", doctype.strip())
112 |
113 | def test_public_doctype_with_url(self):
114 | doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
115 | self.assertDoctypeHandled(doctype)
116 |
117 | def test_system_doctype(self):
118 | self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
119 |
120 | def test_namespaced_system_doctype(self):
121 | # We can handle a namespaced doctype with a system ID.
122 | self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
123 |
124 | def test_namespaced_public_doctype(self):
125 | # Test a namespaced doctype with a public id.
126 | self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
127 |
128 | def test_real_xhtml_document(self):
129 | """A real XHTML document should come out more or less the same as it went in."""
130 | markup = b"""
131 |
132 |
133 | Hello.
134 | Goodbye.
135 | """
136 | soup = self.soup(markup)
137 | self.assertEqual(
138 | soup.encode("utf-8").replace(b"\n", b""),
139 | markup.replace(b"\n", b""))
140 |
141 | def test_processing_instruction(self):
142 | # We test both Unicode and bytestring to verify that
143 | # process_markup correctly sets processing_instruction_class
144 | # even when the markup is already Unicode and there is no
145 | # need to process anything.
146 | markup = u""""""
147 | soup = self.soup(markup)
148 | self.assertEqual(markup, soup.decode())
149 |
150 | markup = b""""""
151 | soup = self.soup(markup)
152 | self.assertEqual(markup, soup.encode("utf8"))
153 |
154 | def test_deepcopy(self):
155 | """Make sure you can copy the tree builder.
156 |
157 | This is important because the builder is part of a
158 | BeautifulSoup object, and we want to be able to copy that.
159 | """
160 | copy.deepcopy(self.default_builder)
161 |
162 | def test_p_tag_is_never_empty_element(self):
163 | """A tag is never designated as an empty-element tag.
164 |
165 | Even if the markup shows it as an empty-element tag, it
166 | shouldn't be presented that way.
167 | """
168 | soup = self.soup("
")
169 | self.assertFalse(soup.p.is_empty_element)
170 | self.assertEqual(str(soup.p), "")
171 |
172 | def test_unclosed_tags_get_closed(self):
173 | """A tag that's not closed by the end of the document should be closed.
174 |
175 | This applies to all tags except empty-element tags.
176 | """
177 | self.assertSoupEquals("", "
")
178 | self.assertSoupEquals("", "")
179 |
180 | self.assertSoupEquals("
", "
")
181 |
182 | def test_br_is_always_empty_element_tag(self):
183 | """A
tag is designated as an empty-element tag.
184 |
185 | Some parsers treat
as one
tag, some parsers as
186 | two tags, but it should always be an empty-element tag.
187 | """
188 | soup = self.soup("
")
189 | self.assertTrue(soup.br.is_empty_element)
190 | self.assertEqual(str(soup.br), "
")
191 |
192 | def test_nested_formatting_elements(self):
193 | self.assertSoupEquals("")
194 |
195 | def test_double_head(self):
196 | html = '''<!DOCTYPE html>
197 | <html>
198 | <head>
199 | <title>Ordinary HEAD element test</title>
200 | </head>
201 | <script type="text/javascript">
202 | alert("Help!")
203 | </script>
204 | <body>
205 | Hello, world!
206 | </body>
207 | </html>
208 | '''
209 | soup = self.soup(html)
210 | self.assertEqual("text/javascript", soup.find('script')['type'])
211 |
212 | def test_comment(self):
213 | # Comments are represented as Comment objects.
214 | markup = "foobaz
"
215 | self.assertSoupEquals(markup)
216 |
217 | soup = self.soup(markup)
218 | comment = soup.find(text="foobar")
219 | self.assertEqual(comment.__class__, Comment)
220 |
221 | # The comment is properly integrated into the tree.
222 | foo = soup.find(text="foo")
223 | self.assertEqual(comment, foo.next_element)
224 | baz = soup.find(text="baz")
225 | self.assertEqual(comment, baz.previous_element)
226 |
227 | def test_preserved_whitespace_in_pre_and_textarea(self):
228 | """Whitespace must be preserved in and