├── BeautifulSoup.py
├── README.md
├── __init__.py
└── google.py
/BeautifulSoup.py:
--------------------------------------------------------------------------------
1 | """Beautiful Soup
2 | Elixir and Tonic
3 | "The Screen-Scraper's Friend"
4 | http://www.crummy.com/software/BeautifulSoup/
5 |
6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 | tree representation. It provides methods and Pythonic idioms that make
8 | it easy to navigate, search, and modify the tree.
9 |
10 | A well-formed XML/HTML document yields a well-formed data
11 | structure. An ill-formed XML/HTML document yields a correspondingly
12 | ill-formed data structure. If your document is only locally
13 | well-formed, you can use this library to find and process the
14 | well-formed part of it.
15 |
16 | Beautiful Soup works with Python 2.2 and up. It has no external
17 | dependencies, but you'll have more success at converting data to UTF-8
18 | if you also install these three packages:
19 |
20 | * chardet, for auto-detecting character encodings
21 | http://chardet.feedparser.org/
22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 | by stock Python.
24 | http://cjkpython.i18n.org/
25 |
26 | Beautiful Soup defines classes for two main parsing strategies:
27 |
28 | * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 | language that kind of looks like XML.
30 |
31 | * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 | or invalid. This class has web browser-like heuristics for
33 | obtaining a sensible parse tree in the face of common HTML errors.
34 |
35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 | the encoding of an HTML or XML document, and converting it to
37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38 |
39 | For more than you ever wanted to know about Beautiful Soup, see the
40 | documentation:
41 | http://www.crummy.com/software/BeautifulSoup/documentation.html
42 |
43 | Here, have some legalese:
44 |
45 | Copyright (c) 2004-2010, Leonard Richardson
46 |
47 | All rights reserved.
48 |
49 | Redistribution and use in source and binary forms, with or without
50 | modification, are permitted provided that the following conditions are
51 | met:
52 |
53 | * Redistributions of source code must retain the above copyright
54 | notice, this list of conditions and the following disclaimer.
55 |
56 | * Redistributions in binary form must reproduce the above
57 | copyright notice, this list of conditions and the following
58 | disclaimer in the documentation and/or other materials provided
59 | with the distribution.
60 |
61 | * Neither the name of the Beautiful Soup Consortium and All
62 | Night Kosher Bakery nor the names of its contributors may be
63 | used to endorse or promote products derived from this software
64 | without specific prior written permission.
65 |
66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77 |
78 | """
79 | from __future__ import generators
80 |
81 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 | __version__ = "3.2.0"
83 | __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
84 | __license__ = "New-style BSD"
85 |
86 | from sgmllib import SGMLParser, SGMLParseError
87 | import codecs
88 | import markupbase
89 | import types
90 | import re
91 | import sgmllib
92 | try:
93 | from htmlentitydefs import name2codepoint
94 | except ImportError:
95 | name2codepoint = {}
96 | try:
97 | set
98 | except NameError:
99 | from sets import Set as set
100 |
# These hacks make Beautiful Soup able to parse XML with namespaces.
# Both patterns accept ':' (as well as '-', '_' and '.') after the first
# letter of a name, so a prefixed name such as "ns:tag" is matched as a
# single name. They replace module-level attributes of sgmllib and
# markupbase, so the change applies to everything in this process that uses
# those modules.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Default value of the `encoding` parameter of the __str__/rendering methods
# defined below, and the encoding NavigableString uses to decode byte strings.
DEFAULT_OUTPUT_ENCODING = "utf-8"
106 |
107 | def _match_css_class(str):
108 | """Build a RE to match the given CSS class."""
109 | return re.compile(r"(^|.*\s)%s($|\s)" % str)
110 |
111 | # First, the classes that represent markup elements.
112 |
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text).

    Each element is linked into two chains: a document-order chain
    (``previous`` / ``next``) and a sibling chain (``previousSibling`` /
    ``nextSibling``) below a common ``parent``.

    Several methods here (insert, append, replaceWith, ...) use
    ``self.contents`` and ``self.index()``, which this class does not
    define itself; they are expected to be supplied by the container
    subclass.
    """

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements.

        `parent` is the containing element and `previous` the element
        that precedes this one in document order. If the parent already
        has children, this element is linked as the next sibling of the
        parent's current last child.
        """
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Removes this element from its parent and puts `replaceWith`
        at the position this element occupied.

        `replaceWith` may be one of this element's own siblings; insert()
        accounts for the index shift caused by moving it, so no
        adjustment is made here.
        """
        oldParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Removes this element from its parent and inserts its children
        at the position it occupied, keeping their order."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        # Insert back-to-front at a fixed index so the children end up in
        # their original order. Iterate over a copy: insert() extracts
        # each child from self.contents as it goes.
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree.

        Returns this element, detached from its parent, its siblings and
        the surrounding document-order chain. Links among the element's
        own descendants are left untouched.
        """
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                # The parent does not list this element; nothing to delete.
                pass

        # Find the two elements that would be next to each other if
        # this element (and any children) hadn't been parsed. Connect
        # the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts `newChild` into this element's contents at `position`.

        A plain string is wrapped in a NavigableString first. A position
        past the end appends. If `newChild` already has a parent it is
        extracted from it before being inserted here.
        """
        if isinstance(newChild, basestring) \
                and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index < position:
                    # We're moving it further down the list of this
                    # object's children. Extracting it shifts every
                    # later child up by one, so the target index
                    # drops by one as well.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # The element following the new child in document order is
            # the next sibling of the closest ancestor that has one.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria, or None if no parent matches."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        # Forward the keyword criteria too; otherwise a call such as
        # findParent(id="x") would match on name/attrs alone.
        l = self.findParents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Calls the given find-all style `method` with a limit of 1 and
        returns its first result, or None if there is none."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        """Iterates over a generator looking for things that match.

        `generator` is a callable returning an iterable of candidate
        elements. `name` may be a ready-made SoupStrainer; otherwise one
        is built from `name`, `attrs`, `text` and the keyword arguments.
        Collection stops once `limit` results have been found (a false
        `limit` means no limit).
        """

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        for i in generator():
            # The navigation generators end with a trailing None; skip
            # it (and any other false element).
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    #Each one follows a single link from this element and yields every
    #element reached, followed by a final None when the chain ends.
    def nextGenerator(self):
        "Yields the elements after this one in document order."
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        "Yields the siblings that follow this element."
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        "Yields the elements before this one, walking backwards."
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        "Yields the siblings that precede this element, nearest first."
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        "Yields the ancestors of this element, nearest first."
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Returns `str` with every "%SOUP-ENCODING%" placeholder replaced
        by `encoding` ("utf-8" when no encoding is given)."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to Unicode.

        With an encoding, string values are encoded directly and any
        other object is converted with str() first. Without an encoding,
        a unicode value is returned unchanged and anything else is
        converted with unicode().
        """
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s
423 |
class NavigableString(unicode, PageElement):
    """A unicode string that is also a navigable element of the tree."""

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if not isinstance(value, unicode):
            # Byte string: decode it using the default output encoding.
            return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
        return unicode.__new__(cls, value)

    def __getnewargs__(self):
        # Pickle support: hand __new__ the text encoded in
        # DEFAULT_OUTPUT_ENCODING, which is what it decodes with.
        encoded = NavigableString.__str__(self)
        return (encoded,)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr != 'string':
            raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__.__name__, attr))
        return self

    def __unicode__(self):
        # Render through __str__ (which subclasses override) and decode
        # the result back to unicode.
        rendered = str(self)
        return rendered.decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the text encoded in `encoding`, or the string itself
        when `encoding` is false."""
        if not encoding:
            return self
        return self.encode(encoding)
458 |
class CData(NavigableString):
    """A CDATA section; renders its text inside a CDATA wrapper."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the text, rendered by NavigableString.__str__ with
        `encoding`, wrapped in "<![CDATA[" ... "]]>".

        The format string needs a %s placeholder: with an empty format
        string the % operation raises TypeError for every value.
        """
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
463 |
class ProcessingInstruction(NavigableString):
    """A processing instruction; renders its text between "<?" and "?>"."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the instruction wrapped in "<?" ... "?>".

        A "%SOUP-ENCODING%" placeholder in the text is replaced by
        `encoding` before the text is converted with toEncoding().
        """
        output = self
        if "%SOUP-ENCODING%" in output:
            output = self.substituteEncoding(output, encoding)
        # The opening "<?" must accompany the closing "?>".
        return "<?%s?>" % self.toEncoding(output, encoding)
470 |
class Comment(NavigableString):
    """A markup comment; renders its text inside comment delimiters."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the text, rendered by NavigableString.__str__ with
        `encoding`, wrapped in "<!--" ... "-->".

        The format string needs a %s placeholder: with an empty format
        string the % operation raises TypeError for every value.
        """
        return "<!--%s-->" % NavigableString.__str__(self, encoding)
474 |
class Declaration(NavigableString):
    """A markup declaration; renders its text between "<!" and ">"."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Return the text, rendered by NavigableString.__str__ with
        `encoding`, wrapped in "<!" ... ">".

        The format string needs a %s placeholder: with an empty format
        string the % operation raises TypeError for every value.
        """
        return "<!%s>" % NavigableString.__str__(self, encoding)
478 |
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents.

    Attributes are kept in ``attrs`` as a list of (name, value) pairs, so
    order and duplicates survive; a dict view is built on demand by
    _getAttrMap(). Children are kept in the ``contents`` list.
    """

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k, v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = {"apos": "'",
                                     "quot": '"',
                                     "amp": "&",
                                     "lt": "<",
                                     "gt": ">"}

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def _convertEntities(self, match):
        """Used in a call to re.sub to replace HTML, XML, and numeric
        entities with the appropriate Unicode characters. If HTML
        entities are being converted, any unrecognized entities are
        escaped."""
        x = match.group(1)
        if self.convertHTMLEntities and x in name2codepoint:
            return unichr(name2codepoint[x])
        elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
            if self.convertXMLEntities:
                return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
            else:
                return u'&%s;' % x
        elif len(x) > 0 and x[0] == '#':
            # Handle numeric entities
            if len(x) > 1 and x[1] == 'x':
                return unichr(int(x[2:], 16))
            else:
                return unichr(int(x[1:]))

        elif self.escapeUnrecognizedEntities:
            # Escape the ampersand itself so the unknown entity is
            # rendered as literal text.
            return u'&amp;%s;' % x
        else:
            return u'&%s;' % x

    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        """Basic constructor.

        `parser` supplies isSelfClosingTag() and the entity-conversion
        flags. `attrs` may be None, a dict, or a list of (name, value)
        pairs. `parent` and `previous` are passed on to setup().
        """

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        entity = r"&(#\d+|#x[0-9a-fA-F]+|\w+);"
        self.attrs = [(k, re.sub(entity, self._convertEntities, val))
                      for k, val in self.attrs]

    def getString(self):
        """Return the single NavigableString child of this tag, or None
        if the tag does not contain exactly one such child."""
        if (len(self.contents) == 1
            and isinstance(self.contents[0], NavigableString)):
            return self.contents[0]

    def setString(self, string):
        """Replace the contents of the tag with a string"""
        self.clear()
        self.append(string)

    string = property(getString, setString)

    def getText(self, separator=u""):
        """Return the stripped text of every NavigableString beneath
        this tag, in document order, joined by `separator`."""
        if not len(self.contents):
            return u""
        # The element after this tag's last descendant marks the end of
        # the walk along the document-order chain.
        stopNode = self._lastRecursiveChild().next
        strings = []
        current = self.contents[0]
        while current is not stopNode:
            if isinstance(current, NavigableString):
                strings.append(current.strip())
            current = current.next
        return separator.join(strings)

    text = property(getText)

    def get(self, key, default=None):
        """Returns the value of the 'key' attribute for the tag, or
        the value given for 'default' if it doesn't have that
        attribute."""
        return self._getAttrMap().get(key, default)

    def clear(self):
        """Extract all children."""
        # Iterate over a copy: extract() removes from self.contents.
        for child in self.contents[:]:
            child.extract()

    def index(self, element):
        """Return the position of `element` in this tag's contents,
        compared by identity. Raises ValueError if it is not a child."""
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("Tag.index: element not in tag")

    def has_key(self, key):
        "Returns whether the tag has an attribute named 'key'."
        return key in self._getAttrMap()

    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        return self._getAttrMap()[key]

    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        return iter(self.contents)

    def __len__(self):
        "The length of a tag is the length of its list of contents."
        return len(self.contents)

    def __contains__(self, x):
        return x in self.contents

    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        return True

    def __setitem__(self, key, value):
        """Setting tag[key] sets the value of the 'key' attribute for the
        tag."""
        self._getAttrMap()
        self.attrMap[key] = value
        found = False
        # Update every existing occurrence of the attribute in place.
        for i in range(0, len(self.attrs)):
            if self.attrs[i][0] == key:
                self.attrs[i] = (key, value)
                found = True
        if not found:
            self.attrs.append((key, value))
        self._getAttrMap()[key] = value

    def __delitem__(self, key):
        "Deleting tag[key] deletes all 'key' attributes for the tag."
        # Bad HTML can define the same attribute multiple times, so drop
        # every occurrence. The list is rebuilt (in place) rather than
        # removed from while iterating, which would skip the entry that
        # follows each removed one.
        self.attrs[:] = [item for item in self.attrs if item[0] != key]
        attrMap = self._getAttrMap()
        if key in attrMap:
            del attrMap[key]

    def __call__(self, *args, **kwargs):
        """Calling a tag like a function is the same as calling its
        findAll() method. Eg. tag('a') returns a list of all the A tags
        found within this tag."""
        return self.findAll(*args, **kwargs)

    def __getattr__(self, tag):
        """Fallback for attributes not found normally: tag.fooTag and
        tag.foo both return self.find('foo'). Names starting with a
        double underscore raise AttributeError instead."""
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))

    def __eq__(self, other):
        """Returns true iff this tag has the same name, the same attributes,
        and the same contents (recursively) as the given tag.

        NOTE: right now this will return false if two tags have the
        same attributes in a different order. Should this be fixed?"""
        if other is self:
            return True
        if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
            return False
        for i in range(0, len(self.contents)):
            if self.contents[i] != other.contents[i]:
                return False
        return True

    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        return not self == other

    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        return self.__str__(encoding)

    def __unicode__(self):
        return self.__str__(None)

    # Matches an angle bracket, or an ampersand that does not start a
    # numeric or named entity.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           + r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + r")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        With `prettyPrint`, tags and text are put on their own lines and
        indented according to `indentLevel`. A tag whose `hidden` flag is
        set renders only its contents.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            # NOTE(review): "squot" is not one of the
                            # names in XML_ENTITIES_TO_SPECIAL_CHARS;
                            # left unchanged to keep output stable.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s

    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        if len(self.contents) == 0:
            return
        # Walk the document-order chain from the first child and sever
        # every link of every element reached.
        current = self.contents[0]
        while current is not None:
            next = current.next
            if isinstance(current, Tag):
                del current.contents[:]
            current.parent = None
            current.previous = None
            current.previousSibling = None
            current.next = None
            current.nextSibling = None
            current = next

    def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
        "Renders this tag with pretty-printing turned on."
        return self.__str__(encoding, True)

    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string."""
        s = []
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)

    #Soup methods

    def find(self, name=None, attrs={}, recursive=True, text=None,
             **kwargs):
        """Return only the first child of this Tag matching the given
        criteria."""
        r = None
        l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findChild = find

    def findAll(self, name=None, attrs={}, recursive=True, text=None,
                limit=None, **kwargs):
        """Extracts a list of Tag objects that match the given
        criteria.  You can specify the name of the Tag and any
        attributes you want the Tag to have.

        The value of a key-value pair in the 'attrs' map can be a
        string, a list of strings, a regular expression object, or a
        callable that takes a string and returns whether or not the
        string matches for some custom definition of 'matches'. The
        same is true of the tag name.

        With recursive=False only direct children are considered."""
        generator = self.recursiveChildGenerator
        if not recursive:
            generator = self.childGenerator
        return self._findAll(name, attrs, text, limit, generator, **kwargs)
    findChildren = findAll

    # Pre-3.x compatibility methods
    first = find
    fetch = findAll

    def fetchText(self, text=None, recursive=True, limit=None):
        return self.findAll(text=text, recursive=recursive, limit=limit)

    def firstText(self, text=None, recursive=True):
        return self.find(text=text, recursive=recursive)

    #Private methods

    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized."""
        # Look in the instance dict directly: going through getattr()
        # would fall into __getattr__ when the map is missing and search
        # the children for a tag named "attrMap".
        if not self.__dict__.get('attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap

    #Generator methods
    def childGenerator(self):
        # Just use the iterator from the contents
        return iter(self.contents)

    def recursiveChildGenerator(self):
        "Yields every element beneath this tag, in document order."
        if not len(self.contents):
            # End the generator with a plain return; raising
            # StopIteration inside a generator body is an error on
            # interpreters that implement PEP 479.
            return
        stopNode = self._lastRecursiveChild().next
        current = self.contents[0]
        while current is not stopNode:
            yield current
            current = current.next
883 |
884 |
885 | # Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs=None, text=None, **kwargs):
        """Stores the match criteria.

        name  -- criterion for the tag name.
        attrs -- map of attribute name to criterion. A plain string
                 is treated as a CSS class criterion (stored under
                 the 'class' key via _match_css_class). None means
                 "no attribute criteria".
        text  -- criterion for text nodes.
        Extra keyword arguments are merged into the attribute
        criteria; a caller-supplied attrs map is copied before the
        merge, never modified."""
        self.name = name
        if isinstance(attrs, basestring):
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            if attrs:
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        elif attrs is None:
            # Every strainer gets its own empty map: no default dict
            # is shared between instances, and searchTag can always
            # call self.attrs.items().
            attrs = {}
        self.attrs = attrs
        self.text = text

    def __str__(self):
        """Returns the text criterion if one is set, otherwise
        'name|attrs'."""
        if self.text:
            if isinstance(self.text, basestring):
                return self.text
            # A regexp, callable, list or True is not a string;
            # __str__ must return one, so format it.
            return "%s" % (self.text,)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs=None):
        """Checks a tag against the name and attribute criteria.

        markupName is either a Tag (which then also supplies the
        attributes) or a tag name, in which case markupAttrs gives
        the attributes as a mapping or as (key, value) pairs.

        Returns the Tag, or the name that was passed in, on a match;
        None otherwise."""
        if markupAttrs is None:
            markupAttrs = {}
        found = None
        markup = None
        isTag = isinstance(markupName, Tag)
        if isTag:
            markup = markupName
            markupAttrs = markup
        # A callable name criterion is handed the raw name and
        # attributes, but only when no Tag object exists yet.
        callFunctionWithTagData = callable(self.name) and not isTag

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if markupAttrMap is None:
                        # Built lazily, and only once: anything with
                        # a get() method is used as-is, a sequence of
                        # pairs is turned into a dict.
                        if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                        else:
                            markupAttrMap = {}
                            for k, v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Matches markup against this strainer and returns the
        matching element, or None.

        Raises Exception for markup that is neither iterable, a Tag,
        nor a string."""
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
               and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception("I don't know how to match against a %s"
                            % markup.__class__)
        return found

    def _matches(self, markup, matchAgainst):
        """Tests one piece of markup against one criterion.

        matchAgainst may be True (matches anything that is not None),
        a callable, an object with a match attribute (treated as a
        regexp and applied with search()), an iterable (membership
        test), a mapping (key membership), or a value compared for
        equality. The result is meant to be used for its truth
        value."""
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that markup is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # Map-like object that is not iterable: test the
                # markup against its keys.
                result = markup in [key for key, _ in matchAgainst.items()]
            elif matchAgainst and isinstance(markup, basestring):
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
1005 |
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list and records source, the
        object that produced it, as self.source."""
        # list.__init__ has to be handed this instance; calling it
        # on a fresh [] would initialize a throwaway list instead.
        list.__init__(self)
        self.source = source
1012 |
1013 | # Now, some helper functions.
1014 |
def buildTagMap(default, *args):
    """Folds any number of maps, lists, or scalars into one dict.

    A map contributes its own key/value pairs, a list maps each of
    its items to default, and anything else is used as a single key
    mapped to default. Later arguments override earlier ones.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    tagMap = {}
    for part in args:
        if hasattr(part, 'items'):
            #A map: take its pairs as they are.
            tagMap.update(part.items())
        elif hasattr(part, '__iter__'):
            #A list: every item points at the default.
            tagMap.update([(key, default) for key in part])
        else:
            #A scalar: it is the key itself.
            tagMap[part] = default
    return tagMap
1033 |
1034 | # Now, the parser classes.
1035 |
1036 | class BeautifulStoneSoup(Tag, SGMLParser):
1037 |
1038 | """This class contains the basic parser and search code. It defines
1039 | a parser that knows nothing about tag behavior except for the
1040 | following:
1041 |
1042 | You can't close a tag without closing all the tags it encloses.
1043 | That is, "
(No space between name of closing tag and tag close)
1102 | (Extraneous whitespace in declaration)
1103 |
1104 | You can pass in a custom list of (RE object, replace method)
1105 | tuples to get Beautiful Soup to scrub your input the way you
1106 | want."""
1107 |
1108 | self.parseOnlyThese = parseOnlyThese
1109 | self.fromEncoding = fromEncoding
1110 | self.smartQuotesTo = smartQuotesTo
1111 | self.convertEntities = convertEntities
1112 | # Set the rules for how we'll deal with the entities we
1113 | # encounter
1114 | if self.convertEntities:
1115 | # It doesn't make sense to convert encoded characters to
1116 | # entities even while you're converting entities to Unicode.
1117 | # Just convert it all to Unicode.
1118 | self.smartQuotesTo = None
1119 | if convertEntities == self.HTML_ENTITIES:
1120 | self.convertXMLEntities = False
1121 | self.convertHTMLEntities = True
1122 | self.escapeUnrecognizedEntities = True
1123 | elif convertEntities == self.XHTML_ENTITIES:
1124 | self.convertXMLEntities = True
1125 | self.convertHTMLEntities = True
1126 | self.escapeUnrecognizedEntities = False
1127 | elif convertEntities == self.XML_ENTITIES:
1128 | self.convertXMLEntities = True
1129 | self.convertHTMLEntities = False
1130 | self.escapeUnrecognizedEntities = False
1131 | else:
1132 | self.convertXMLEntities = False
1133 | self.convertHTMLEntities = False
1134 | self.escapeUnrecognizedEntities = False
1135 |
1136 | self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1137 | SGMLParser.__init__(self)
1138 |
1139 | if hasattr(markup, 'read'): # It's a file-type object.
1140 | markup = markup.read()
1141 | self.markup = markup
1142 | self.markupMassage = markupMassage
1143 | try:
1144 | self._feed(isHTML=isHTML)
1145 | except StopParsing:
1146 | pass
1147 | self.markup = None # The markup can now be GCed
1148 |
def convert_charref(self, name):
    """This method fixes a bug in Python's SGMLParser.

    name is the text of a numeric character reference. Returns
    self.convert_codepoint(n) when name is a decimal number in the
    ASCII range 0..127, and None otherwise."""
    try:
        codepoint = int(name)
    except ValueError:
        return None
    # ASCII ends at 127, not 255
    if codepoint < 0 or codepoint > 127:
        return None
    return self.convert_codepoint(codepoint)
1158 |
def _feed(self, inDocumentEncoding=None, isHTML=False):
    """Converts self.markup to Unicode, massages it, and runs it
    through the SGML parser.

    Markup that is not already unicode goes through UnicodeDammit,
    which is offered self.fromEncoding and inDocumentEncoding as
    candidate encodings; the detected encodings are recorded on the
    soup. Afterwards any buffered data is flushed and every tag
    still open is popped, down to the root tag."""
    # Convert the document to Unicode.
    source = self.markup
    if not isinstance(source, unicode):
        converter = UnicodeDammit(
            source, [self.fromEncoding, inDocumentEncoding],
            smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
        source = converter.unicode
        self.originalEncoding = converter.originalEncoding
        self.declaredHTMLEncoding = converter.declaredHTMLEncoding
    elif not hasattr(self, 'originalEncoding'):
        self.originalEncoding = None

    if source and self.markupMassage:
        # Anything that is not a list of (regexp, replacement)
        # pairs selects the built-in massage rules.
        if not hasattr(self.markupMassage, "__iter__"):
            self.markupMassage = self.MARKUP_MASSAGE
        for pattern, replacement in self.markupMassage:
            source = pattern.sub(replacement, source)
        # TODO: We get rid of markupMassage so that the
        # soup object can be deepcopied later on. Some
        # Python installations can't copy regexes. If anyone
        # was relying on the existence of markupMassage, this
        # might cause problems.
        del self.markupMassage
    self.reset()

    SGMLParser.feed(self, source)
    # Close out any unfinished strings and close all the open tags.
    self.endData()
    while self.currentTag.name != self.ROOT_TAG_NAME:
        self.popTag()
1191 |
def __getattr__(self, methodName):
    """This method routes method call requests to either the SGMLParser
    superclass or the Tag superclass, depending on the method name.

    Names starting with 'start_', 'end_' or 'do_' go to SGMLParser;
    any other name that does not start with a double underscore
    goes to Tag. Everything else raises AttributeError."""
    if methodName.startswith('start_') or methodName.startswith('end_') \
           or methodName.startswith('do_'):
        return SGMLParser.__getattr__(self, methodName)
    elif not methodName.startswith('__'):
        return Tag.__getattr__(self, methodName)
    else:
        # Name the missing attribute so the failure can be traced.
        raise AttributeError(methodName)
1204 |
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser, i.e. a key of either
    the class-wide SELF_CLOSING_TAGS map or this instance's
    instanceSelfClosingTags map."""
    # 'in' replaces the deprecated dict.has_key(); the result is
    # the same boolean.
    return name in self.SELF_CLOSING_TAGS \
           or name in self.instanceSelfClosingTags
1210 |
def reset(self):
    """Returns the soup to its pristine pre-parse state.

    Re-initializes this object as the hidden root tag named
    ROOT_TAG_NAME, resets the SGMLParser machinery, empties the
    data buffer and the tag and quote stacks, and finally pushes
    the soup itself as the bottom entry of the tag stack."""
    Tag.__init__(self, self, self.ROOT_TAG_NAME)
    self.hidden = 1
    SGMLParser.reset(self)
    self.currentData, self.tagStack, self.quoteStack = [], [], []
    # Must be None when the root is pushed, so the root is not
    # appended to any parent's contents.
    self.currentTag = None
    self.pushTag(self)
1220 |
def popTag(self):
    """Removes the top entry of the tag stack.

    If the stack is still non-empty afterwards, its new top becomes
    self.currentTag; otherwise self.currentTag is left alone.
    Returns self.currentTag."""
    stack = self.tagStack
    stack.pop()
    if stack:
        self.currentTag = stack[-1]
    return self.currentTag
1228 |
def pushTag(self, tag):
    """Makes tag the current tag.

    If there is a current tag, tag is first appended to its
    contents; tag is then pushed onto the tag stack and the top of
    the stack becomes self.currentTag."""
    parent = self.currentTag
    if parent:
        parent.contents.append(tag)
    stack = self.tagStack
    stack.append(tag)
    self.currentTag = stack[-1]
1235 |
def endData(self, containerClass=NavigableString):
    """Turns the buffered character data into a node of the tree.

    Does nothing when the buffer is empty. Text that consists only
    of ASCII whitespace is collapsed to a single newline (if it
    contained one) or a single space, unless one of the open tags
    is listed in PRESERVE_WHITESPACE_TAGS. The text is wrapped in
    containerClass, linked after self.previous and appended to the
    current tag. With a parseOnlyThese strainer, top-level text
    that the strainer does not select is dropped."""
    if not self.currentData:
        return
    text = u''.join(self.currentData)
    if text.translate(self.STRIP_ASCII_SPACES) == '':
        openTagNames = set([tag.name for tag in self.tagStack])
        if not openTagNames.intersection(self.PRESERVE_WHITESPACE_TAGS):
            if '\n' in text:
                text = '\n'
            else:
                text = ' '
    self.currentData = []

    strainer = self.parseOnlyThese
    if strainer and len(self.tagStack) <= 1 and \
           not (strainer.text and strainer.search(text)):
        return

    node = containerClass(text)
    node.setup(self.currentTag, self.previous)
    if self.previous:
        self.previous.next = node
    self.previous = node
    self.currentTag.contents.append(node)
1257 |
1258 |
def _popToTag(self, name, inclusivePop=True):
    """Pops the tag stack up to and including the most recent
    instance of the given tag. If inclusivePop is false, pops the tag
    stack up to but *not* including the most recent instance of
    the given tag.

    The bottom entry of the stack is never considered. Returns the
    result of the last popTag() call, or None if nothing was popped
    (including when name is the root tag name)."""
    if name == self.ROOT_TAG_NAME:
        return

    pops = 0
    index = len(self.tagStack) - 1
    while index > 0:
        if name == self.tagStack[index].name:
            pops = len(self.tagStack) - index
            break
        index -= 1
    if not inclusivePop:
        pops -= 1

    lastResult = None
    while pops > 0:
        lastResult = self.popTag()
        pops -= 1
    return lastResult
1280 |
def _smartPop(self, name):
    """We need to pop up to the previous tag of this type, unless
    one of this tag's nesting reset triggers comes between this
    tag and the previous tag of this type, OR unless this tag is a
    generic nesting trigger and another generic nesting trigger
    comes between this tag and the previous tag of this type.

    Examples (assuming the usual HTML entries in NESTABLE_TAGS and
    RESET_NESTING_TAGS -- those tables are defined elsewhere):

     <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
     <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
    """
    nestingResetTriggers = self.NESTABLE_TAGS.get(name)
    isNestable = nestingResetTriggers is not None
    isResetNesting = name in self.RESET_NESTING_TAGS
    popTo = None
    inclusive = True
    # Walk the open tags from the innermost outwards; the bottom
    # entry of the stack is never examined.
    for i in range(len(self.tagStack)-1, 0, -1):
        p = self.tagStack[i]
        if (not p or p.name == name) and not isNestable:
            #Non-nestable tags get popped to the top or to their
            #last occurrence.
            popTo = name
            break
        if isNestable:
            resetsNesting = p.name in nestingResetTriggers
        else:
            resetsNesting = isResetNesting \
                            and p.name in self.RESET_NESTING_TAGS
        if resetsNesting:
            #If we encounter one of the nesting reset triggers
            #peculiar to this tag, or we encounter another tag
            #that causes nesting to reset, pop up to but not
            #including that tag.
            popTo = p.name
            inclusive = False
            break
    if popTo:
        self._popToTag(popTo, inclusive)
1326 |
def unknown_starttag(self, name, attrs, selfClosing=0):
    """Handles a start tag reported by the parser.

    While a quote tag is open the tag is not real: its source form
    is rebuilt and buffered as character data, and None is
    returned. Otherwise pending data is flushed, implicitly closed
    tags are popped, and a new Tag is created, linked after
    self.previous and pushed (then popped again right away if it is
    self-closing). Returns the new Tag, or None when a
    parseOnlyThese strainer rejects a top-level tag."""
    if self.quoteStack:
        #This is not a real tag.
        rendered = ''.join([' %s="%s"' % (key, value)
                            for key, value in attrs])
        self.handle_data('<%s%s>' % (name, rendered))
        return
    self.endData()

    if not (self.isSelfClosingTag(name) or selfClosing):
        self._smartPop(name)

    strainer = self.parseOnlyThese
    if strainer and len(self.tagStack) <= 1 \
           and (strainer.text or not strainer.searchTag(name, attrs)):
        return

    element = Tag(self, name, attrs, self.currentTag, self.previous)
    if self.previous:
        self.previous.next = element
    self.previous = element
    self.pushTag(element)
    if selfClosing or self.isSelfClosingTag(name):
        self.popTag()
    if name in self.QUOTE_TAGS:
        #From here on, markup is quoted until the matching end tag.
        self.quoteStack.append(name)
        self.literal = 1
    return element
1356 |
def unknown_endtag(self, name):
    """Handles an end tag reported by the parser.

    While a quote tag is open, an end tag for any other name is not
    real: its source form '</name>' is buffered as character data.
    Otherwise pending data is flushed and the tag stack is popped
    to the named tag; if name closes the innermost quote tag, that
    quote is ended and self.literal is updated."""
    if self.quoteStack and self.quoteStack[-1] != name:
        #This is not a real end tag.
        # Re-emit the complete end tag, including '</', the same
        # way unknown_starttag re-emits a complete start tag.
        self.handle_data('</%s>' % name)
        return
    self.endData()
    self._popToTag(name)
    if self.quoteStack and self.quoteStack[-1] == name:
        self.quoteStack.pop()
        self.literal = (len(self.quoteStack) > 0)
1369 |
def handle_data(self, data):
    """Buffers a piece of character data by appending it to
    self.currentData."""
    pending = self.currentData
    pending.append(data)
1372 |
def _toStringSubclass(self, text, subclass):
    """Adds a certain piece of text to the tree as a NavigableString
    subclass.

    Pending data is flushed first, so that text ends up in an
    object of its own, built with the given subclass."""
    flush = self.endData
    flush()
    self.handle_data(text)
    flush(subclass)
1379 |
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later.

    Any instruction whose text starts with "xml" is replaced by a
    canonical XML declaration carrying that slot."""
    if text.startswith("xml"):
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    nodeClass = ProcessingInstruction
    self._toStringSubclass(text, nodeClass)
1387 |
def handle_comment(self, text):
    """Handle comments as Comment objects: the comment text is
    added to the tree wrapped in the Comment class."""
    nodeClass = Comment
    self._toStringSubclass(text, nodeClass)
1391 |
def handle_charref(self, ref):
    """Handle character references as data.

    ref is the text between '&#' and ';'. When entity conversion is
    enabled the reference is replaced by the character it names;
    otherwise, or when ref is not a usable decimal code point, the
    reference is passed through in its source form '&#ref;'."""
    literal = '&#%s;' % ref
    if self.convertEntities:
        try:
            data = unichr(int(ref))
        except (ValueError, OverflowError):
            # Not a decimal number, or outside the range unichr()
            # accepts: keep the reference as written instead of
            # aborting the parse.
            data = literal
    else:
        data = literal
    self.handle_data(data)
1399 |
def handle_entityref(self, ref):
    """Handle entity references as data, possibly converting known
    HTML and/or XML entity references to the corresponding Unicode
    characters.

    ref is the entity name without '&' and ';'. Exactly one piece
    of data is passed to handle_data: the converted character, an
    escaped '&amp;ref' for a name that HTML conversion does not
    recognize, or the untouched reference '&ref;'."""
    data = None
    if self.convertHTMLEntities:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            pass

    if not data and self.convertXMLEntities:
        data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

    if not data and self.convertHTMLEntities and \
           not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
        # TODO: We've got a problem here. We're told this is
        # an entity reference, but it's not an XML entity
        # reference or an HTML entity reference. Nonetheless,
        # the logical thing to do is to pass it through as an
        # unrecognized entity reference.
        #
        # Except: when the input is "&carol;" this function
        # will be called with input "carol". When the input is
        # "AT&T", this function will be called with input
        # "T". We have no way of knowing whether a semicolon
        # was present originally, so we don't know whether
        # this is an unknown entity or just a misplaced
        # ampersand.
        #
        # The more common case is a misplaced ampersand, so I
        # escape the ampersand and omit the trailing semicolon.
        data = "&amp;%s" % ref
    if not data:
        # This case is different from the one above, because we
        # haven't already gone through a supposedly comprehensive
        # mapping of entities to Unicode characters. We might not
        # have gone through any mapping at all. So the chances are
        # very high that this is a real entity, and not a
        # misplaced ampersand.
        data = "&%s;" % ref
    self.handle_data(data)
1442 |
def handle_decl(self, data):
    """Handle DOCTYPEs and the like as Declaration objects: the
    declaration text is added to the tree wrapped in the
    Declaration class."""
    nodeClass = Declaration
    self._toStringSubclass(data, nodeClass)
1446 |
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object.

    i is the offset in self.rawdata at which the declaration
    starts. Returns the offset at which parsing should resume."""
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        # The section runs up to the closing ']]>', or to the end
        # of the input when it is never closed.
        k = self.rawdata.find(']]>', i)
        if k == -1:
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        j = k+3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            # Not a declaration the SGML parser understands: hand
            # the rest of the input over as plain character data.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
1466 |
1467 | class BeautifulSoup(BeautifulStoneSoup):
1468 |
1469 | """This parser knows the following facts about HTML:
1470 |
1471 | * Some tags have no closing tag and should be interpreted as being
1472 | closed as soon as they are encountered.
1473 |
1474 | * The text inside some tags (ie. 'script') may contain tags which
1475 | are not really part of the document and which should be parsed
1476 | as text, not tags. If you want to parse the text as tags, you can
1477 | always fetch it and parse it explicitly.
1478 |
1479 | * Tag nesting rules:
1480 |
1481 | Most tags can't be nested at all. For instance, the occurance of
1482 | a tag should implicitly close the previous tag. 1483 | 1484 | Para1 Para2 1485 | should be transformed into: 1486 | Para1 Para2 1487 | 1488 | Some tags can be nested arbitrarily. For instance, the occurance 1489 | of a tag should _not_ implicitly close the previous 1490 |tag. 1491 | 1492 | Alice said:Bob said:Blah 1493 | should NOT be transformed into: 1494 | Alice said:Bob said:Blah 1495 | 1496 | Some tags can be nested, but the nesting is reset by the 1497 | interposition of other tags. For instance, a |