├── .gitignore
├── README
├── readability
    ├── BeautifulSoup.py
    ├── __init__.py
    ├── page_parser.py
    ├── readability.py
    └── url_helpers.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
 1 | This code is under the Apache License 2.0.  http://www.apache.org/licenses/LICENSE-2.0
 2 | 
 3 | This is a python port of a ruby port of arc90's readability project
 4 | 
 5 | http://lab.arc90.com/experiments/readability/
 6 | 
 7 | Given a html document, it pulls out the main body text and cleans it up.
 8 | 
 9 | Ruby port by starrhorne and iterationlabs
10 | Python port by gfxmonk
11 | 
12 | This port uses BeautifulSoup for the HTML parsing. That means it can be
13 | a little slow, but will work on Google App Engine (unlike libxml-based
14 | libraries)
15 | 
16 | 
17 | **note**: I don't currently have any plans for using or improving this
18 | library, and it's far from perfect (slow, and almost certainly buggy).
19 | So if you do something cool with it or have a better tool that does
20 | the same job, please let me know and I can link to it from here.
21 | 
22 | If you're looking for alternatives / forks, here's the list so far:
23 |  - http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/
24 |  - https://github.com/buriy/python-readability
25 | 


--------------------------------------------------------------------------------
/readability/BeautifulSoup.py:
--------------------------------------------------------------------------------
   1 | """Beautiful Soup
   2 | Elixir and Tonic
   3 | "The Screen-Scraper's Friend"
   4 | http://www.crummy.com/software/BeautifulSoup/
   5 | 
   6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a
   7 | tree representation. It provides methods and Pythonic idioms that make
   8 | it easy to navigate, search, and modify the tree.
   9 | 
  10 | A well-formed XML/HTML document yields a well-formed data
  11 | structure. An ill-formed XML/HTML document yields a correspondingly
  12 | ill-formed data structure. If your document is only locally
  13 | well-formed, you can use this library to find and process the
  14 | well-formed part of it.
  15 | 
  16 | Beautiful Soup works with Python 2.2 and up. It has no external
  17 | dependencies, but you'll have more success at converting data to UTF-8
  18 | if you also install these three packages:
  19 | 
  20 | * chardet, for auto-detecting character encodings
  21 |   http://chardet.feedparser.org/
  22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported
  23 |   by stock Python.
  24 |   http://cjkpython.i18n.org/
  25 | 
  26 | Beautiful Soup defines classes for two main parsing strategies:
  27 | 
  28 |  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  29 |    language that kind of looks like XML.
  30 | 
  31 |  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  32 |    or invalid. This class has web browser-like heuristics for
  33 |    obtaining a sensible parse tree in the face of common HTML errors.
  34 | 
  35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
  36 | the encoding of an HTML or XML document, and converting it to
  37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
  38 | 
  39 | For more than you ever wanted to know about Beautiful Soup, see the
  40 | documentation:
  41 | http://www.crummy.com/software/BeautifulSoup/documentation.html
  42 | 
  43 | Here, have some legalese:
  44 | 
  45 | Copyright (c) 2004-2009, Leonard Richardson
  46 | 
  47 | All rights reserved.
  48 | 
  49 | Redistribution and use in source and binary forms, with or without
  50 | modification, are permitted provided that the following conditions are
  51 | met:
  52 | 
  53 |   * Redistributions of source code must retain the above copyright
  54 |     notice, this list of conditions and the following disclaimer.
  55 | 
  56 |   * Redistributions in binary form must reproduce the above
  57 |     copyright notice, this list of conditions and the following
  58 |     disclaimer in the documentation and/or other materials provided
  59 |     with the distribution.
  60 | 
  61 |   * Neither the name of the the Beautiful Soup Consortium and All
  62 |     Night Kosher Bakery nor the names of its contributors may be
  63 |     used to endorse or promote products derived from this software
  64 |     without specific prior written permission.
  65 | 
  66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
  77 | 
  78 | """
  79 | from __future__ import generators
  80 | 
  81 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
  82 | __version__ = "3.1.0.1"
  83 | __copyright__ = "Copyright (c) 2004-2009 Leonard Richardson"
  84 | __license__ = "New-style BSD"
  85 | 
  86 | import codecs
  87 | import markupbase
  88 | import types
  89 | import re
  90 | from HTMLParser import HTMLParser, HTMLParseError
  91 | try:
  92 |     from htmlentitydefs import name2codepoint
  93 | except ImportError:
  94 |     name2codepoint = {}
  95 | try:
  96 |     set
  97 | except NameError:
  98 |     from sets import Set as set
  99 | 
 100 | #These hacks make Beautiful Soup able to parse XML with namespaces
 101 | markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
 102 | 
 103 | DEFAULT_OUTPUT_ENCODING = "utf-8"
 104 | 
 105 | # First, the classes that represent markup elements.
 106 | 
 107 | def sob(unicode, encoding):
 108 |     """Returns either the given Unicode string or its encoding."""
 109 |     if encoding is None:
 110 |         return unicode
 111 |     else:
 112 |         return unicode.encode(encoding)
 113 | 
 114 | class PageElement:
 115 |     """Contains the navigational information for some part of the page
 116 |     (either a tag or a piece of text)"""
 117 | 
 118 |     def setup(self, parent=None, previous=None):
 119 |         """Sets up the initial relations between this element and
 120 |         other elements."""
 121 |         self.parent = parent
 122 |         self.previous = previous
 123 |         self.next = None
 124 |         self.previousSibling = None
 125 |         self.nextSibling = None
 126 |         if self.parent and self.parent.contents:
 127 |             self.previousSibling = self.parent.contents[-1]
 128 |             self.previousSibling.nextSibling = self
 129 | 
 130 |     def replaceWith(self, replaceWith):
 131 |         oldParent = self.parent
 132 |         myIndex = self.parent.contents.index(self)
 133 |         if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
 134 |             # We're replacing this element with one of its siblings.
 135 |             index = self.parent.contents.index(replaceWith)
 136 |             if index and index < myIndex:
 137 |                 # Furthermore, it comes before this element. That
 138 |                 # means that when we extract it, the index of this
 139 |                 # element will change.
 140 |                 myIndex = myIndex - 1
 141 |         self.extract()
 142 |         oldParent.insert(myIndex, replaceWith)
 143 | 
 144 |     def extract(self):
 145 |         """Destructively rips this element out of the tree."""
 146 |         if self.parent:
 147 |             try:
 148 |                 self.parent.contents.remove(self)
 149 |             except ValueError:
 150 |                 pass
 151 | 
 152 |         #Find the two elements that would be next to each other if
 153 |         #this element (and any children) hadn't been parsed. Connect
 154 |         #the two.
 155 |         lastChild = self._lastRecursiveChild()
 156 |         nextElement = lastChild.next
 157 | 
 158 |         if self.previous:
 159 |             self.previous.next = nextElement
 160 |         if nextElement:
 161 |             nextElement.previous = self.previous
 162 |         self.previous = None
 163 |         lastChild.next = None
 164 | 
 165 |         self.parent = None
 166 |         if self.previousSibling:
 167 |             self.previousSibling.nextSibling = self.nextSibling
 168 |         if self.nextSibling:
 169 |             self.nextSibling.previousSibling = self.previousSibling
 170 |         self.previousSibling = self.nextSibling = None
 171 |         return self
 172 | 
 173 |     def _lastRecursiveChild(self):
 174 |         "Finds the last element beneath this object to be parsed."
 175 |         lastChild = self
 176 |         while hasattr(lastChild, 'contents') and lastChild.contents:
 177 |             lastChild = lastChild.contents[-1]
 178 |         return lastChild
 179 | 
 180 |     def insert(self, position, newChild):
 181 |         if (isinstance(newChild, basestring)
 182 |             or isinstance(newChild, unicode)) \
 183 |             and not isinstance(newChild, NavigableString):
 184 |             newChild = NavigableString(newChild)
 185 | 
 186 |         position =  min(position, len(self.contents))
 187 |         if hasattr(newChild, 'parent') and newChild.parent != None:
 188 |             # We're 'inserting' an element that's already one
 189 |             # of this object's children.
 190 |             if newChild.parent == self:
 191 |                 index = self.find(newChild)
 192 |                 if index and index < position:
 193 |                     # Furthermore we're moving it further down the
 194 |                     # list of this object's children. That means that
 195 |                     # when we extract this element, our target index
 196 |                     # will jump down one.
 197 |                     position = position - 1
 198 |             newChild.extract()
 199 | 
 200 |         newChild.parent = self
 201 |         previousChild = None
 202 |         if position == 0:
 203 |             newChild.previousSibling = None
 204 |             newChild.previous = self
 205 |         else:
 206 |             previousChild = self.contents[position-1]
 207 |             newChild.previousSibling = previousChild
 208 |             newChild.previousSibling.nextSibling = newChild
 209 |             newChild.previous = previousChild._lastRecursiveChild()
 210 |         if newChild.previous:
 211 |             newChild.previous.next = newChild
 212 | 
 213 |         newChildsLastElement = newChild._lastRecursiveChild()
 214 | 
 215 |         if position >= len(self.contents):
 216 |             newChild.nextSibling = None
 217 | 
 218 |             parent = self
 219 |             parentsNextSibling = None
 220 |             while not parentsNextSibling:
 221 |                 parentsNextSibling = parent.nextSibling
 222 |                 parent = parent.parent
 223 |                 if not parent: # This is the last element in the document.
 224 |                     break
 225 |             if parentsNextSibling:
 226 |                 newChildsLastElement.next = parentsNextSibling
 227 |             else:
 228 |                 newChildsLastElement.next = None
 229 |         else:
 230 |             nextChild = self.contents[position]
 231 |             newChild.nextSibling = nextChild
 232 |             if newChild.nextSibling:
 233 |                 newChild.nextSibling.previousSibling = newChild
 234 |             newChildsLastElement.next = nextChild
 235 | 
 236 |         if newChildsLastElement.next:
 237 |             newChildsLastElement.next.previous = newChildsLastElement
 238 |         self.contents.insert(position, newChild)
 239 | 
 240 |     def append(self, tag):
 241 |         """Appends the given tag to the contents of this tag."""
 242 |         self.insert(len(self.contents), tag)
 243 | 
 244 |     def findNext(self, name=None, attrs={}, text=None, **kwargs):
 245 |         """Returns the first item that matches the given criteria and
 246 |         appears after this Tag in the document."""
 247 |         return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
 248 | 
 249 |     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
 250 |                     **kwargs):
 251 |         """Returns all items that match the given criteria and appear
 252 |         after this Tag in the document."""
 253 |         return self._findAll(name, attrs, text, limit, self.nextGenerator,
 254 |                              **kwargs)
 255 | 
 256 |     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
 257 |         """Returns the closest sibling to this Tag that matches the
 258 |         given criteria and appears after this Tag in the document."""
 259 |         return self._findOne(self.findNextSiblings, name, attrs, text,
 260 |                              **kwargs)
 261 | 
 262 |     def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
 263 |                          **kwargs):
 264 |         """Returns the siblings of this Tag that match the given
 265 |         criteria and appear after this Tag in the document."""
 266 |         return self._findAll(name, attrs, text, limit,
 267 |                              self.nextSiblingGenerator, **kwargs)
 268 |     fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
 269 | 
 270 |     def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
 271 |         """Returns the first item that matches the given criteria and
 272 |         appears before this Tag in the document."""
 273 |         return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
 274 | 
 275 |     def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
 276 |                         **kwargs):
 277 |         """Returns all items that match the given criteria and appear
 278 |         before this Tag in the document."""
 279 |         return self._findAll(name, attrs, text, limit, self.previousGenerator,
 280 |                            **kwargs)
 281 |     fetchPrevious = findAllPrevious # Compatibility with pre-3.x
 282 | 
 283 |     def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
 284 |         """Returns the closest sibling to this Tag that matches the
 285 |         given criteria and appears before this Tag in the document."""
 286 |         return self._findOne(self.findPreviousSiblings, name, attrs, text,
 287 |                              **kwargs)
 288 | 
 289 |     def findPreviousSiblings(self, name=None, attrs={}, text=None,
 290 |                              limit=None, **kwargs):
 291 |         """Returns the siblings of this Tag that match the given
 292 |         criteria and appear before this Tag in the document."""
 293 |         return self._findAll(name, attrs, text, limit,
 294 |                              self.previousSiblingGenerator, **kwargs)
 295 |     fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
 296 | 
 297 |     def findParent(self, name=None, attrs={}, **kwargs):
 298 |         """Returns the closest parent of this Tag that matches the given
 299 |         criteria."""
 300 |         # NOTE: We can't use _findOne because findParents takes a different
 301 |         # set of arguments.
 302 |         r = None
 303 |         l = self.findParents(name, attrs, 1)
 304 |         if l:
 305 |             r = l[0]
 306 |         return r
 307 | 
 308 |     def findParents(self, name=None, attrs={}, limit=None, **kwargs):
 309 |         """Returns the parents of this Tag that match the given
 310 |         criteria."""
 311 | 
 312 |         return self._findAll(name, attrs, None, limit, self.parentGenerator,
 313 |                              **kwargs)
 314 |     fetchParents = findParents # Compatibility with pre-3.x
 315 | 
 316 |     #These methods do the real heavy lifting.
 317 | 
 318 |     def _findOne(self, method, name, attrs, text, **kwargs):
 319 |         r = None
 320 |         l = method(name, attrs, text, 1, **kwargs)
 321 |         if l:
 322 |             r = l[0]
 323 |         return r
 324 | 
 325 |     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
 326 |         "Iterates over a generator looking for things that match."
 327 | 
 328 |         if isinstance(name, SoupStrainer):
 329 |             strainer = name
 330 |         else:
 331 |             # Build a SoupStrainer
 332 |             strainer = SoupStrainer(name, attrs, text, **kwargs)
 333 |         results = ResultSet(strainer)
 334 |         g = generator()
 335 |         while True:
 336 |             try:
 337 |                 i = g.next()
 338 |             except StopIteration:
 339 |                 break
 340 |             if i:
 341 |                 found = strainer.search(i)
 342 |                 if found:
 343 |                     results.append(found)
 344 |                     if limit and len(results) >= limit:
 345 |                         break
 346 |         return results
 347 | 
 348 |     #These Generators can be used to navigate starting from both
 349 |     #NavigableStrings and Tags.
 350 |     def nextGenerator(self):
 351 |         i = self
 352 |         while i:
 353 |             i = i.next
 354 |             yield i
 355 | 
 356 |     def nextSiblingGenerator(self):
 357 |         i = self
 358 |         while i:
 359 |             i = i.nextSibling
 360 |             yield i
 361 | 
 362 |     def previousGenerator(self):
 363 |         i = self
 364 |         while i:
 365 |             i = i.previous
 366 |             yield i
 367 | 
 368 |     def previousSiblingGenerator(self):
 369 |         i = self
 370 |         while i:
 371 |             i = i.previousSibling
 372 |             yield i
 373 | 
 374 |     def parentGenerator(self):
 375 |         i = self
 376 |         while i:
 377 |             i = i.parent
 378 |             yield i
 379 | 
 380 |     # Utility methods
 381 |     def substituteEncoding(self, str, encoding=None):
 382 |         encoding = encoding or "utf-8"
 383 |         return str.replace("%SOUP-ENCODING%", encoding)
 384 | 
 385 |     def toEncoding(self, s, encoding=None):
 386 |         """Encodes an object to a string in some encoding, or to Unicode.
 387 |         ."""
 388 |         if isinstance(s, unicode):
 389 |             if encoding:
 390 |                 s = s.encode(encoding)
 391 |         elif isinstance(s, str):
 392 |             if encoding:
 393 |                 s = s.encode(encoding)
 394 |             else:
 395 |                 s = unicode(s)
 396 |         else:
 397 |             if encoding:
 398 |                 s  = self.toEncoding(str(s), encoding)
 399 |             else:
 400 |                 s = unicode(s)
 401 |         return s
 402 | 
 403 | class NavigableString(unicode, PageElement):
 404 | 
 405 |     def __new__(cls, value):
 406 |         """Create a new NavigableString.
 407 | 
 408 |         When unpickling a NavigableString, this method is called with
 409 |         the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
 410 |         passed in to the superclass's __new__ or the superclass won't know
 411 |         how to handle non-ASCII characters.
 412 |         """
 413 |         if isinstance(value, unicode):
 414 |             return unicode.__new__(cls, value)
 415 |         return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
 416 | 
 417 |     def __getnewargs__(self):
 418 |         return (unicode(self),)
 419 | 
 420 |     def __getattr__(self, attr):
 421 |         """text.string gives you text. This is for backwards
 422 |         compatibility for Navigable*String, but for CData* it lets you
 423 |         get the string without the CData wrapper."""
 424 |         if attr == 'string':
 425 |             return self
 426 |         else:
 427 |             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 428 | 
 429 |     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING):
 430 |         return self.decode().encode(encoding)
 431 | 
 432 |     def decodeGivenEventualEncoding(self, eventualEncoding):
 433 |         return self
 434 | 
 435 | class CData(NavigableString):
 436 | 
 437 |     def decodeGivenEventualEncoding(self, eventualEncoding):
 438 |         return u'<![CDATA[' + self + u']]>'
 439 | 
 440 | class ProcessingInstruction(NavigableString):
 441 | 
 442 |     def decodeGivenEventualEncoding(self, eventualEncoding):
 443 |         output = self
 444 |         if u'%SOUP-ENCODING%' in output:
 445 |             output = self.substituteEncoding(output, eventualEncoding)
 446 |         return u'<?' + output + u'?>'
 447 | 
 448 | class Comment(NavigableString):
 449 |     def decodeGivenEventualEncoding(self, eventualEncoding):
 450 |         return u'<!--' + self + u'-->'
 451 | 
 452 | class Declaration(NavigableString):
 453 |     def decodeGivenEventualEncoding(self, eventualEncoding):
 454 |         return u'<!' + self + u'>'
 455 | 
 456 | class Tag(PageElement):
 457 | 
 458 |     """Represents a found HTML tag with its attributes and contents."""
 459 | 
 460 |     def _invert(h):
 461 |         "Cheap function to invert a hash."
 462 |         i = {}
 463 |         for k,v in h.items():
 464 |             i[v] = k
 465 |         return i
 466 | 
 467 |     XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
 468 |                                       "quot" : '"',
 469 |                                       "amp" : "&",
 470 |                                       "lt" : "<",
 471 |                                       "gt" : ">" }
 472 | 
 473 |     XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
 474 | 
 475 |     def _convertEntities(self, match):
 476 |         """Used in a call to re.sub to replace HTML, XML, and numeric
 477 |         entities with the appropriate Unicode characters. If HTML
 478 |         entities are being converted, any unrecognized entities are
 479 |         escaped."""
 480 |         x = match.group(1)
 481 |         if self.convertHTMLEntities and x in name2codepoint:
 482 |             return unichr(name2codepoint[x])
 483 |         elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
 484 |             if self.convertXMLEntities:
 485 |                 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
 486 |             else:
 487 |                 return u'&%s;' % x
 488 |         elif len(x) > 0 and x[0] == '#':
 489 |             # Handle numeric entities
 490 |             if len(x) > 1 and x[1] == 'x':
 491 |                 return unichr(int(x[2:], 16))
 492 |             else:
 493 |                 return unichr(int(x[1:]))
 494 | 
 495 |         elif self.escapeUnrecognizedEntities:
 496 |             return u'&amp;%s;' % x
 497 |         else:
 498 |             return u'&%s;' % x
 499 | 
 500 |     def __init__(self, parser, name, attrs=None, parent=None,
 501 |                  previous=None):
 502 |         "Basic constructor."
 503 | 
 504 |         # We don't actually store the parser object: that lets extracted
 505 |         # chunks be garbage-collected
 506 |         self.parserClass = parser.__class__
 507 |         self.isSelfClosing = parser.isSelfClosingTag(name)
 508 |         self.name = name
 509 |         if attrs == None:
 510 |             attrs = []
 511 |         self.attrs = attrs
 512 |         self.contents = []
 513 |         self.setup(parent, previous)
 514 |         self.hidden = False
 515 |         self.containsSubstitutions = False
 516 |         self.convertHTMLEntities = parser.convertHTMLEntities
 517 |         self.convertXMLEntities = parser.convertXMLEntities
 518 |         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
 519 | 
 520 |         def convert(kval):
 521 |             "Converts HTML, XML and numeric entities in the attribute value."
 522 |             k, val = kval
 523 |             if val is None:
 524 |                 return kval
 525 |             return (k, re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
 526 |                               self._convertEntities, val))
 527 |         self.attrs = map(convert, self.attrs)
 528 | 
 529 |     def get(self, key, default=None):
 530 |         """Returns the value of the 'key' attribute for the tag, or
 531 |         the value given for 'default' if it doesn't have that
 532 |         attribute."""
 533 |         return self._getAttrMap().get(key, default)
 534 | 
 535 |     def has_key(self, key):
 536 |         return self._getAttrMap().has_key(key)
 537 | 
 538 |     def __getitem__(self, key):
 539 |         """tag[key] returns the value of the 'key' attribute for the tag,
 540 |         and throws an exception if it's not there."""
 541 |         return self._getAttrMap()[key]
 542 | 
 543 |     def __iter__(self):
 544 |         "Iterating over a tag iterates over its contents."
 545 |         return iter(self.contents)
 546 | 
 547 |     def __len__(self):
 548 |         "The length of a tag is the length of its list of contents."
 549 |         return len(self.contents)
 550 | 
 551 |     def __contains__(self, x):
 552 |         return x in self.contents
 553 | 
 554 |     def __nonzero__(self):
 555 |         "A tag is non-None even if it has no contents."
 556 |         return True
 557 | 
 558 |     def __setitem__(self, key, value):
 559 |         """Setting tag[key] sets the value of the 'key' attribute for the
 560 |         tag."""
 561 |         self._getAttrMap()
 562 |         self.attrMap[key] = value
 563 |         found = False
 564 |         for i in range(0, len(self.attrs)):
 565 |             if self.attrs[i][0] == key:
 566 |                 self.attrs[i] = (key, value)
 567 |                 found = True
 568 |         if not found:
 569 |             self.attrs.append((key, value))
 570 |         self._getAttrMap()[key] = value
 571 | 
 572 |     def __delitem__(self, key):
 573 |         "Deleting tag[key] deletes all 'key' attributes for the tag."
 574 |         for item in self.attrs:
 575 |             if item[0] == key:
 576 |                 self.attrs.remove(item)
 577 |                 #We don't break because bad HTML can define the same
 578 |                 #attribute multiple times.
 579 |             self._getAttrMap()
 580 |             if self.attrMap.has_key(key):
 581 |                 del self.attrMap[key]
 582 | 
 583 |     def __call__(self, *args, **kwargs):
 584 |         """Calling a tag like a function is the same as calling its
 585 |         findAll() method. Eg. tag('a') returns a list of all the A tags
 586 |         found within this tag."""
 587 |         return apply(self.findAll, args, kwargs)
 588 | 
 589 |     def __getattr__(self, tag):
 590 |         #print "Getattr %s.%s" % (self.__class__, tag)
 591 |         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
 592 |             return self.find(tag[:-3])
 593 |         elif tag.find('__') != 0:
 594 |             return self.find(tag)
 595 |         raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
 596 | 
 597 |     def __eq__(self, other):
 598 |         """Returns true iff this tag has the same name, the same attributes,
 599 |         and the same contents (recursively) as the given tag.
 600 | 
 601 |         NOTE: right now this will return false if two tags have the
 602 |         same attributes in a different order. Should this be fixed?"""
 603 |         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
 604 |             return False
 605 |         for i in range(0, len(self.contents)):
 606 |             if self.contents[i] != other.contents[i]:
 607 |                 return False
 608 |         return True
 609 | 
 610 |     def __ne__(self, other):
 611 |         """Returns true iff this tag is not identical to the other tag,
 612 |         as defined in __eq__."""
 613 |         return not self == other
 614 | 
 615 |     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 616 |         """Renders this tag as a string."""
 617 |         return self.decode(eventualEncoding=encoding)
 618 | 
 619 |     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
 620 |                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
 621 |                                            + ")")
 622 | 
 623 |     def _sub_entity(self, x):
 624 |         """Used with a regular expression to substitute the
 625 |         appropriate XML entity for an XML special character."""
 626 |         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
 627 | 
 628 |     def __unicode__(self):
 629 |         return self.decode()
 630 | 
 631 |     def __str__(self):
 632 |         return self.encode()
 633 | 
 634 |     def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
 635 |                prettyPrint=False, indentLevel=0):
 636 |         return self.decode(prettyPrint, indentLevel, encoding).encode(encoding)
 637 | 
 638 |     def decode(self, prettyPrint=False, indentLevel=0,
 639 |                eventualEncoding=DEFAULT_OUTPUT_ENCODING):
 640 |         """Returns a string or Unicode representation of this tag and
 641 |         its contents. To get Unicode, pass None for encoding."""
 642 | 
 643 |         attrs = []
 644 |         if self.attrs:
 645 |             for key, val in self.attrs:
 646 |                 fmt = '%s="%s"'
 647 |                 if isString(val):
 648 |                     if (self.containsSubstitutions
 649 |                         and eventualEncoding is not None
 650 |                         and '%SOUP-ENCODING%' in val):
 651 |                         val = self.substituteEncoding(val, eventualEncoding)
 652 | 
 653 |                     # The attribute value either:
 654 |                     #
 655 |                     # * Contains no embedded double quotes or single quotes.
 656 |                     #   No problem: we enclose it in double quotes.
 657 |                     # * Contains embedded single quotes. No problem:
 658 |                     #   double quotes work here too.
 659 |                     # * Contains embedded double quotes. No problem:
 660 |                     #   we enclose it in single quotes.
 661 |                     # * Embeds both single _and_ double quotes. This
 662 |                     #   can't happen naturally, but it can happen if
 663 |                     #   you modify an attribute value after parsing
 664 |                     #   the document. Now we have a bit of a
 665 |                     #   problem. We solve it by enclosing the
 666 |                     #   attribute in single quotes, and escaping any
 667 |                     #   embedded single quotes to XML entities.
 668 |                     if '"' in val:
 669 |                         fmt = "%s='%s'"
 670 |                         if "'" in val:
 671 |                             # TODO: replace with apos when
 672 |                             # appropriate.
 673 |                             val = val.replace("'", "&squot;")
 674 | 
 675 |                     # Now we're okay w/r/t quotes. But the attribute
 676 |                     # value might also contain angle brackets, or
 677 |                     # ampersands that aren't part of entities. We need
 678 |                     # to escape those to XML entities too.
 679 |                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
 680 |                 if val is None:
 681 |                     # Handle boolean attributes.
 682 |                     decoded = key
 683 |                 else:
 684 |                     decoded = fmt % (key, val)
 685 |                 attrs.append(decoded)
 686 |         close = ''
 687 |         closeTag = ''
 688 |         if self.isSelfClosing:
 689 |             close = ' /'
 690 |         else:
 691 |             closeTag = '</%s>' % self.name
 692 | 
 693 |         indentTag, indentContents = 0, 0
 694 |         if prettyPrint:
 695 |             indentTag = indentLevel
 696 |             space = (' ' * (indentTag-1))
 697 |             indentContents = indentTag + 1
 698 |         contents = self.decodeContents(prettyPrint, indentContents,
 699 |                                        eventualEncoding)
 700 |         if self.hidden:
 701 |             s = contents
 702 |         else:
 703 |             s = []
 704 |             attributeString = ''
 705 |             if attrs:
 706 |                 attributeString = ' ' + ' '.join(attrs)
 707 |             if prettyPrint:
 708 |                 s.append(space)
 709 |             s.append('<%s%s%s>' % (self.name, attributeString, close))
 710 |             if prettyPrint:
 711 |                 s.append("\n")
 712 |             s.append(contents)
 713 |             if prettyPrint and contents and contents[-1] != "\n":
 714 |                 s.append("\n")
 715 |             if prettyPrint and closeTag:
 716 |                 s.append(space)
 717 |             s.append(closeTag)
 718 |             if prettyPrint and closeTag and self.nextSibling:
 719 |                 s.append("\n")
 720 |             s = ''.join(s)
 721 |         return s
 722 | 
 723 |     def decompose(self):
 724 |         """Recursively destroys the contents of this tree."""
 725 |         contents = [i for i in self.contents]
 726 |         for i in contents:
 727 |             if isinstance(i, Tag):
 728 |                 i.decompose()
 729 |             else:
 730 |                 i.extract()
 731 |         self.extract()
 732 | 
 733 |     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
 734 |         return self.encode(encoding, True)
 735 | 
 736 |     def encodeContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
 737 |                        prettyPrint=False, indentLevel=0):
 738 |         return self.decodeContents(prettyPrint, indentLevel).encode(encoding)
 739 | 
 740 |     def decodeContents(self, prettyPrint=False, indentLevel=0,
 741 |                        eventualEncoding=DEFAULT_OUTPUT_ENCODING):
 742 |         """Renders the contents of this tag as a string in the given
 743 |         encoding. If encoding is None, returns a Unicode string.."""
 744 |         s=[]
 745 |         for c in self:
 746 |             text = None
 747 |             if isinstance(c, NavigableString):
 748 |                 text = c.decodeGivenEventualEncoding(eventualEncoding)
 749 |             elif isinstance(c, Tag):
 750 |                 s.append(c.decode(prettyPrint, indentLevel, eventualEncoding))
 751 |             if text and prettyPrint:
 752 |                 text = text.strip()
 753 |             if text:
 754 |                 if prettyPrint:
 755 |                     s.append(" " * (indentLevel-1))
 756 |                 s.append(text)
 757 |                 if prettyPrint:
 758 |                     s.append("\n")
 759 |         return ''.join(s)
 760 | 
 761 |     #Soup methods
 762 | 
 763 |     def find(self, name=None, attrs={}, recursive=True, text=None,
 764 |              **kwargs):
 765 |         """Return only the first child of this Tag matching the given
 766 |         criteria."""
 767 |         r = None
 768 |         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
 769 |         if l:
 770 |             r = l[0]
 771 |         return r
 772 |     findChild = find
 773 | 
 774 |     def findAll(self, name=None, attrs={}, recursive=True, text=None,
 775 |                 limit=None, **kwargs):
 776 |         """Extracts a list of Tag objects that match the given
 777 |         criteria.  You can specify the name of the Tag and any
 778 |         attributes you want the Tag to have.
 779 | 
 780 |         The value of a key-value pair in the 'attrs' map can be a
 781 |         string, a list of strings, a regular expression object, or a
 782 |         callable that takes a string and returns whether or not the
 783 |         string matches for some custom definition of 'matches'. The
 784 |         same is true of the tag name."""
 785 |         generator = self.recursiveChildGenerator
 786 |         if not recursive:
 787 |             generator = self.childGenerator
 788 |         return self._findAll(name, attrs, text, limit, generator, **kwargs)
 789 |     findChildren = findAll
 790 | 
 791 |     # Pre-3.x compatibility methods. Will go away in 4.0.
 792 |     first = find
 793 |     fetch = findAll
 794 | 
 795 |     def fetchText(self, text=None, recursive=True, limit=None):
 796 |         return self.findAll(text=text, recursive=recursive, limit=limit)
 797 | 
 798 |     def firstText(self, text=None, recursive=True):
 799 |         return self.find(text=text, recursive=recursive)
 800 | 
 801 |     # 3.x compatibility methods. Will go away in 4.0.
 802 |     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
 803 |                        prettyPrint=False, indentLevel=0):
 804 |         if encoding is None:
 805 |             return self.decodeContents(prettyPrint, indentLevel, encoding)
 806 |         else:
 807 |             return self.encodeContents(encoding, prettyPrint, indentLevel)
 808 | 
 809 | 
 810 |     #Private methods
 811 | 
 812 |     def _getAttrMap(self):
 813 |         """Initializes a map representation of this tag's attributes,
 814 |         if not already initialized."""
 815 |         if not getattr(self, 'attrMap'):
 816 |             self.attrMap = {}
 817 |             for (key, value) in self.attrs:
 818 |                 self.attrMap[key] = value
 819 |         return self.attrMap
 820 | 
 821 |     #Generator methods
 822 |     def recursiveChildGenerator(self):
 823 |         if not len(self.contents):
 824 |             raise StopIteration
 825 |         stopNode = self._lastRecursiveChild().next
 826 |         current = self.contents[0]
 827 |         while current is not stopNode:
 828 |             yield current
 829 |             current = current.next
 830 | 
 831 |     def childGenerator(self):
 832 |         if not len(self.contents):
 833 |             raise StopIteration
 834 |         current = self.contents[0]
 835 |         while current:
 836 |             yield current
 837 |             current = current.nextSibling
 838 |         raise StopIteration
 839 | 
 840 | # Next, a couple classes to represent queries and their results.
 841 | class SoupStrainer:
 842 |     """Encapsulates a number of ways of matching a markup element (tag or
 843 |     text)."""
 844 | 
 845 |     def __init__(self, name=None, attrs={}, text=None, **kwargs):
 846 |         self.name = name
 847 |         if isString(attrs):
 848 |             kwargs['class'] = attrs
 849 |             attrs = None
 850 |         if kwargs:
 851 |             if attrs:
 852 |                 attrs = attrs.copy()
 853 |                 attrs.update(kwargs)
 854 |             else:
 855 |                 attrs = kwargs
 856 |         self.attrs = attrs
 857 |         self.text = text
 858 | 
 859 |     def __str__(self):
 860 |         if self.text:
 861 |             return self.text
 862 |         else:
 863 |             return "%s|%s" % (self.name, self.attrs)
 864 | 
 865 |     def searchTag(self, markupName=None, markupAttrs={}):
 866 |         found = None
 867 |         markup = None
 868 |         if isinstance(markupName, Tag):
 869 |             markup = markupName
 870 |             markupAttrs = markup
 871 |         callFunctionWithTagData = callable(self.name) \
 872 |                                 and not isinstance(markupName, Tag)
 873 | 
 874 |         if (not self.name) \
 875 |                or callFunctionWithTagData \
 876 |                or (markup and self._matches(markup, self.name)) \
 877 |                or (not markup and self._matches(markupName, self.name)):
 878 |             if callFunctionWithTagData:
 879 |                 match = self.name(markupName, markupAttrs)
 880 |             else:
 881 |                 match = True
 882 |                 markupAttrMap = None
 883 |                 for attr, matchAgainst in self.attrs.items():
 884 |                     if not markupAttrMap:
 885 |                          if hasattr(markupAttrs, 'get'):
 886 |                             markupAttrMap = markupAttrs
 887 |                          else:
 888 |                             markupAttrMap = {}
 889 |                             for k,v in markupAttrs:
 890 |                                 markupAttrMap[k] = v
 891 |                     attrValue = markupAttrMap.get(attr)
 892 |                     if not self._matches(attrValue, matchAgainst):
 893 |                         match = False
 894 |                         break
 895 |             if match:
 896 |                 if markup:
 897 |                     found = markup
 898 |                 else:
 899 |                     found = markupName
 900 |         return found
 901 | 
 902 |     def search(self, markup):
 903 |         #print 'looking for %s in %s' % (self, markup)
 904 |         found = None
 905 |         # If given a list of items, scan it for a text element that
 906 |         # matches.
 907 |         if isList(markup) and not isinstance(markup, Tag):
 908 |             for element in markup:
 909 |                 if isinstance(element, NavigableString) \
 910 |                        and self.search(element):
 911 |                     found = element
 912 |                     break
 913 |         # If it's a Tag, make sure its name or attributes match.
 914 |         # Don't bother with Tags if we're searching for text.
 915 |         elif isinstance(markup, Tag):
 916 |             if not self.text:
 917 |                 found = self.searchTag(markup)
 918 |         # If it's text, make sure the text matches.
 919 |         elif isinstance(markup, NavigableString) or \
 920 |                  isString(markup):
 921 |             if self._matches(markup, self.text):
 922 |                 found = markup
 923 |         else:
 924 |             raise Exception, "I don't know how to match against a %s" \
 925 |                   % markup.__class__
 926 |         return found
 927 | 
 928 |     def _matches(self, markup, matchAgainst):
 929 |         #print "Matching %s against %s" % (markup, matchAgainst)
 930 |         result = False
 931 |         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
 932 |             result = markup != None
 933 |         elif callable(matchAgainst):
 934 |             result = matchAgainst(markup)
 935 |         else:
 936 |             #Custom match methods take the tag as an argument, but all
 937 |             #other ways of matching match the tag name as a string.
 938 |             if isinstance(markup, Tag):
 939 |                 markup = markup.name
 940 |             if markup is not None and not isString(markup):
 941 |                 markup = unicode(markup)
 942 |             #Now we know that chunk is either a string, or None.
 943 |             if hasattr(matchAgainst, 'match'):
 944 |                 # It's a regexp object.
 945 |                 result = markup and matchAgainst.search(markup)
 946 |             elif (isList(matchAgainst)
 947 |                   and (markup is not None or not isString(matchAgainst))):
 948 |                 result = markup in matchAgainst
 949 |             elif hasattr(matchAgainst, 'items'):
 950 |                 result = markup.has_key(matchAgainst)
 951 |             elif matchAgainst and isString(markup):
 952 |                 if isinstance(markup, unicode):
 953 |                     matchAgainst = unicode(matchAgainst)
 954 |                 else:
 955 |                     matchAgainst = str(matchAgainst)
 956 | 
 957 |             if not result:
 958 |                 result = matchAgainst == markup
 959 |         return result
 960 | 
 961 | class ResultSet(list):
 962 |     """A ResultSet is just a list that keeps track of the SoupStrainer
 963 |     that created it."""
 964 |     def __init__(self, source):
 965 |         list.__init__([])
 966 |         self.source = source
 967 | 
 968 | # Now, some helper functions.
 969 | 
 970 | def isList(l):
 971 |     """Convenience method that works with all 2.x versions of Python
 972 |     to determine whether or not something is listlike."""
 973 |     return ((hasattr(l, '__iter__') and not isString(l))
 974 |             or (type(l) in (types.ListType, types.TupleType)))
 975 | 
 976 | def isString(s):
 977 |     """Convenience method that works with all 2.x versions of Python
 978 |     to determine whether or not something is stringlike."""
 979 |     try:
 980 |         return isinstance(s, unicode) or isinstance(s, basestring)
 981 |     except NameError:
 982 |         return isinstance(s, str)
 983 | 
 984 | def buildTagMap(default, *args):
 985 |     """Turns a list of maps, lists, or scalars into a single map.
 986 |     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
 987 |     NESTING_RESET_TAGS maps out of lists and partial maps."""
 988 |     built = {}
 989 |     for portion in args:
 990 |         if hasattr(portion, 'items'):
 991 |             #It's a map. Merge it.
 992 |             for k,v in portion.items():
 993 |                 built[k] = v
 994 |         elif isList(portion) and not isString(portion):
 995 |             #It's a list. Map each item to the default.
 996 |             for k in portion:
 997 |                 built[k] = default
 998 |         else:
 999 |             #It's a scalar. Map it to the default.
1000 |             built[portion] = default
1001 |     return built
1002 | 
1003 | # Now, the parser classes.
1004 | 
1005 | class HTMLParserBuilder(HTMLParser):
1006 | 
1007 |     def __init__(self, soup):
1008 |         HTMLParser.__init__(self)
1009 |         self.soup = soup
1010 | 
1011 |     # We inherit feed() and reset().
1012 | 
1013 |     def handle_starttag(self, name, attrs):
1014 |         if name == 'meta':
1015 |             self.soup.extractCharsetFromMeta(attrs)
1016 |         else:
1017 |             self.soup.unknown_starttag(name, attrs)
1018 | 
1019 |     def handle_endtag(self, name):
1020 |         self.soup.unknown_endtag(name)
1021 | 
1022 |     def handle_data(self, content):
1023 |         self.soup.handle_data(content)
1024 | 
1025 |     def _toStringSubclass(self, text, subclass):
1026 |         """Adds a certain piece of text to the tree as a NavigableString
1027 |         subclass."""
1028 |         self.soup.endData()
1029 |         self.handle_data(text)
1030 |         self.soup.endData(subclass)
1031 | 
1032 |     def handle_pi(self, text):
1033 |         """Handle a processing instruction as a ProcessingInstruction
1034 |         object, possibly one with a %SOUP-ENCODING% slot into which an
1035 |         encoding will be plugged later."""
1036 |         if text[:3] == "xml":
1037 |             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1038 |         self._toStringSubclass(text, ProcessingInstruction)
1039 | 
1040 |     def handle_comment(self, text):
1041 |         "Handle comments as Comment objects."
1042 |         self._toStringSubclass(text, Comment)
1043 | 
1044 |     def handle_charref(self, ref):
1045 |         "Handle character references as data."
1046 |         if self.soup.convertEntities:
1047 |             data = unichr(int(ref))
1048 |         else:
1049 |             data = '&#%s;' % ref
1050 |         self.handle_data(data)
1051 | 
1052 |     def handle_entityref(self, ref):
1053 |         """Handle entity references as data, possibly converting known
1054 |         HTML and/or XML entity references to the corresponding Unicode
1055 |         characters."""
1056 |         data = None
1057 |         if self.soup.convertHTMLEntities:
1058 |             try:
1059 |                 data = unichr(name2codepoint[ref])
1060 |             except KeyError:
1061 |                 pass
1062 | 
1063 |         if not data and self.soup.convertXMLEntities:
1064 |                 data = self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1065 | 
1066 |         if not data and self.soup.convertHTMLEntities and \
1067 |             not self.soup.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1068 |                 # TODO: We've got a problem here. We're told this is
1069 |                 # an entity reference, but it's not an XML entity
1070 |                 # reference or an HTML entity reference. Nonetheless,
1071 |                 # the logical thing to do is to pass it through as an
1072 |                 # unrecognized entity reference.
1073 |                 #
1074 |                 # Except: when the input is "&carol;" this function
1075 |                 # will be called with input "carol". When the input is
1076 |                 # "AT&T", this function will be called with input
1077 |                 # "T". We have no way of knowing whether a semicolon
1078 |                 # was present originally, so we don't know whether
1079 |                 # this is an unknown entity or just a misplaced
1080 |                 # ampersand.
1081 |                 #
1082 |                 # The more common case is a misplaced ampersand, so I
1083 |                 # escape the ampersand and omit the trailing semicolon.
1084 |                 data = "&amp;%s" % ref
1085 |         if not data:
1086 |             # This case is different from the one above, because we
1087 |             # haven't already gone through a supposedly comprehensive
1088 |             # mapping of entities to Unicode characters. We might not
1089 |             # have gone through any mapping at all. So the chances are
1090 |             # very high that this is a real entity, and not a
1091 |             # misplaced ampersand.
1092 |             data = "&%s;" % ref
1093 |         self.handle_data(data)
1094 | 
1095 |     def handle_decl(self, data):
1096 |         "Handle DOCTYPEs and the like as Declaration objects."
1097 |         self._toStringSubclass(data, Declaration)
1098 | 
1099 |     def parse_declaration(self, i):
1100 |         """Treat a bogus SGML declaration as raw data. Treat a CDATA
1101 |         declaration as a CData object."""
1102 |         j = None
1103 |         if self.rawdata[i:i+9] == '<![CDATA[':
1104 |              k = self.rawdata.find(']]>', i)
1105 |              if k == -1:
1106 |                  k = len(self.rawdata)
1107 |              data = self.rawdata[i+9:k]
1108 |              j = k+3
1109 |              self._toStringSubclass(data, CData)
1110 |         else:
1111 |             try:
1112 |                 j = HTMLParser.parse_declaration(self, i)
1113 |             except HTMLParseError:
1114 |                 toHandle = self.rawdata[i:]
1115 |                 self.handle_data(toHandle)
1116 |                 j = i + len(toHandle)
1117 |         return j
1118 | 
1119 | 
1120 | class BeautifulStoneSoup(Tag):
1121 | 
1122 |     """This class contains the basic parser and search code. It defines
1123 |     a parser that knows nothing about tag behavior except for the
1124 |     following:
1125 | 
1126 |       You can't close a tag without closing all the tags it encloses.
1127 |       That is, "<foo><bar></foo>" actually means
1128 |       "<foo><bar></bar></foo>".
1129 | 
1130 |     [Another possible explanation is "<foo><bar /></foo>", but since
1131 |     this class defines no SELF_CLOSING_TAGS, it will never use that
1132 |     explanation.]
1133 | 
1134 |     This class is useful for parsing XML or made-up markup languages,
1135 |     or when BeautifulSoup makes an assumption counter to what you were
1136 |     expecting."""
1137 | 
1138 |     SELF_CLOSING_TAGS = {}
1139 |     NESTABLE_TAGS = {}
1140 |     RESET_NESTING_TAGS = {}
1141 |     QUOTE_TAGS = {}
1142 |     PRESERVE_WHITESPACE_TAGS = []
1143 | 
1144 |     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1145 |                        lambda x: x.group(1) + ' />'),
1146 |                       (re.compile('<!\s+([^<>]*)>'),
1147 |                        lambda x: '<!' + x.group(1) + '>')
1148 |                       ]
1149 | 
1150 |     ROOT_TAG_NAME = u'[document]'
1151 | 
1152 |     HTML_ENTITIES = "html"
1153 |     XML_ENTITIES = "xml"
1154 |     XHTML_ENTITIES = "xhtml"
1155 |     # TODO: This only exists for backwards-compatibility
1156 |     ALL_ENTITIES = XHTML_ENTITIES
1157 | 
1158 |     # Used when determining whether a text node is all whitespace and
1159 |     # can be replaced with a single space. A text node that contains
1160 |     # fancy Unicode spaces (usually non-breaking) should be left
1161 |     # alone.
1162 |     STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1163 | 
1164 |     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1165 |                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
1166 |                  convertEntities=None, selfClosingTags=None, isHTML=False,
1167 |                  builder=HTMLParserBuilder):
1168 |         """The Soup object is initialized as the 'root tag', and the
1169 |         provided markup (which can be a string or a file-like object)
1170 |         is fed into the underlying parser.
1171 | 
1172 |         HTMLParser will process most bad HTML, and the BeautifulSoup
1173 |         class has some tricks for dealing with some HTML that kills
1174 |         HTMLParser, but Beautiful Soup can nonetheless choke or lose data
1175 |         if your data uses self-closing tags or declarations
1176 |         incorrectly.
1177 | 
1178 |         By default, Beautiful Soup uses regexes to sanitize input,
1179 |         avoiding the vast majority of these problems. If the problems
1180 |         don't apply to you, pass in False for markupMassage, and
1181 |         you'll get better performance.
1182 | 
1183 |         The default parser massage techniques fix the two most common
1184 |         instances of invalid HTML that choke HTMLParser:
1185 | 
1186 |          <br/> (No space between name of closing tag and tag close)
1187 |          <! --Comment--> (Extraneous whitespace in declaration)
1188 | 
1189 |         You can pass in a custom list of (RE object, replace method)
1190 |         tuples to get Beautiful Soup to scrub your input the way you
1191 |         want."""
1192 | 
1193 |         self.parseOnlyThese = parseOnlyThese
1194 |         self.fromEncoding = fromEncoding
1195 |         self.smartQuotesTo = smartQuotesTo
1196 |         self.convertEntities = convertEntities
1197 |         # Set the rules for how we'll deal with the entities we
1198 |         # encounter
1199 |         if self.convertEntities:
1200 |             # It doesn't make sense to convert encoded characters to
1201 |             # entities even while you're converting entities to Unicode.
1202 |             # Just convert it all to Unicode.
1203 |             self.smartQuotesTo = None
1204 |             if convertEntities == self.HTML_ENTITIES:
1205 |                 self.convertXMLEntities = False
1206 |                 self.convertHTMLEntities = True
1207 |                 self.escapeUnrecognizedEntities = True
1208 |             elif convertEntities == self.XHTML_ENTITIES:
1209 |                 self.convertXMLEntities = True
1210 |                 self.convertHTMLEntities = True
1211 |                 self.escapeUnrecognizedEntities = False
1212 |             elif convertEntities == self.XML_ENTITIES:
1213 |                 self.convertXMLEntities = True
1214 |                 self.convertHTMLEntities = False
1215 |                 self.escapeUnrecognizedEntities = False
1216 |         else:
1217 |             self.convertXMLEntities = False
1218 |             self.convertHTMLEntities = False
1219 |             self.escapeUnrecognizedEntities = False
1220 | 
1221 |         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1222 |         self.builder = builder(self)
1223 |         self.reset()
1224 | 
1225 |         if hasattr(markup, 'read'):        # It's a file-type object.
1226 |             markup = markup.read()
1227 |         self.markup = markup
1228 |         self.markupMassage = markupMassage
1229 |         try:
1230 |             self._feed(isHTML=isHTML)
1231 |         except StopParsing:
1232 |             pass
1233 |         self.markup = None                 # The markup can now be GCed.
1234 |         self.builder = None                # So can the builder.
1235 | 
1236 |     def _feed(self, inDocumentEncoding=None, isHTML=False):
1237 |         # Convert the document to Unicode.
1238 |         markup = self.markup
1239 |         if isinstance(markup, unicode):
1240 |             if not hasattr(self, 'originalEncoding'):
1241 |                 self.originalEncoding = None
1242 |         else:
1243 |             dammit = UnicodeDammit\
1244 |                      (markup, [self.fromEncoding, inDocumentEncoding],
1245 |                       smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
1246 |             markup = dammit.unicode
1247 |             self.originalEncoding = dammit.originalEncoding
1248 |             self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
1249 |         if markup:
1250 |             if self.markupMassage:
1251 |                 if not isList(self.markupMassage):
1252 |                     self.markupMassage = self.MARKUP_MASSAGE
1253 |                 for fix, m in self.markupMassage:
1254 |                     markup = fix.sub(m, markup)
1255 |                 # TODO: We get rid of markupMassage so that the
1256 |                 # soup object can be deepcopied later on. Some
1257 |                 # Python installations can't copy regexes. If anyone
1258 |                 # was relying on the existence of markupMassage, this
1259 |                 # might cause problems.
1260 |                 del(self.markupMassage)
1261 |         self.builder.reset()
1262 | 
1263 |         self.builder.feed(markup)
1264 |         # Close out any unfinished strings and close all the open tags.
1265 |         self.endData()
1266 |         while self.currentTag.name != self.ROOT_TAG_NAME:
1267 |             self.popTag()
1268 | 
1269 |     def isSelfClosingTag(self, name):
1270 |         """Returns true iff the given string is the name of a
1271 |         self-closing tag according to this parser."""
1272 |         return self.SELF_CLOSING_TAGS.has_key(name) \
1273 |                or self.instanceSelfClosingTags.has_key(name)
1274 | 
1275 |     def reset(self):
1276 |         Tag.__init__(self, self, self.ROOT_TAG_NAME)
1277 |         self.hidden = 1
1278 |         self.builder.reset()
1279 |         self.currentData = []
1280 |         self.currentTag = None
1281 |         self.tagStack = []
1282 |         self.quoteStack = []
1283 |         self.pushTag(self)
1284 | 
1285 |     def popTag(self):
1286 |         tag = self.tagStack.pop()
1287 |         # Tags with just one string-owning child get the child as a
1288 |         # 'string' property, so that soup.tag.string is shorthand for
1289 |         # soup.tag.contents[0]
1290 |         if len(self.currentTag.contents) == 1 and \
1291 |            isinstance(self.currentTag.contents[0], NavigableString):
1292 |             self.currentTag.string = self.currentTag.contents[0]
1293 | 
1294 |         #print "Pop", tag.name
1295 |         if self.tagStack:
1296 |             self.currentTag = self.tagStack[-1]
1297 |         return self.currentTag
1298 | 
1299 |     def pushTag(self, tag):
1300 |         #print "Push", tag.name
1301 |         if self.currentTag:
1302 |             self.currentTag.contents.append(tag)
1303 |         self.tagStack.append(tag)
1304 |         self.currentTag = self.tagStack[-1]
1305 | 
1306 |     def endData(self, containerClass=NavigableString):
1307 |         if self.currentData:
1308 |             currentData = u''.join(self.currentData)
1309 |             if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
1310 |                 not set([tag.name for tag in self.tagStack]).intersection(
1311 |                     self.PRESERVE_WHITESPACE_TAGS)):
1312 |                 if '\n' in currentData:
1313 |                     currentData = '\n'
1314 |                 else:
1315 |                     currentData = ' '
1316 |             self.currentData = []
1317 |             if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1318 |                    (not self.parseOnlyThese.text or \
1319 |                     not self.parseOnlyThese.search(currentData)):
1320 |                 return
1321 |             o = containerClass(currentData)
1322 |             o.setup(self.currentTag, self.previous)
1323 |             if self.previous:
1324 |                 self.previous.next = o
1325 |             self.previous = o
1326 |             self.currentTag.contents.append(o)
1327 | 
1328 | 
1329 |     def _popToTag(self, name, inclusivePop=True):
1330 |         """Pops the tag stack up to and including the most recent
1331 |         instance of the given tag. If inclusivePop is false, pops the tag
1332 |         stack up to but *not* including the most recent instqance of
1333 |         the given tag."""
1334 |         #print "Popping to %s" % name
1335 |         if name == self.ROOT_TAG_NAME:
1336 |             return
1337 | 
1338 |         numPops = 0
1339 |         mostRecentTag = None
1340 |         for i in range(len(self.tagStack)-1, 0, -1):
1341 |             if name == self.tagStack[i].name:
1342 |                 numPops = len(self.tagStack)-i
1343 |                 break
1344 |         if not inclusivePop:
1345 |             numPops = numPops - 1
1346 | 
1347 |         for i in range(0, numPops):
1348 |             mostRecentTag = self.popTag()
1349 |         return mostRecentTag
1350 | 
1351 |     def _smartPop(self, name):
1352 | 
1353 |         """We need to pop up to the previous tag of this type, unless
1354 |         one of this tag's nesting reset triggers comes between this
1355 |         tag and the previous tag of this type, OR unless this tag is a
1356 |         generic nesting trigger and another generic nesting trigger
1357 |         comes between this tag and the previous tag of this type.
1358 | 
1359 |         Examples:
1360 |          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1361 |          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1362 |          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1363 | 
1364 |          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1365 |          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1366 |          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1367 |         """
1368 | 
1369 |         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1370 |         isNestable = nestingResetTriggers != None
1371 |         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1372 |         popTo = None
1373 |         inclusive = True
1374 |         for i in range(len(self.tagStack)-1, 0, -1):
1375 |             p = self.tagStack[i]
1376 |             if (not p or p.name == name) and not isNestable:
1377 |                 #Non-nestable tags get popped to the top or to their
1378 |                 #last occurance.
1379 |                 popTo = name
1380 |                 break
1381 |             if (nestingResetTriggers != None
1382 |                 and p.name in nestingResetTriggers) \
1383 |                 or (nestingResetTriggers == None and isResetNesting
1384 |                     and self.RESET_NESTING_TAGS.has_key(p.name)):
1385 | 
1386 |                 #If we encounter one of the nesting reset triggers
1387 |                 #peculiar to this tag, or we encounter another tag
1388 |                 #that causes nesting to reset, pop up to but not
1389 |                 #including that tag.
1390 |                 popTo = p.name
1391 |                 inclusive = False
1392 |                 break
1393 |             p = p.parent
1394 |         if popTo:
1395 |             self._popToTag(popTo, inclusive)
1396 | 
1397 |     def unknown_starttag(self, name, attrs, selfClosing=0):
1398 |         #print "Start tag %s: %s" % (name, attrs)
1399 |         if self.quoteStack:
1400 |             #This is not a real tag.
1401 |             #print "<%s> is not real!" % name
1402 |             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1403 |             self.handle_data('<%s%s>' % (name, attrs))
1404 |             return
1405 |         self.endData()
1406 | 
1407 |         if not self.isSelfClosingTag(name) and not selfClosing:
1408 |             self._smartPop(name)
1409 | 
1410 |         if self.parseOnlyThese and len(self.tagStack) <= 1 \
1411 |                and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1412 |             return
1413 | 
1414 |         tag = Tag(self, name, attrs, self.currentTag, self.previous)
1415 |         if self.previous:
1416 |             self.previous.next = tag
1417 |         self.previous = tag
1418 |         self.pushTag(tag)
1419 |         if selfClosing or self.isSelfClosingTag(name):
1420 |             self.popTag()
1421 |         if name in self.QUOTE_TAGS:
1422 |             #print "Beginning quote (%s)" % name
1423 |             self.quoteStack.append(name)
1424 |             self.literal = 1
1425 |         return tag
1426 | 
1427 |     def unknown_endtag(self, name):
1428 |         #print "End tag %s" % name
1429 |         if self.quoteStack and self.quoteStack[-1] != name:
1430 |             #This is not a real end tag.
1431 |             #print "</%s> is not real!" % name
1432 |             self.handle_data('</%s>' % name)
1433 |             return
1434 |         self.endData()
1435 |         self._popToTag(name)
1436 |         if self.quoteStack and self.quoteStack[-1] == name:
1437 |             self.quoteStack.pop()
1438 |             self.literal = (len(self.quoteStack) > 0)
1439 | 
1440 |     def handle_data(self, data):
1441 |         self.currentData.append(data)
1442 | 
1443 |     def extractCharsetFromMeta(self, attrs):
1444 |         self.unknown_starttag('meta', attrs)
1445 | 
1446 | 
1447 | class BeautifulSoup(BeautifulStoneSoup):
1448 | 
1449 |     """This parser knows the following facts about HTML:
1450 | 
1451 |     * Some tags have no closing tag and should be interpreted as being
1452 |       closed as soon as they are encountered.
1453 | 
1454 |     * The text inside some tags (ie. 'script') may contain tags which
1455 |       are not really part of the document and which should be parsed
1456 |       as text, not tags. If you want to parse the text as tags, you can
1457 |       always fetch it and parse it explicitly.
1458 | 
1459 |     * Tag nesting rules:
1460 | 
1461 |       Most tags can't be nested at all. For instance, the occurance of
1462 |       a <p> tag should implicitly close the previous <p> tag.
1463 | 
1464 |        <p>Para1<p>Para2
1465 |         should be transformed into:
1466 |        <p>Para1</p><p>Para2
1467 | 
1468 |       Some tags can be nested arbitrarily. For instance, the occurance
1469 |       of a <blockquote> tag should _not_ implicitly close the previous
1470 |       <blockquote> tag.
1471 | 
1472 |        Alice said: <blockquote>Bob said: <blockquote>Blah
1473 |         should NOT be transformed into:
1474 |        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1475 | 
1476 |       Some tags can be nested, but the nesting is reset by the
1477 |       interposition of other tags. For instance, a <tr> tag should
1478 |       implicitly close the previous <tr> tag within the same <table>,
1479 |       but not close a <tr> tag in another table.
1480 | 
1481 |        <table><tr>Blah<tr>Blah
1482 |         should be transformed into:
1483 |        <table><tr>Blah</tr><tr>Blah
1484 |         but,
1485 |        <tr>Blah<table><tr>Blah
1486 |         should NOT be transformed into
1487 |        <tr>Blah<table></tr><tr>Blah
1488 | 
1489 |     Differing assumptions about tag nesting rules are a major source
1490 |     of problems with the BeautifulSoup class. If BeautifulSoup is not
1491 |     treating as nestable a tag your page author treats as nestable,
1492 |     try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1493 |     BeautifulStoneSoup before writing your own subclass."""
1494 | 
1495 |     def __init__(self, *args, **kwargs):
1496 |         if not kwargs.has_key('smartQuotesTo'):
1497 |             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1498 |         kwargs['isHTML'] = True
1499 |         BeautifulStoneSoup.__init__(self, *args, **kwargs)
1500 | 
1501 |     SELF_CLOSING_TAGS = buildTagMap(None,
1502 |                                     ['br' , 'hr', 'input', 'img', 'meta',
1503 |                                     'spacer', 'link', 'frame', 'base'])
1504 | 
1505 |     PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
1506 | 
1507 |     QUOTE_TAGS = {'script' : None, 'textarea' : None}
1508 | 
1509 |     #According to the HTML standard, each of these inline tags can
1510 |     #contain another tag of the same type. Furthermore, it's common
1511 |     #to actually use these tags this way.
1512 |     NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1513 |                             'center']
1514 | 
1515 |     #According to the HTML standard, these block tags can contain
1516 |     #another tag of the same type. Furthermore, it's common
1517 |     #to actually use these tags this way.
1518 |     NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1519 | 
1520 |     #Lists can contain other lists, but there are restrictions.
1521 |     NESTABLE_LIST_TAGS = { 'ol' : [],
1522 |                            'ul' : [],
1523 |                            'li' : ['ul', 'ol'],
1524 |                            'dl' : [],
1525 |                            'dd' : ['dl'],
1526 |                            'dt' : ['dl'] }
1527 | 
1528 |     #Tables can contain other tables, but there are restrictions.
1529 |     NESTABLE_TABLE_TAGS = {'table' : [],
1530 |                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1531 |                            'td' : ['tr'],
1532 |                            'th' : ['tr'],
1533 |                            'thead' : ['table'],
1534 |                            'tbody' : ['table'],
1535 |                            'tfoot' : ['table'],
1536 |                            }
1537 | 
1538 |     NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1539 | 
1540 |     #If one of these tags is encountered, all tags up to the next tag of
1541 |     #this type are popped.
1542 |     RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1543 |                                      NON_NESTABLE_BLOCK_TAGS,
1544 |                                      NESTABLE_LIST_TAGS,
1545 |                                      NESTABLE_TABLE_TAGS)
1546 | 
1547 |     NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1548 |                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1549 | 
1550 |     # Used to detect the charset in a META tag; see start_meta
1551 |     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
1552 | 
1553 |     def extractCharsetFromMeta(self, attrs):
1554 |         """Beautiful Soup can detect a charset included in a META tag,
1555 |         try to convert the document to that charset, and re-parse the
1556 |         document from the beginning."""
1557 |         httpEquiv = None
1558 |         contentType = None
1559 |         contentTypeIndex = None
1560 |         tagNeedsEncodingSubstitution = False
1561 | 
1562 |         for i in range(0, len(attrs)):
1563 |             key, value = attrs[i]
1564 |             key = key.lower()
1565 |             if key == 'http-equiv':
1566 |                 httpEquiv = value
1567 |             elif key == 'content':
1568 |                 contentType = value
1569 |                 contentTypeIndex = i
1570 | 
1571 |         if httpEquiv and contentType: # It's an interesting meta tag.
1572 |             match = self.CHARSET_RE.search(contentType)
1573 |             if match:
1574 |                 if (self.declaredHTMLEncoding is not None or
1575 |                     self.originalEncoding == self.fromEncoding):
1576 |                     # An HTML encoding was sniffed while converting
1577 |                     # the document to Unicode, or an HTML encoding was
1578 |                     # sniffed during a previous pass through the
1579 |                     # document, or an encoding was specified
1580 |                     # explicitly and it worked. Rewrite the meta tag.
1581 |                     def rewrite(match):
1582 |                         return match.group(1) + "%SOUP-ENCODING%"
1583 |                     newAttr = self.CHARSET_RE.sub(rewrite, contentType)
1584 |                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1585 |                                                newAttr)
1586 |                     tagNeedsEncodingSubstitution = True
1587 |                 else:
1588 |                     # This is our first pass through the document.
1589 |                     # Go through it again with the encoding information.
1590 |                     newCharset = match.group(3)
1591 |                     if newCharset and newCharset != self.originalEncoding:
1592 |                         self.declaredHTMLEncoding = newCharset
1593 |                         self._feed(self.declaredHTMLEncoding)
1594 |                         raise StopParsing
1595 |                     pass
1596 |         tag = self.unknown_starttag("meta", attrs)
1597 |         if tag and tagNeedsEncodingSubstitution:
1598 |             tag.containsSubstitutions = True
1599 | 
1600 | 
1601 | class StopParsing(Exception):
1602 |     pass
1603 | 
1604 | class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1605 | 
1606 |     """The BeautifulSoup class is oriented towards skipping over
1607 |     common HTML errors like unclosed tags. However, sometimes it makes
1608 |     errors of its own. For instance, consider this fragment:
1609 | 
1610 |      <b>Foo<b>Bar</b></b>
1611 | 
1612 |     This is perfectly valid (if bizarre) HTML. However, the
1613 |     BeautifulSoup class will implicitly close the first b tag when it
1614 |     encounters the second 'b'. It will think the author wrote
1615 |     "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1616 |     there's no real-world reason to bold something that's already
1617 |     bold. When it encounters '</b></b>' it will close two more 'b'
1618 |     tags, for a grand total of three tags closed instead of two. This
1619 |     can throw off the rest of your document structure. The same is
1620 |     true of a number of other tags, listed below.
1621 | 
1622 |     It's much more common for someone to forget to close a 'b' tag
1623 |     than to actually use nested 'b' tags, and the BeautifulSoup class
1624 |     handles the common case. This class handles the not-co-common
1625 |     case: where you can't believe someone wrote what they did, but
1626 |     it's valid HTML and BeautifulSoup screwed up by assuming it
1627 |     wouldn't be."""
1628 | 
1629 |     I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1630 |      ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1631 |       'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1632 |       'big']
1633 | 
1634 |     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1635 | 
1636 |     NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1637 |                                 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1638 |                                 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1639 | 
1640 | class MinimalSoup(BeautifulSoup):
1641 |     """The MinimalSoup class is for parsing HTML that contains
1642 |     pathologically bad markup. It makes no assumptions about tag
1643 |     nesting, but it does know which tags are self-closing, that
1644 |     <script> tags contain Javascript and should not be parsed, that
1645 |     META tags may contain encoding information, and so on.
1646 | 
1647 |     This also makes it better for subclassing than BeautifulStoneSoup
1648 |     or BeautifulSoup."""
1649 | 
1650 |     RESET_NESTING_TAGS = buildTagMap('noscript')
1651 |     NESTABLE_TAGS = {}
1652 | 
1653 | class BeautifulSOAP(BeautifulStoneSoup):
1654 |     """This class will push a tag with only a single string child into
1655 |     the tag's parent as an attribute. The attribute's name is the tag
1656 |     name, and the value is the string child. An example should give
1657 |     the flavor of the change:
1658 | 
1659 |     <foo><bar>baz</bar></foo>
1660 |      =>
1661 |     <foo bar="baz"><bar>baz</bar></foo>
1662 | 
1663 |     You can then access fooTag['bar'] instead of fooTag.barTag.string.
1664 | 
1665 |     This is, of course, useful for scraping structures that tend to
1666 |     use subelements instead of attributes, such as SOAP messages. Note
1667 |     that it modifies its input, so don't print the modified version
1668 |     out.
1669 | 
1670 |     I'm not sure how many people really want to use this class; let me
1671 |     know if you do. Mainly I like the name."""
1672 | 
1673 |     def popTag(self):
1674 |         if len(self.tagStack) > 1:
1675 |             tag = self.tagStack[-1]
1676 |             parent = self.tagStack[-2]
1677 |             parent._getAttrMap()
1678 |             if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1679 |                 isinstance(tag.contents[0], NavigableString) and
1680 |                 not parent.attrMap.has_key(tag.name)):
1681 |                 parent[tag.name] = tag.contents[0]
1682 |         BeautifulStoneSoup.popTag(self)
1683 | 
1684 | #Enterprise class names! It has come to our attention that some people
1685 | #think the names of the Beautiful Soup parser classes are too silly
1686 | #and "unprofessional" for use in enterprise screen-scraping. We feel
1687 | #your pain! For such-minded folk, the Beautiful Soup Consortium And
1688 | #All-Night Kosher Bakery recommends renaming this file to
1689 | #"RobustParser.py" (or, in cases of extreme enterprisiness,
1690 | #"RobustParserBeanInterface.class") and using the following
1691 | #enterprise-friendly class aliases:
1692 | class RobustXMLParser(BeautifulStoneSoup):
1693 |     pass
1694 | class RobustHTMLParser(BeautifulSoup):
1695 |     pass
1696 | class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1697 |     pass
1698 | class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1699 |     pass
1700 | class SimplifyingSOAPParser(BeautifulSOAP):
1701 |     pass
1702 | 
1703 | ######################################################
1704 | #
1705 | # Bonus library: Unicode, Dammit
1706 | #
1707 | # This class forces XML data into a standard format (usually to UTF-8
1708 | # or Unicode).  It is heavily based on code from Mark Pilgrim's
1709 | # Universal Feed Parser. It does not rewrite the XML or HTML to
1710 | # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1711 | # (XML) and BeautifulSoup.start_meta (HTML).
1712 | 
1713 | # Autodetects character encodings.
1714 | # Download from http://chardet.feedparser.org/
1715 | try:
1716 |     import chardet
1717 | #    import chardet.constants
1718 | #    chardet.constants._debug = 1
1719 | except ImportError:
1720 |     chardet = None
1721 | 
1722 | # cjkcodecs and iconv_codec make Python know about more character encodings.
1723 | # Both are available from http://cjkpython.i18n.org/
1724 | # They're built in if you use Python 2.4.
1725 | try:
1726 |     import cjkcodecs.aliases
1727 | except ImportError:
1728 |     pass
1729 | try:
1730 |     import iconv_codec
1731 | except ImportError:
1732 |     pass
1733 | 
1734 | class UnicodeDammit:
1735 |     """A class for detecting the encoding of a *ML document and
1736 |     converting it to a Unicode string. If the source encoding is
1737 |     windows-1252, can replace MS smart quotes with their HTML or XML
1738 |     equivalents."""
1739 | 
1740 |     # This dictionary maps commonly seen values for "charset" in HTML
1741 |     # meta tags to the corresponding Python codec names. It only covers
1742 |     # values that aren't in Python's aliases and can't be determined
1743 |     # by the heuristics in find_codec.
1744 |     CHARSET_ALIASES = { "macintosh" : "mac-roman",
1745 |                         "x-sjis" : "shift-jis" }
1746 | 
1747 |     def __init__(self, markup, overrideEncodings=[],
1748 |                  smartQuotesTo='xml', isHTML=False):
1749 |         self.declaredHTMLEncoding = None
1750 |         self.markup, documentEncoding, sniffedEncoding = \
1751 |                      self._detectEncoding(markup, isHTML)
1752 |         self.smartQuotesTo = smartQuotesTo
1753 |         self.triedEncodings = []
1754 |         if markup == '' or isinstance(markup, unicode):
1755 |             self.originalEncoding = None
1756 |             self.unicode = unicode(markup)
1757 |             return
1758 | 
1759 |         u = None
1760 |         for proposedEncoding in overrideEncodings:
1761 |             u = self._convertFrom(proposedEncoding)
1762 |             if u: break
1763 |         if not u:
1764 |             for proposedEncoding in (documentEncoding, sniffedEncoding):
1765 |                 u = self._convertFrom(proposedEncoding)
1766 |                 if u: break
1767 | 
1768 |         # If no luck and we have auto-detection library, try that:
1769 |         if not u and chardet and not isinstance(self.markup, unicode):
1770 |             u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1771 | 
1772 |         # As a last resort, try utf-8 and windows-1252:
1773 |         if not u:
1774 |             for proposed_encoding in ("utf-8", "windows-1252"):
1775 |                 u = self._convertFrom(proposed_encoding)
1776 |                 if u: break
1777 | 
1778 |         self.unicode = u
1779 |         if not u: self.originalEncoding = None
1780 | 
1781 |     def _subMSChar(self, match):
1782 |         """Changes a MS smart quote character to an XML or HTML
1783 |         entity."""
1784 |         orig = match.group(1)
1785 |         sub = self.MS_CHARS.get(orig)
1786 |         if type(sub) == types.TupleType:
1787 |             if self.smartQuotesTo == 'xml':
1788 |                 sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
1789 |             else:
1790 |                 sub = '&'.encode() + sub[0].encode() + ';'.encode()
1791 |         else:
1792 |             sub = sub.encode()
1793 |         return sub
1794 | 
1795 |     def _convertFrom(self, proposed):
1796 |         proposed = self.find_codec(proposed)
1797 |         if not proposed or proposed in self.triedEncodings:
1798 |             return None
1799 |         self.triedEncodings.append(proposed)
1800 |         markup = self.markup
1801 | 
1802 |         # Convert smart quotes to HTML if coming from an encoding
1803 |         # that might have them.
1804 |         if self.smartQuotesTo and proposed.lower() in("windows-1252",
1805 |                                                       "iso-8859-1",
1806 |                                                       "iso-8859-2"):
1807 |             smart_quotes_re = "([\x80-\x9f])"
1808 |             smart_quotes_compiled = re.compile(smart_quotes_re)
1809 |             markup = smart_quotes_compiled.sub(self._subMSChar, markup)
1810 | 
1811 |         try:
1812 |             # print "Trying to convert document to %s" % proposed
1813 |             u = self._toUnicode(markup, proposed)
1814 |             self.markup = u
1815 |             self.originalEncoding = proposed
1816 |         except Exception, e:
1817 |             # print "That didn't work!"
1818 |             # print e
1819 |             return None
1820 |         #print "Correct encoding: %s" % proposed
1821 |         return self.markup
1822 | 
1823 |     def _toUnicode(self, data, encoding):
1824 |         '''Given a string and its encoding, decodes the string into Unicode.
1825 |         %encoding is a string recognized by encodings.aliases'''
1826 | 
1827 |         # strip Byte Order Mark (if present)
1828 |         if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1829 |                and (data[2:4] != '\x00\x00'):
1830 |             encoding = 'utf-16be'
1831 |             data = data[2:]
1832 |         elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1833 |                  and (data[2:4] != '\x00\x00'):
1834 |             encoding = 'utf-16le'
1835 |             data = data[2:]
1836 |         elif data[:3] == '\xef\xbb\xbf':
1837 |             encoding = 'utf-8'
1838 |             data = data[3:]
1839 |         elif data[:4] == '\x00\x00\xfe\xff':
1840 |             encoding = 'utf-32be'
1841 |             data = data[4:]
1842 |         elif data[:4] == '\xff\xfe\x00\x00':
1843 |             encoding = 'utf-32le'
1844 |             data = data[4:]
1845 |         newdata = unicode(data, encoding)
1846 |         return newdata
1847 | 
1848 |     def _detectEncoding(self, xml_data, isHTML=False):
1849 |         """Given a document, tries to detect its XML encoding."""
1850 |         xml_encoding = sniffed_xml_encoding = None
1851 |         try:
1852 |             if xml_data[:4] == '\x4c\x6f\xa7\x94':
1853 |                 # EBCDIC
1854 |                 xml_data = self._ebcdic_to_ascii(xml_data)
1855 |             elif xml_data[:4] == '\x00\x3c\x00\x3f':
1856 |                 # UTF-16BE
1857 |                 sniffed_xml_encoding = 'utf-16be'
1858 |                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1859 |             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1860 |                      and (xml_data[2:4] != '\x00\x00'):
1861 |                 # UTF-16BE with BOM
1862 |                 sniffed_xml_encoding = 'utf-16be'
1863 |                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1864 |             elif xml_data[:4] == '\x3c\x00\x3f\x00':
1865 |                 # UTF-16LE
1866 |                 sniffed_xml_encoding = 'utf-16le'
1867 |                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1868 |             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1869 |                      (xml_data[2:4] != '\x00\x00'):
1870 |                 # UTF-16LE with BOM
1871 |                 sniffed_xml_encoding = 'utf-16le'
1872 |                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1873 |             elif xml_data[:4] == '\x00\x00\x00\x3c':
1874 |                 # UTF-32BE
1875 |                 sniffed_xml_encoding = 'utf-32be'
1876 |                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1877 |             elif xml_data[:4] == '\x3c\x00\x00\x00':
1878 |                 # UTF-32LE
1879 |                 sniffed_xml_encoding = 'utf-32le'
1880 |                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1881 |             elif xml_data[:4] == '\x00\x00\xfe\xff':
1882 |                 # UTF-32BE with BOM
1883 |                 sniffed_xml_encoding = 'utf-32be'
1884 |                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1885 |             elif xml_data[:4] == '\xff\xfe\x00\x00':
1886 |                 # UTF-32LE with BOM
1887 |                 sniffed_xml_encoding = 'utf-32le'
1888 |                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1889 |             elif xml_data[:3] == '\xef\xbb\xbf':
1890 |                 # UTF-8 with BOM
1891 |                 sniffed_xml_encoding = 'utf-8'
1892 |                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1893 |             else:
1894 |                 sniffed_xml_encoding = 'ascii'
1895 |                 pass
1896 |         except:
1897 |             xml_encoding_match = None
1898 |         xml_encoding_re = '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode()
1899 |         xml_encoding_match = re.compile(xml_encoding_re).match(xml_data)
1900 |         if not xml_encoding_match and isHTML:
1901 |             meta_re = '<\s*meta[^>]+charset=([^>]*?)[;\'">]'.encode()
1902 |             regexp = re.compile(meta_re, re.I)
1903 |             xml_encoding_match = regexp.search(xml_data)
1904 |         if xml_encoding_match is not None:
1905 |             xml_encoding = xml_encoding_match.groups()[0].decode(
1906 |                 'ascii').lower()
1907 |             if isHTML:
1908 |                 self.declaredHTMLEncoding = xml_encoding
1909 |             if sniffed_xml_encoding and \
1910 |                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1911 |                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1912 |                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
1913 |                                  'utf16', 'u16')):
1914 |                 xml_encoding = sniffed_xml_encoding
1915 |         return xml_data, xml_encoding, sniffed_xml_encoding
1916 | 
1917 | 
1918 |     def find_codec(self, charset):
1919 |         return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1920 |                or (charset and self._codec(charset.replace("-", ""))) \
1921 |                or (charset and self._codec(charset.replace("-", "_"))) \
1922 |                or charset
1923 | 
1924 |     def _codec(self, charset):
1925 |         if not charset: return charset
1926 |         codec = None
1927 |         try:
1928 |             codecs.lookup(charset)
1929 |             codec = charset
1930 |         except (LookupError, ValueError):
1931 |             pass
1932 |         return codec
1933 | 
1934 |     EBCDIC_TO_ASCII_MAP = None
1935 |     def _ebcdic_to_ascii(self, s):
1936 |         c = self.__class__
1937 |         if not c.EBCDIC_TO_ASCII_MAP:
1938 |             emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1939 |                     16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1940 |                     128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1941 |                     144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1942 |                     32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1943 |                     38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1944 |                     45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1945 |                     186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1946 |                     195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1947 |                     201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1948 |                     206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1949 |                     211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1950 |                     225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1951 |                     73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1952 |                     82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1953 |                     90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1954 |                     250,251,252,253,254,255)
1955 |             import string
1956 |             c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1957 |             ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1958 |         return s.translate(c.EBCDIC_TO_ASCII_MAP)
1959 | 
1960 |     MS_CHARS = { '\x80' : ('euro', '20AC'),
1961 |                  '\x81' : ' ',
1962 |                  '\x82' : ('sbquo', '201A'),
1963 |                  '\x83' : ('fnof', '192'),
1964 |                  '\x84' : ('bdquo', '201E'),
1965 |                  '\x85' : ('hellip', '2026'),
1966 |                  '\x86' : ('dagger', '2020'),
1967 |                  '\x87' : ('Dagger', '2021'),
1968 |                  '\x88' : ('circ', '2C6'),
1969 |                  '\x89' : ('permil', '2030'),
1970 |                  '\x8A' : ('Scaron', '160'),
1971 |                  '\x8B' : ('lsaquo', '2039'),
1972 |                  '\x8C' : ('OElig', '152'),
1973 |                  '\x8D' : '?',
1974 |                  '\x8E' : ('#x17D', '17D'),
1975 |                  '\x8F' : '?',
1976 |                  '\x90' : '?',
1977 |                  '\x91' : ('lsquo', '2018'),
1978 |                  '\x92' : ('rsquo', '2019'),
1979 |                  '\x93' : ('ldquo', '201C'),
1980 |                  '\x94' : ('rdquo', '201D'),
1981 |                  '\x95' : ('bull', '2022'),
1982 |                  '\x96' : ('ndash', '2013'),
1983 |                  '\x97' : ('mdash', '2014'),
1984 |                  '\x98' : ('tilde', '2DC'),
1985 |                  '\x99' : ('trade', '2122'),
1986 |                  '\x9a' : ('scaron', '161'),
1987 |                  '\x9b' : ('rsaquo', '203A'),
1988 |                  '\x9c' : ('oelig', '153'),
1989 |                  '\x9d' : '?',
1990 |                  '\x9e' : ('#x17E', '17E'),
1991 |                  '\x9f' : ('Yuml', ''),}
1992 | 
1993 | #######################################################################
1994 | 
1995 | 
1996 | #By default, act as an HTML pretty-printer.
1997 | if __name__ == '__main__':
1998 |     import sys
1999 |     soup = BeautifulSoup(sys.stdin)
2000 |     print soup.prettify()
2001 | 


--------------------------------------------------------------------------------
/readability/__init__.py:
--------------------------------------------------------------------------------
1 | from readability import Document, main
2 | from page_parser import ascii, Unparseable
3 | 


--------------------------------------------------------------------------------
/readability/page_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from url_helpers import absolute_url
  3 | from BeautifulSoup import BeautifulSoup, HTMLParseError, UnicodeDammit
  4 | from logging import error
  5 | 
  6 | __all__ = [
  7 | 	'Unparseable',
  8 | 	'parse',
  9 | 	'get_title',
 10 | 	'get_body',
 11 | 	'ascii']
 12 | 
 13 | def debug(s): pass
 14 | 
 15 | class Unparseable(ValueError):
 16 | 	pass
 17 | 
 18 | def parse(raw_content, base_href=None, notify=lambda x: None):
 19 | 	for parse_method in _parse_methods():
 20 | 		try:
 21 | 			return parse_method(raw_content, base_href)
 22 | 		except HTMLParseError, e:
 23 | 			notify("parsing (%s) failed: %s" % (parse_method.__name__, e))
 24 | 			continue
 25 | 	raise Unparseable()
 26 | 
 27 | def get_title(soup):
 28 | 	title = unicode(getattr(soup.title, 'string', ''))
 29 | 	if not title:
 30 | 		return None
 31 | 	return normalize_spaces(title)
 32 | 
 33 | 
 34 | def get_body(soup):
 35 | 	[ elem.extract() for elem in soup.findAll(['script', 'link', 'style']) ]
 36 | 	raw_html = unicode(soup.body or soup)
 37 | 	cleaned = clean_attributes(raw_html)
 38 | 	try:
 39 | 		BeautifulSoup(cleaned)
 40 | 		return cleaned
 41 | 	except HTMLParseError:
 42 | 		error("cleansing broke html content: %s\n---------\n%s" % (raw_html,cleaned))
 43 | 		return raw_html
 44 | 
 45 | def ascii(s):
 46 | 	return s.decode('ascii', 'ignore')
 47 | 
 48 | class Replacement(object):
 49 | 	def __init__(self, desc, regex, replacement):
 50 | 		self.desc = desc
 51 | 		self.regex = regex
 52 | 		self.replacement = replacement
 53 | 	
 54 | 	def apply(self, content):
 55 | #		# useful for debugging:
 56 | #		try:
 57 | #			print self. desc + ':' + str(self.regex.findall(content))
 58 | #		except RuntimeError: pass
 59 | 		return self.regex.sub(self.replacement, content)
 60 | 
 61 | def beautiful_soup(content, base_href):
 62 | 	soup = BeautifulSoup(content)
 63 | 	if base_href:
 64 | 		_fix_references(soup, base_href)
 65 | 	return soup
 66 | 
 67 | 
 68 | def _make_absolute_links(soup, base_href):
 69 | 	for link in soup.findAll('a', attrs={'href':True}):
 70 | 		link['href'] = absolute_url(link['href'], base_href)
 71 | 
 72 | def _make_absolute_images(soup, base_href):
 73 | 	for img in soup.findAll('img', attrs={'src':True}):
 74 | 		img['src'] = absolute_url(img['src'], base_href)
 75 | 
 76 | def _fix_references(soup, base_href):
 77 | 	_make_absolute_links(soup, base_href)
 78 | 	_make_absolute_images(soup, base_href)
 79 | 
 80 | # a bunch of regexes to hack around lousy html
 81 | dodgy_regexes = (
 82 | 	Replacement('javascript',
 83 | 		regex=re.compile('<script.*?</script[^>]*>', re.DOTALL | re.IGNORECASE),
 84 | 		replacement=''),
 85 | 
 86 | 	Replacement('double double-quoted attributes',
 87 | 		regex=re.compile('(="[^"]+")"+'),
 88 | 		replacement='\\1'),
 89 | 
 90 | 	Replacement('unclosed tags',
 91 | 		regex = re.compile('(<[a-zA-Z]+[^>]*)(<[a-zA-Z]+[^<>]*>)'),
 92 | 		replacement='\\1>\\2'),
 93 | 
 94 | 	Replacement('unclosed (numerical) attribute values',
 95 | 		regex = re.compile('(<[^>]*[a-zA-Z]+\s*=\s*"[0-9]+)( [a-zA-Z]+="\w+"|/?>)'),
 96 | 		replacement='\\1"\\2'),
 97 | 	)
 98 | 	
 99 | 
100 | # helpers for parsing
101 | def normalize_spaces(s):
102 | 	"""replace any sequence of whitespace
103 | 	characters with a single space"""
104 | 	return ' '.join(s.split())
105 | 
106 | def _remove_crufty_html(content):
107 | 	for replacement in dodgy_regexes:
108 | 		content = replacement.apply(content)
109 | 	return content
110 | 
111 | def _parse_methods():
112 | 	def unicode_cleansed(content, base_href):
113 | 		content = UnicodeDammit(content, isHTML=True).markup
114 | 		cleaned = _remove_crufty_html(content)
115 | 		debug("Cleaned content: %s" % (cleaned,))
116 | 		return beautiful_soup(cleaned, base_href)
117 | 
118 | 	def ascii_cleansed(content, base_href):
119 | 		content = ascii(content)
120 | 		cleaned = _remove_crufty_html(content)
121 | 		debug("Cleaned content: %s" % (cleaned,))
122 | 		return beautiful_soup(cleaned, base_href)
123 | 
124 | 	return (
125 | 		beautiful_soup,
126 | 		unicode_cleansed,
127 | 		ascii_cleansed)
128 | 
129 | # strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
130 | bad_attrs = ['width','height','style','[-a-z]*color','background[-a-z]*']
131 | single_quoted = "'[^']+'"
132 | double_quoted = '"[^"]+"'
133 | non_space = '[^ "\'>]+'
134 | htmlstrip = re.compile("<" # open
135 | 	"([^>]+) " # prefix
136 | 	"(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
137 | 	'= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
138 | 	"([^>]*)"  # postfix
139 | 	">"        # end
140 | , re.I)
141 | def clean_attributes(html):
142 | 	while htmlstrip.search(html):
143 | 		html = htmlstrip.sub('<\\1\\2>', html)
144 | 	return html
145 | 
146 | 


--------------------------------------------------------------------------------
/readability/readability.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | from BeautifulSoup import NavigableString
  3 | from page_parser import parse, get_title, get_body, Unparseable
  4 | import logging
  5 | import re
  6 | 
  7 | REGEXES = { 'unlikelyCandidatesRe': re.compile('combx|comment|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor',re.I),
  8 | 	'okMaybeItsACandidateRe': re.compile('and|article|body|column|main',re.I),
  9 | 	'positiveRe': re.compile('article|body|content|entry|hentry|page|pagination|post|text',re.I),
 10 | 	'negativeRe': re.compile('combx|comment|contact|foot|footer|footnote|link|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget',re.I),
 11 | 	'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)',re.I),
 12 | 	'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
 13 | 	'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
 14 | 	'trimRe': re.compile('^\s+|\s+$/'),
 15 | 	'normalizeRe': re.compile('\s{2,}/'),
 16 | 	'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
 17 | 	'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
 18 | }
 19 | 
 20 | from collections import defaultdict
 21 | def describe(node):
 22 | 	if not hasattr(node, 'name'):
 23 | 		return "[text]"
 24 | 	return "%s#%s.%s" % (
 25 | 		node.name, node.get('id', ''), node.get('class',''))
 26 | 
 27 | def _text(node):
 28 | 	return " ".join(node.findAll(text=True))
 29 | 
 30 | class Document:
 31 | 	TEXT_LENGTH_THRESHOLD = 25
 32 | 	RETRY_LENGTH = 250
 33 | 
 34 | 	def __init__(self, input, notify=None, **options):
 35 | 		self.input = input
 36 | 		self.options = defaultdict(lambda: None)
 37 | 		for k, v in options.items():
 38 | 			self.options[k] = v
 39 | 		self.notify = notify or logging.info
 40 | 		self.html = None
 41 | 
 42 | 	def _html(self, force=False):
 43 | 		if force or self.html is None:
 44 | 			self.html = parse(self.input, self.options['url'], notify=self.notify)
 45 | 		return self.html
 46 | 	
 47 | 	def content(self):
 48 | 		return get_body(self._html())
 49 | 	
 50 | 	def title(self):
 51 | 		return get_title(self._html())
 52 | 
 53 | 	def summary(self):
 54 | 		try:
 55 | 			ruthless = True
 56 | 			while True:
 57 | 				self._html(True)
 58 | 				[i.extract() for i in self.tags(self.html, 'script', 'style')]
 59 | 
 60 | 				if ruthless: self.remove_unlikely_candidates()
 61 | 				self.transform_misused_divs_into_paragraphs()
 62 | 				candidates = self.score_paragraphs(self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD))
 63 | 				best_candidate = self.select_best_candidate(candidates)
 64 | 				if best_candidate:
 65 | 					article = self.get_article(candidates, best_candidate)
 66 | 				else:
 67 | 					if ruthless:
 68 | 						ruthless = False
 69 | 						self.debug("ended up stripping too much - going for a safer parse")
 70 | 						# try again
 71 | 						continue
 72 | 					else:
 73 | 						article = self.html.find('body') or self.html
 74 | 
 75 | 				cleaned_article = self.sanitize(article, candidates)
 76 | 				of_acceptable_length = len(cleaned_article or '') >= (self.options['retry_length'] or self.RETRY_LENGTH)
 77 | 				if ruthless and not of_acceptable_length:
 78 | 					ruthless = False
 79 | 					continue # try again
 80 | 				else:
 81 | 					return cleaned_article
 82 | 		except StandardError, e:
 83 | 			logging.exception('error getting summary:')
 84 | 			raise Unparseable(str(e))
 85 | 
 86 | 	def get_article(self, candidates, best_candidate):
 87 | 		# Now that we have the top candidate, look through its siblings for content that might also be related.
 88 | 		# Things like preambles, content split by ads that we removed, etc.
 89 | 
 90 | 		sibling_score_threshold = max([10, best_candidate['content_score'] * 0.2])
 91 | 		output = parse("<div/>")
 92 | 		for sibling in best_candidate['elem'].parent.contents:
 93 | 			if isinstance(sibling, NavigableString): continue
 94 | 			append = False
 95 | 			if sibling is best_candidate['elem']:
 96 | 				append = True
 97 | 			sibling_key = HashableElement(sibling)
 98 | 			if sibling_key in candidates and candidates[sibling_key]['content_score'] >= sibling_score_threshold:
 99 | 				append = True
100 | 
101 | 			if sibling.name == "p":
102 | 				link_density = self.get_link_density(sibling)
103 | 				node_content = sibling.string or ""
104 | 				node_length = len(node_content)
105 | 
106 | 				if node_length > 80 and link_density < 0.25:
107 | 					append = True
108 | 				elif node_length < 80 and link_density == 0 and re.search('\.( |$)', node_content):
109 | 					append = True
110 | 
111 | 			if append:
112 | 				output.append(sibling)
113 | 
114 | 		if not output: output.append(best_candidate)
115 | 		return output
116 | 
117 | 	def select_best_candidate(self, candidates):
118 | 		sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True)
119 | 		self.debug("Top 5 candidates:")
120 | 		for candidate in sorted_candidates[:5]:
121 | 			elem = candidate['elem']
122 | 			self.debug("Candidate %s with score %s" % (describe(elem), candidate['content_score']))
123 | 
124 | 		if len(sorted_candidates) == 0:
125 | 			return None
126 | 		best_candidate = sorted_candidates[0]
127 | 		self.debug("Best candidate %s with score %s" % (describe(best_candidate['elem']), best_candidate['content_score']))
128 | 		return best_candidate
129 | 
130 | 	def get_link_density(self, elem):
131 | 		link_length = len("".join([i.text or "" for i in elem.findAll("a")]))
132 | 		text_length = len(_text(elem))
133 | 		return float(link_length) / max(text_length, 1)
134 | 
135 | 	def score_paragraphs(self, min_text_length):
136 | 		candidates = {}
137 | 		elems = self.tags(self.html, "p","td")
138 | 
139 | 		for elem in elems:
140 | 			parent_node = elem.parent
141 | 			grand_parent_node = parent_node.parent
142 | 			parent_key = HashableElement(parent_node)
143 | 			grand_parent_key = HashableElement(grand_parent_node)
144 | 
145 | 			inner_text = _text(elem)
146 | 
147 | 			# If this paragraph is less than 25 characters, don't even count it.
148 | 			if (not inner_text) or len(inner_text) < min_text_length:
149 | 				continue
150 | 
151 | 			if parent_key not in candidates:
152 | 				candidates[parent_key] = self.score_node(parent_node)
153 | 			if grand_parent_node and grand_parent_key not in candidates:
154 | 				candidates[grand_parent_key] = self.score_node(grand_parent_node)
155 | 
156 | 			content_score = 1
157 | 			content_score += len(inner_text.split(','))
158 | 			content_score += min([(len(inner_text) / 100), 3])
159 | 
160 | 			candidates[parent_key]['content_score'] += content_score
161 | 			if grand_parent_node:
162 | 				candidates[grand_parent_key]['content_score'] += content_score / 2.0
163 | 
164 | 		# Scale the final candidates score based on link density. Good content should have a
165 | 		# relatively small link density (5% or less) and be mostly unaffected by this operation.
166 | 		for elem, candidate in candidates.items():
167 | 			candidate['content_score'] *= (1 - self.get_link_density(elem))
168 | 			self.debug("candidate %s scored %s" % (describe(elem), candidate['content_score']))
169 | 
170 | 		return candidates
171 | 
172 | 	def class_weight(self, e):
173 | 		weight = 0
174 | 		if e.get('class', None):
175 | 			if REGEXES['negativeRe'].search(e['class']):
176 | 				weight -= 25
177 | 
178 | 			if REGEXES['positiveRe'].search(e['class']):
179 | 				weight += 25
180 | 
181 | 		if e.get('id', None):
182 | 			if REGEXES['negativeRe'].search(e['id']):
183 | 				weight -= 25
184 | 
185 | 			if REGEXES['positiveRe'].search(e['id']):
186 | 				weight += 25
187 | 
188 | 		return weight
189 | 
190 | 	def score_node(self, elem):
191 | 		content_score = self.class_weight(elem)
192 | 		name = elem.name.lower()
193 | 		if name == "div":
194 | 			content_score += 5
195 | 		elif name == "blockquote":
196 | 			content_score += 3
197 | 		elif name == "form":
198 | 			content_score -= 3
199 | 		elif name == "th":
200 | 			content_score -= 5
201 | 		return { 'content_score': content_score, 'elem': elem }
202 | 
203 | 	def debug(self, *a):
204 | 		if self.options['debug']:
205 | 			logging.debug(*a)
206 | 
207 | 	def remove_unlikely_candidates(self):
208 | 		for elem in self.html.findAll():
209 | 			s = "%s%s" % (elem.get('class', ''), elem.get('id', ''))
210 | 			if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.name != 'body':
211 | 				self.debug("Removing unlikely candidate - %s" % (s,))
212 | 				elem.extract()
213 | 
214 | 	def transform_misused_divs_into_paragraphs(self):
215 | 		for elem in self.html.findAll():
216 | 			if elem.name.lower() == "div":
217 | 				# transform <div>s that do not contain other block elements into <p>s
218 | 				if REGEXES['divToPElementsRe'].search(''.join(map(unicode, elem.contents))):
219 | 					self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
220 | 					elem.name = "p"
221 | 
222 | 	def tags(self, node, *tag_names):
223 | 		for tag_name in tag_names:
224 | 			for e in node.findAll(tag_name):
225 | 				yield e
226 | 
227 | 	def sanitize(self, node, candidates):
228 | 		for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
229 | 			if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.extract()
230 | 
231 | 		for elem in self.tags(node, "form", "iframe"):
232 | 			elem.extract()
233 | 
234 | 		# Conditionally clean <table>s, <ul>s, and <div>s
235 | 		for el in self.tags(node, "table", "ul", "div"):
236 | 			weight = self.class_weight(el)
237 | 			el_key = HashableElement(el)
238 | 			if el_key in candidates:
239 | 				content_score = candidates[el_key]['content_score']
240 | 			else:
241 | 				content_score = 0
242 | 			name = el.name
243 | 
244 | 			if weight + content_score < 0:
245 | 				el.extract()
246 | 				self.debug("Conditionally cleaned %s with weight %s and content score %s because score + content score was less than zero." %
247 | 					(describe(el), weight, content_score))
248 | 			elif len(_text(el).split(",")) < 10:
249 | 				counts = {}
250 | 				for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
251 | 					counts[kind] = len(el.findAll(kind))
252 | 				counts["li"] -= 100
253 | 
254 | 				content_length = len(_text(el)) # Count the text length excluding any surrounding whitespace
255 | 				link_density = self.get_link_density(el)
256 | 				to_remove = False
257 | 				reason = ""
258 | 
259 | 				if counts["img"] > counts["p"]:
260 | 					reason = "too many images"
261 | 					to_remove = True
262 | 				elif counts["li"] > counts["p"] and name != "ul" and name != "ol":
263 | 					reason = "more <li>s than <p>s"
264 | 					to_remove = True
265 | 				elif counts["input"] > (counts["p"] / 3):
266 | 					reason = "less than 3x <p>s than <input>s"
267 | 					to_remove = True
268 | 				elif content_length < (self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD)) and (counts["img"] == 0 or counts["img"] > 2):
269 | 					reason = "too short a content length without a single image"
270 | 					to_remove = True
271 | 				elif weight < 25 and link_density > 0.2:
272 | 					reason = "too many links for its weight (#{weight})"
273 | 					to_remove = True
274 | 				elif weight >= 25 and link_density > 0.5:
275 | 					reason = "too many links for its weight (#{weight})"
276 | 					to_remove = True
277 | 				elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
278 | 					reason = "<embed>s with too short a content length, or too many <embed>s"
279 | 					to_remove = True
280 | 
281 | 				if to_remove:
282 | 					self.debug("Conditionally cleaned %s#%s.%s with weight %s and content score %s because it has %s." %
283 | 						(el.name, el.get('id',''), el.get('class', ''), weight, content_score, reason))
284 | 					el.extract()
285 | 
286 | 		for el in ([node] + node.findAll()):
287 | 			if not (self.options['attributes']):
288 | 				el.attrMap = {}
289 | 
290 | 		return unicode(node)
291 | 
292 | class HashableElement():
293 | 	def __init__(self, node):
294 | 		self.node = node
295 | 		self._path = None
296 | 
297 | 	def _get_path(self):
298 | 		if self._path is None:
299 | 			reverse_path = []
300 | 			node = self.node
301 | 			while node:
302 | 				node_id = (node.name, tuple(node.attrs), node.string)
303 | 				reverse_path.append(node_id)
304 | 				node = node.parent
305 | 			self._path = tuple(reverse_path)
306 | 		return self._path
307 | 	path = property(_get_path)
308 | 
309 | 	def __hash__(self):
310 | 		return hash(self.path)
311 | 
312 | 	def __eq__(self, other):
313 | 		return self.path == other.path
314 | 
315 | 	def __getattr__(self, name):
316 | 		return getattr(self.node, name)
317 | 
318 | def main():
319 | 	import sys
320 | 	from optparse import OptionParser
321 | 	parser = OptionParser(usage="%prog: [options] [file]")
322 | 	parser.add_option('-v', '--verbose', action='store_true')
323 | 	parser.add_option('-u', '--url', help="use URL instead of a local file")
324 | 	(options, args) = parser.parse_args()
325 | 	
326 | 	if not (len(args) == 1 or options.url):
327 | 		parser.print_help()
328 | 		sys.exit(1)
329 | 	logging.basicConfig(level=logging.DEBUG)
330 | 
331 | 	file = None
332 | 	if options.url:
333 | 		import urllib
334 | 		file = urllib.urlopen(options.url)
335 | 	else:
336 | 		file = open(args[0])
337 | 	try:
338 | 		print Document(file.read(), debug=options.verbose).summary().encode('ascii','ignore')
339 | 	finally:
340 | 		file.close()
341 | 
342 | if __name__ == '__main__':
343 | 	main()
344 | 


--------------------------------------------------------------------------------
/readability/url_helpers.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from urlparse import urlparse
 3 | 
 4 | def host_for_url(url):
 5 | 	"""
 6 | 	>>> host_for_url('http://base/whatever/fdsh')
 7 | 	'base'
 8 | 	>>> host_for_url('invalid')
 9 | 	"""
10 | 	host = urlparse(url)[1]
11 | 	if not host:
12 | 		logging.error("could not extract host from URL: %r" % (url,))
13 | 		return None
14 | 	return host
15 | 
16 | def absolute_url(url, base_href):
17 | 	"""
18 | 	>>> absolute_url('foo', 'http://base/whatever/ooo/fdsh')
19 | 	'http://base/whatever/ooo/foo'
20 | 
21 | 	>>> absolute_url('foo/bar/', 'http://base')
22 | 	'http://base/foo/bar/'
23 | 
24 | 	>>> absolute_url('/foo/bar', 'http://base/whatever/fdskf')
25 | 	'http://base/foo/bar'
26 | 
27 | 	>>> absolute_url('\\n/foo/bar', 'http://base/whatever/fdskf')
28 | 	'http://base/foo/bar'
29 | 
30 | 	>>> absolute_url('http://localhost/foo', 'http://base/whatever/fdskf')
31 | 	'http://localhost/foo'
32 | 	"""
33 | 	url = url.strip()
34 | 	proto = urlparse(url)[0]
35 | 	if proto:
36 | 		return url
37 | 
38 | 	base_url_parts = urlparse(base_href)
39 | 	base_server = '://'.join(base_url_parts[:2])
40 | 	if url.startswith('/'):
41 | 		return base_server + url
42 | 	else:
43 | 		path = base_url_parts[2]
44 | 		if '/' in path:
45 | 			path = path.rsplit('/', 1)[0] + '/'
46 | 		else:
47 | 			path = '/'
48 | 		return base_server + path + url
49 | 
50 | if __name__ == '__main__':
51 | 	import doctest
52 | 	doctest.testmod()


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from distutils.core import setup
 3 | 
 4 | 
 5 | setup(
 6 |     name="python-readability",
 7 |     author="Tim Cuthbertson",
 8 |     author_email="tim3d.junk+github@gmail.com",
 9 |     description="python port of arc90's readability bookmarklet",
10 |     long_description=open("README").read(),
11 |     license="Apache License 2.0",
12 |     url="http://github.com/gfxmonk/python-readability",
13 |     packages=[
14 |         "readability",
15 |     ],
16 |     classifiers=[
17 |         "Environment :: Web Environment",
18 |         "Intended Audience :: Developers",
19 |         "Operating System :: OS Independent",
20 |         "Programming Language :: Python",
21 |     ],
22 | )
23 | 


--------------------------------------------------------------------------------