├── BeautifulSoup.py
├── README.md
├── __init__.py
└── google.py


/BeautifulSoup.py:
--------------------------------------------------------------------------------
   1 | """Beautiful Soup
   2 | Elixir and Tonic
   3 | "The Screen-Scraper's Friend"
   4 | http://www.crummy.com/software/BeautifulSoup/
   5 | 
   6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a
   7 | tree representation. It provides methods and Pythonic idioms that make
   8 | it easy to navigate, search, and modify the tree.
   9 | 
  10 | A well-formed XML/HTML document yields a well-formed data
  11 | structure. An ill-formed XML/HTML document yields a correspondingly
  12 | ill-formed data structure. If your document is only locally
  13 | well-formed, you can use this library to find and process the
  14 | well-formed part of it.
  15 | 
  16 | Beautiful Soup works with Python 2.2 and up. It has no external
  17 | dependencies, but you'll have more success at converting data to UTF-8
  18 | if you also install these three packages:
  19 | 
  20 | * chardet, for auto-detecting character encodings
  21 |   http://chardet.feedparser.org/
  22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported
  23 |   by stock Python.
  24 |   http://cjkpython.i18n.org/
  25 | 
  26 | Beautiful Soup defines classes for two main parsing strategies:
  27 | 
  28 |  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  29 |    language that kind of looks like XML.
  30 | 
  31 |  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  32 |    or invalid. This class has web browser-like heuristics for
  33 |    obtaining a sensible parse tree in the face of common HTML errors.
  34 | 
  35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
  36 | the encoding of an HTML or XML document, and converting it to
  37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
  38 | 
  39 | For more than you ever wanted to know about Beautiful Soup, see the
  40 | documentation:
  41 | http://www.crummy.com/software/BeautifulSoup/documentation.html
  42 | 
  43 | Here, have some legalese:
  44 | 
  45 | Copyright (c) 2004-2010, Leonard Richardson
  46 | 
  47 | All rights reserved.
  48 | 
  49 | Redistribution and use in source and binary forms, with or without
  50 | modification, are permitted provided that the following conditions are
  51 | met:
  52 | 
  53 |   * Redistributions of source code must retain the above copyright
  54 |     notice, this list of conditions and the following disclaimer.
  55 | 
  56 |   * Redistributions in binary form must reproduce the above
  57 |     copyright notice, this list of conditions and the following
  58 |     disclaimer in the documentation and/or other materials provided
  59 |     with the distribution.
  60 | 
  61 |   * Neither the name of the the Beautiful Soup Consortium and All
  62 |     Night Kosher Bakery nor the names of its contributors may be
  63 |     used to endorse or promote products derived from this software
  64 |     without specific prior written permission.
  65 | 
  66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
  77 | 
  78 | """
  79 | from __future__ import generators
  80 | 
  81 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
  82 | __version__ = "3.2.0"
  83 | __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"
  84 | __license__ = "New-style BSD"
  85 | 
  86 | from sgmllib import SGMLParser, SGMLParseError
  87 | import codecs
  88 | import markupbase
  89 | import types
  90 | import re
  91 | import sgmllib
  92 | try:
  93 |   from htmlentitydefs import name2codepoint
  94 | except ImportError:
  95 |   name2codepoint = {}
  96 | try:
  97 |     set
  98 | except NameError:
  99 |     from sets import Set as set
 100 | 
 101 | #These hacks make Beautiful Soup able to parse XML with namespaces
 102 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
 103 | markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match
 104 | 
 105 | DEFAULT_OUTPUT_ENCODING = "utf-8"
 106 | 
 107 | def _match_css_class(str):
 108 |     """Build a RE to match the given CSS class."""
 109 |     return re.compile(r"(^|.*\s)%s($|\s)" % str)
 110 | 
 111 | # First, the classes that represent markup elements.
 112 | 
 113 | class PageElement(object):
 114 |     """Contains the navigational information for some part of the page
 115 |     (either a tag or a piece of text)"""
 116 | 
 117 |     def setup(self, parent=None, previous=None):
 118 |         """Sets up the initial relations between this element and
 119 |         other elements."""
 120 |         self.parent = parent
 121 |         self.previous = previous
 122 |         self.next = None
 123 |         self.previousSibling = None
 124 |         self.nextSibling = None
 125 |         if self.parent and self.parent.contents:
 126 |             self.previousSibling = self.parent.contents[-1]
 127 |             self.previousSibling.nextSibling = self
 128 | 
 129 |     def replaceWith(self, replaceWith):
 130 |         oldParent = self.parent
 131 |         myIndex = self.parent.index(self)
 132 |         if hasattr(replaceWith, "parent")\
 133 |                   and replaceWith.parent is self.parent:
 134 |             # We're replacing this element with one of its siblings.
 135 |             index = replaceWith.parent.index(replaceWith)
 136 |             if index and index < myIndex:
 137 |                 # Furthermore, it comes before this element. That
 138 |                 # means that when we extract it, the index of this
 139 |                 # element will change.
 140 |                 myIndex = myIndex - 1
 141 |         self.extract()
 142 |         oldParent.insert(myIndex, replaceWith)
 143 | 
 144 |     def replaceWithChildren(self):
 145 |         myParent = self.parent
 146 |         myIndex = self.parent.index(self)
 147 |         self.extract()
 148 |         reversedChildren = list(self.contents)
 149 |         reversedChildren.reverse()
 150 |         for child in reversedChildren:
 151 |             myParent.insert(myIndex, child)
 152 | 
 153 |     def extract(self):
 154 |         """Destructively rips this element out of the tree."""
 155 |         if self.parent:
 156 |             try:
 157 |                 del self.parent.contents[self.parent.index(self)]
 158 |             except ValueError:
 159 |                 pass
 160 | 
 161 |         #Find the two elements that would be next to each other if
 162 |         #this element (and any children) hadn't been parsed. Connect
 163 |         #the two.
 164 |         lastChild = self._lastRecursiveChild()
 165 |         nextElement = lastChild.next
 166 | 
 167 |         if self.previous:
 168 |             self.previous.next = nextElement
 169 |         if nextElement:
 170 |             nextElement.previous = self.previous
 171 |         self.previous = None
 172 |         lastChild.next = None
 173 | 
 174 |         self.parent = None
 175 |         if self.previousSibling:
 176 |             self.previousSibling.nextSibling = self.nextSibling
 177 |         if self.nextSibling:
 178 |             self.nextSibling.previousSibling = self.previousSibling
 179 |         self.previousSibling = self.nextSibling = None
 180 |         return self
 181 | 
 182 |     def _lastRecursiveChild(self):
 183 |         "Finds the last element beneath this object to be parsed."
 184 |         lastChild = self
 185 |         while hasattr(lastChild, 'contents') and lastChild.contents:
 186 |             lastChild = lastChild.contents[-1]
 187 |         return lastChild
 188 | 
 189 |     def insert(self, position, newChild):
 190 |         if isinstance(newChild, basestring) \
 191 |             and not isinstance(newChild, NavigableString):
 192 |             newChild = NavigableString(newChild)
 193 | 
 194 |         position =  min(position, len(self.contents))
 195 |         if hasattr(newChild, 'parent') and newChild.parent is not None:
 196 |             # We're 'inserting' an element that's already one
 197 |             # of this object's children.
 198 |             if newChild.parent is self:
 199 |                 index = self.index(newChild)
 200 |                 if index > position:
 201 |                     # Furthermore we're moving it further down the
 202 |                     # list of this object's children. That means that
 203 |                     # when we extract this element, our target index
 204 |                     # will jump down one.
 205 |                     position = position - 1
 206 |             newChild.extract()
 207 | 
 208 |         newChild.parent = self
 209 |         previousChild = None
 210 |         if position == 0:
 211 |             newChild.previousSibling = None
 212 |             newChild.previous = self
 213 |         else:
 214 |             previousChild = self.contents[position-1]
 215 |             newChild.previousSibling = previousChild
 216 |             newChild.previousSibling.nextSibling = newChild
 217 |             newChild.previous = previousChild._lastRecursiveChild()
 218 |         if newChild.previous:
 219 |             newChild.previous.next = newChild
 220 | 
 221 |         newChildsLastElement = newChild._lastRecursiveChild()
 222 | 
 223 |         if position >= len(self.contents):
 224 |             newChild.nextSibling = None
 225 | 
 226 |             parent = self
 227 |             parentsNextSibling = None
 228 |             while not parentsNextSibling:
 229 |                 parentsNextSibling = parent.nextSibling
 230 |                 parent = parent.parent
 231 |                 if not parent: # This is the last element in the document.
 232 |                     break
 233 |             if parentsNextSibling:
 234 |                 newChildsLastElement.next = parentsNextSibling
 235 |             else:
 236 |                 newChildsLastElement.next = None
 237 |         else:
 238 |             nextChild = self.contents[position]
 239 |             newChild.nextSibling = nextChild
 240 |             if newChild.nextSibling:
 241 |                 newChild.nextSibling.previousSibling = newChild
 242 |             newChildsLastElement.next = nextChild
 243 | 
 244 |         if newChildsLastElement.next:
 245 |             newChildsLastElement.next.previous = newChildsLastElement
 246 |         self.contents.insert(position, newChild)
 247 | 
 248 |     def append(self, tag):
 249 |         """Appends the given tag to the contents of this tag."""
 250 |         self.insert(len(self.contents), tag)
 251 | 
 252 |     def findNext(self, name=None, attrs={}, text=None, **kwargs):
 253 |         """Returns the first item that matches the given criteria and
 254 |         appears after this Tag in the document."""
 255 |         return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
 256 | 
 257 |     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
 258 |                     **kwargs):
 259 |         """Returns all items that match the given criteria and appear
 260 |         after this Tag in the document."""
 261 |         return self._findAll(name, attrs, text, limit, self.nextGenerator,
 262 |                              **kwargs)
 263 | 
 264 |     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
 265 |         """Returns the closest sibling to this Tag that matches the
 266 |         given criteria and appears after this Tag in the document."""
 267 |         return self._findOne(self.findNextSiblings, name, attrs, text,
 268 |                              **kwargs)
 269 | 
 270 |     def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
 271 |                          **kwargs):
 272 |         """Returns the siblings of this Tag that match the given
 273 |         criteria and appear after this Tag in the document."""
 274 |         return self._findAll(name, attrs, text, limit,
 275 |                              self.nextSiblingGenerator, **kwargs)
 276 |     fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
 277 | 
 278 |     def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
 279 |         """Returns the first item that matches the given criteria and
 280 |         appears before this Tag in the document."""
 281 |         return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
 282 | 
 283 |     def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
 284 |                         **kwargs):
 285 |         """Returns all items that match the given criteria and appear
 286 |         before this Tag in the document."""
 287 |         return self._findAll(name, attrs, text, limit, self.previousGenerator,
 288 |                            **kwargs)
 289 |     fetchPrevious = findAllPrevious # Compatibility with pre-3.x
 290 | 
 291 |     def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
 292 |         """Returns the closest sibling to this Tag that matches the
 293 |         given criteria and appears before this Tag in the document."""
 294 |         return self._findOne(self.findPreviousSiblings, name, attrs, text,
 295 |                              **kwargs)
 296 | 
 297 |     def findPreviousSiblings(self, name=None, attrs={}, text=None,
 298 |                              limit=None, **kwargs):
 299 |         """Returns the siblings of this Tag that match the given
 300 |         criteria and appear before this Tag in the document."""
 301 |         return self._findAll(name, attrs, text, limit,
 302 |                              self.previousSiblingGenerator, **kwargs)
 303 |     fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
 304 | 
 305 |     def findParent(self, name=None, attrs={}, **kwargs):
 306 |         """Returns the closest parent of this Tag that matches the given
 307 |         criteria."""
 308 |         # NOTE: We can't use _findOne because findParents takes a different
 309 |         # set of arguments.
 310 |         r = None
 311 |         l = self.findParents(name, attrs, 1)
 312 |         if l:
 313 |             r = l[0]
 314 |         return r
 315 | 
 316 |     def findParents(self, name=None, attrs={}, limit=None, **kwargs):
 317 |         """Returns the parents of this Tag that match the given
 318 |         criteria."""
 319 | 
 320 |         return self._findAll(name, attrs, None, limit, self.parentGenerator,
 321 |                              **kwargs)
 322 |     fetchParents = findParents # Compatibility with pre-3.x
 323 | 
 324 |     #These methods do the real heavy lifting.
 325 | 
 326 |     def _findOne(self, method, name, attrs, text, **kwargs):
 327 |         r = None
 328 |         l = method(name, attrs, text, 1, **kwargs)
 329 |         if l:
 330 |             r = l[0]
 331 |         return r
 332 | 
 333 |     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
 334 |         "Iterates over a generator looking for things that match."
 335 | 
 336 |         if isinstance(name, SoupStrainer):
 337 |             strainer = name
 338 |         # (Possibly) special case some findAll*(...) searches
 339 |         elif text is None and not limit and not attrs and not kwargs:
 340 |             # findAll*(True)
 341 |             if name is True:
 342 |                 return [element for element in generator()
 343 |                         if isinstance(element, Tag)]
 344 |             # findAll*('tag-name')
 345 |             elif isinstance(name, basestring):
 346 |                 return [element for element in generator()
 347 |                         if isinstance(element, Tag) and
 348 |                         element.name == name]
 349 |             else:
 350 |                 strainer = SoupStrainer(name, attrs, text, **kwargs)
 351 |         # Build a SoupStrainer
 352 |         else:
 353 |             strainer = SoupStrainer(name, attrs, text, **kwargs)
 354 |         results = ResultSet(strainer)
 355 |         g = generator()
 356 |         while True:
 357 |             try:
 358 |                 i = g.next()
 359 |             except StopIteration:
 360 |                 break
 361 |             if i:
 362 |                 found = strainer.search(i)
 363 |                 if found:
 364 |                     results.append(found)
 365 |                     if limit and len(results) >= limit:
 366 |                         break
 367 |         return results
 368 | 
 369 |     #These Generators can be used to navigate starting from both
 370 |     #NavigableStrings and Tags.
 371 |     def nextGenerator(self):
 372 |         i = self
 373 |         while i is not None:
 374 |             i = i.next
 375 |             yield i
 376 | 
 377 |     def nextSiblingGenerator(self):
 378 |         i = self
 379 |         while i is not None:
 380 |             i = i.nextSibling
 381 |             yield i
 382 | 
 383 |     def previousGenerator(self):
 384 |         i = self
 385 |         while i is not None:
 386 |             i = i.previous
 387 |             yield i
 388 | 
 389 |     def previousSiblingGenerator(self):
 390 |         i = self
 391 |         while i is not None:
 392 |             i = i.previousSibling
 393 |             yield i
 394 | 
 395 |     def parentGenerator(self):
 396 |         i = self
 397 |         while i is not None:
 398 |             i = i.parent
 399 |             yield i
 400 | 
 401 |     # Utility methods
 402 |     def substituteEncoding(self, str, encoding=None):
 403 |         encoding = encoding or "utf-8"
 404 |         return str.replace("%SOUP-ENCODING%", encoding)
 405 | 
 406 |     def toEncoding(self, s, encoding=None):
 407 |         """Encodes an object to a string in some encoding, or to Unicode.
 408 |         ."""
 409 |         if isinstance(s, unicode):
 410 |             if encoding:
 411 |                 s = s.encode(encoding)
 412 |         elif isinstance(s, str):
 413 |             if encoding:
 414 |                 s = s.encode(encoding)
 415 |             else:
 416 |                 s = unicode(s)
 417 |         else:
 418 |             if encoding:
 419 |                 s  = self.toEncoding(str(s), encoding)
 420 |             else:
 421 |                 s = unicode(s)
 422 |         return s
 423 | 
 424 | class NavigableString(unicode, PageElement):
 425 | 
 426 |     def __new__(cls, value):
 427 |         """Create a new NavigableString.
 428 | 
 429 |         When unpickling a NavigableString, this method is called with
 430 |         the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
 431 |         passed in to the superclass's __new__ or the superclass won't know
 432 |         how to handle non-ASCII characters.
 433 |         """
 434 |         if isinstance(value, unicode):
 435 |             return unicode.__new__(cls, value)
 436 |         return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
 437 | 
 438 |     def __getnewargs__(self):
 439 |         return (NavigableString.__str__(self),)
 440 | 
 441 |     def __getattr__(self, attr):
 442 |         """text.string gives you text. This is for backwards
 443 |         compatibility for Navigable*String, but for CData* it lets you
 444 |         get the string without the CData wrapper."""
 445 |         if attr == 'string':
 446 |             return self
 447 |         else:
 448 |             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 449 | 
 450 |     def __unicode__(self):
 451 |         return str(self).decode(DEFAULT_OUTPUT_ENCODING)
 452 | 
 453 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 454 |         if encoding:
 455 |             return self.encode(encoding)
 456 |         else:
 457 |             return self
 458 | 
 459 | class CData(NavigableString):
 460 | 
 461 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 462 |         return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
 463 | 
 464 | class ProcessingInstruction(NavigableString):
 465 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 466 |         output = self
 467 |         if "%SOUP-ENCODING%" in output:
 468 |             output = self.substituteEncoding(output, encoding)
 469 |         return "<?%s?>" % self.toEncoding(output, encoding)
 470 | 
 471 | class Comment(NavigableString):
 472 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 473 |         return "<!--%s-->" % NavigableString.__str__(self, encoding)
 474 | 
 475 | class Declaration(NavigableString):
 476 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 477 |         return "<!%s>" % NavigableString.__str__(self, encoding)
 478 | 
 479 | class Tag(PageElement):
 480 | 
 481 |     """Represents a found HTML tag with its attributes and contents."""
 482 | 
 483 |     def _invert(h):
 484 |         "Cheap function to invert a hash."
 485 |         i = {}
 486 |         for k,v in h.items():
 487 |             i[v] = k
 488 |         return i
 489 | 
 490 |     XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
 491 |                                       "quot" : '"',
 492 |                                       "amp" : "&",
 493 |                                       "lt" : "<",
 494 |                                       "gt" : ">" }
 495 | 
 496 |     XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
 497 | 
 498 |     def _convertEntities(self, match):
 499 |         """Used in a call to re.sub to replace HTML, XML, and numeric
 500 |         entities with the appropriate Unicode characters. If HTML
 501 |         entities are being converted, any unrecognized entities are
 502 |         escaped."""
 503 |         x = match.group(1)
 504 |         if self.convertHTMLEntities and x in name2codepoint:
 505 |             return unichr(name2codepoint[x])
 506 |         elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
 507 |             if self.convertXMLEntities:
 508 |                 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
 509 |             else:
 510 |                 return u'&%s;' % x
 511 |         elif len(x) > 0 and x[0] == '#':
 512 |             # Handle numeric entities
 513 |             if len(x) > 1 and x[1] == 'x':
 514 |                 return unichr(int(x[2:], 16))
 515 |             else:
 516 |                 return unichr(int(x[1:]))
 517 | 
 518 |         elif self.escapeUnrecognizedEntities:
 519 |             return u'&amp;%s;' % x
 520 |         else:
 521 |             return u'&%s;' % x
 522 | 
 523 |     def __init__(self, parser, name, attrs=None, parent=None,
 524 |                  previous=None):
 525 |         "Basic constructor."
 526 | 
 527 |         # We don't actually store the parser object: that lets extracted
 528 |         # chunks be garbage-collected
 529 |         self.parserClass = parser.__class__
 530 |         self.isSelfClosing = parser.isSelfClosingTag(name)
 531 |         self.name = name
 532 |         if attrs is None:
 533 |             attrs = []
 534 |         elif isinstance(attrs, dict):
 535 |             attrs = attrs.items()
 536 |         self.attrs = attrs
 537 |         self.contents = []
 538 |         self.setup(parent, previous)
 539 |         self.hidden = False
 540 |         self.containsSubstitutions = False
 541 |         self.convertHTMLEntities = parser.convertHTMLEntities
 542 |         self.convertXMLEntities = parser.convertXMLEntities
 543 |         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
 544 | 
 545 |         # Convert any HTML, XML, or numeric entities in the attribute values.
 546 |         convert = lambda(k, val): (k,
 547 |                                    re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
 548 |                                           self._convertEntities,
 549 |                                           val))
 550 |         self.attrs = map(convert, self.attrs)
 551 | 
 552 |     def getString(self):
 553 |         if (len(self.contents) == 1
 554 |             and isinstance(self.contents[0], NavigableString)):
 555 |             return self.contents[0]
 556 | 
 557 |     def setString(self, string):
 558 |         """Replace the contents of the tag with a string"""
 559 |         self.clear()
 560 |         self.append(string)
 561 | 
 562 |     string = property(getString, setString)
 563 | 
 564 |     def getText(self, separator=u""):
 565 |         if not len(self.contents):
 566 |             return u""
 567 |         stopNode = self._lastRecursiveChild().next
 568 |         strings = []
 569 |         current = self.contents[0]
 570 |         while current is not stopNode:
 571 |             if isinstance(current, NavigableString):
 572 |                 strings.append(current.strip())
 573 |             current = current.next
 574 |         return separator.join(strings)
 575 | 
 576 |     text = property(getText)
 577 | 
 578 |     def get(self, key, default=None):
 579 |         """Returns the value of the 'key' attribute for the tag, or
 580 |         the value given for 'default' if it doesn't have that
 581 |         attribute."""
 582 |         return self._getAttrMap().get(key, default)
 583 | 
 584 |     def clear(self):
 585 |         """Extract all children."""
 586 |         for child in self.contents[:]:
 587 |             child.extract()
 588 | 
 589 |     def index(self, element):
 590 |         for i, child in enumerate(self.contents):
 591 |             if child is element:
 592 |                 return i
 593 |         raise ValueError("Tag.index: element not in tag")
 594 | 
 595 |     def has_key(self, key):
 596 |         return self._getAttrMap().has_key(key)
 597 | 
    def __getitem__(self, key):
        """tag[key] returns the value of the 'key' attribute for the tag,
        and throws an exception if it's not there."""
        # The lookup is a plain subscript on the attribute map, so the
        # exception is whatever that map raises for a missing key.
        return self._getAttrMap()[key]
 602 | 
    def __iter__(self):
        "Iterating over a tag iterates over its contents."
        # Only the direct children are visited.
        return iter(self.contents)
 606 | 
    def __len__(self):
        "The length of a tag is the length of its list of contents."
        # Counts direct children only.
        return len(self.contents)
 610 | 
    def __contains__(self, x):
        "'x in tag' tests whether x is among the tag's direct contents."
        return x in self.contents
 613 | 
    def __nonzero__(self):
        "A tag is non-None even if it has no contents."
        # Without this, __len__ would make an empty tag evaluate as
        # false in a boolean context.
        return True
 617 | 
 618 |     def __setitem__(self, key, value):
 619 |         """Setting tag[key] sets the value of the 'key' attribute for the
 620 |         tag."""
 621 |         self._getAttrMap()
 622 |         self.attrMap[key] = value
 623 |         found = False
 624 |         for i in range(0, len(self.attrs)):
 625 |             if self.attrs[i][0] == key:
 626 |                 self.attrs[i] = (key, value)
 627 |                 found = True
 628 |         if not found:
 629 |             self.attrs.append((key, value))
 630 |         self._getAttrMap()[key] = value
 631 | 
 632 |     def __delitem__(self, key):
 633 |         "Deleting tag[key] deletes all 'key' attributes for the tag."
 634 |         for item in self.attrs:
 635 |             if item[0] == key:
 636 |                 self.attrs.remove(item)
 637 |                 #We don't break because bad HTML can define the same
 638 |                 #attribute multiple times.
 639 |             self._getAttrMap()
 640 |             if self.attrMap.has_key(key):
 641 |                 del self.attrMap[key]
 642 | 
 643 |     def __call__(self, *args, **kwargs):
 644 |         """Calling a tag like a function is the same as calling its
 645 |         findAll() method. Eg. tag('a') returns a list of all the A tags
 646 |         found within this tag."""
 647 |         return apply(self.findAll, args, kwargs)
 648 | 
 649 |     def __getattr__(self, tag):
 650 |         #print "Getattr %s.%s" % (self.__class__, tag)
 651 |         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
 652 |             return self.find(tag[:-3])
 653 |         elif tag.find('__') != 0:
 654 |             return self.find(tag)
 655 |         raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
 656 | 
 657 |     def __eq__(self, other):
 658 |         """Returns true iff this tag has the same name, the same attributes,
 659 |         and the same contents (recursively) as the given tag.
 660 | 
 661 |         NOTE: right now this will return false if two tags have the
 662 |         same attributes in a different order. Should this be fixed?"""
 663 |         if other is self:
 664 |             return True
 665 |         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
 666 |             return False
 667 |         for i in range(0, len(self.contents)):
 668 |             if self.contents[i] != other.contents[i]:
 669 |                 return False
 670 |         return True
 671 | 
    def __ne__(self, other):
        """Returns true iff this tag is not identical to the other tag,
        as defined in __eq__."""
        # Goes through the == operator rather than calling __eq__
        # directly.
        return not self == other
 676 | 
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        # Delegates to __str__ with the given encoding.
        return self.__str__(encoding)
 680 | 
    def __unicode__(self):
        """Renders this tag as Unicode."""
        # Passing None as the encoding asks __str__ for Unicode output.
        return self.__str__(None)
 683 | 
 684 |     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
 685 |                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
 686 |                                            + ")")
 687 | 
 688 |     def _sub_entity(self, x):
 689 |         """Used with a regular expression to substitute the
 690 |         appropriate XML entity for an XML special character."""
 691 |         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
 692 | 
 693 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
 694 |                 prettyPrint=False, indentLevel=0):
 695 |         """Returns a string or Unicode representation of this tag and
 696 |         its contents. To get Unicode, pass None for encoding.
 697 | 
 698 |         NOTE: since Python's HTML parser consumes whitespace, this
 699 |         method is not certain to reproduce the whitespace present in
 700 |         the original string."""
 701 | 
 702 |         encodedName = self.toEncoding(self.name, encoding)
 703 | 
 704 |         attrs = []
 705 |         if self.attrs:
 706 |             for key, val in self.attrs:
 707 |                 fmt = '%s="%s"'
 708 |                 if isinstance(val, basestring):
 709 |                     if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
 710 |                         val = self.substituteEncoding(val, encoding)
 711 | 
 712 |                     # The attribute value either:
 713 |                     #
 714 |                     # * Contains no embedded double quotes or single quotes.
 715 |                     #   No problem: we enclose it in double quotes.
 716 |                     # * Contains embedded single quotes. No problem:
 717 |                     #   double quotes work here too.
 718 |                     # * Contains embedded double quotes. No problem:
 719 |                     #   we enclose it in single quotes.
 720 |                     # * Embeds both single _and_ double quotes. This
 721 |                     #   can't happen naturally, but it can happen if
 722 |                     #   you modify an attribute value after parsing
 723 |                     #   the document. Now we have a bit of a
 724 |                     #   problem. We solve it by enclosing the
 725 |                     #   attribute in single quotes, and escaping any
 726 |                     #   embedded single quotes to XML entities.
 727 |                     if '"' in val:
 728 |                         fmt = "%s='%s'"
 729 |                         if "'" in val:
 730 |                             # TODO: replace with apos when
 731 |                             # appropriate.
 732 |                             val = val.replace("'", "&squot;")
 733 | 
 734 |                     # Now we're okay w/r/t quotes. But the attribute
 735 |                     # value might also contain angle brackets, or
 736 |                     # ampersands that aren't part of entities. We need
 737 |                     # to escape those to XML entities too.
 738 |                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
 739 | 
 740 |                 attrs.append(fmt % (self.toEncoding(key, encoding),
 741 |                                     self.toEncoding(val, encoding)))
 742 |         close = ''
 743 |         closeTag = ''
 744 |         if self.isSelfClosing:
 745 |             close = ' /'
 746 |         else:
 747 |             closeTag = '</%s>' % encodedName
 748 | 
 749 |         indentTag, indentContents = 0, 0
 750 |         if prettyPrint:
 751 |             indentTag = indentLevel
 752 |             space = (' ' * (indentTag-1))
 753 |             indentContents = indentTag + 1
 754 |         contents = self.renderContents(encoding, prettyPrint, indentContents)
 755 |         if self.hidden:
 756 |             s = contents
 757 |         else:
 758 |             s = []
 759 |             attributeString = ''
 760 |             if attrs:
 761 |                 attributeString = ' ' + ' '.join(attrs)
 762 |             if prettyPrint:
 763 |                 s.append(space)
 764 |             s.append('<%s%s%s>' % (encodedName, attributeString, close))
 765 |             if prettyPrint:
 766 |                 s.append("\n")
 767 |             s.append(contents)
 768 |             if prettyPrint and contents and contents[-1] != "\n":
 769 |                 s.append("\n")
 770 |             if prettyPrint and closeTag:
 771 |                 s.append(space)
 772 |             s.append(closeTag)
 773 |             if prettyPrint and closeTag and self.nextSibling:
 774 |                 s.append("\n")
 775 |             s = ''.join(s)
 776 |         return s
 777 | 
 778 |     def decompose(self):
 779 |         """Recursively destroys the contents of this tree."""
 780 |         self.extract()
 781 |         if len(self.contents) == 0:
 782 |             return
 783 |         current = self.contents[0]
 784 |         while current is not None:
 785 |             next = current.next
 786 |             if isinstance(current, Tag):
 787 |                 del current.contents[:]
 788 |             current.parent = None
 789 |             current.previous = None
 790 |             current.previousSibling = None
 791 |             current.next = None
 792 |             current.nextSibling = None
 793 |             current = next
 794 | 
 795 |     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
 796 |         return self.__str__(encoding, True)
 797 | 
 798 |     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
 799 |                        prettyPrint=False, indentLevel=0):
 800 |         """Renders the contents of this tag as a string in the given
 801 |         encoding. If encoding is None, returns a Unicode string.."""
 802 |         s=[]
 803 |         for c in self:
 804 |             text = None
 805 |             if isinstance(c, NavigableString):
 806 |                 text = c.__str__(encoding)
 807 |             elif isinstance(c, Tag):
 808 |                 s.append(c.__str__(encoding, prettyPrint, indentLevel))
 809 |             if text and prettyPrint:
 810 |                 text = text.strip()
 811 |             if text:
 812 |                 if prettyPrint:
 813 |                     s.append(" " * (indentLevel-1))
 814 |                 s.append(text)
 815 |                 if prettyPrint:
 816 |                     s.append("\n")
 817 |         return ''.join(s)
 818 | 
 819 |     #Soup methods
 820 | 
 821 |     def find(self, name=None, attrs={}, recursive=True, text=None,
 822 |              **kwargs):
 823 |         """Return only the first child of this Tag matching the given
 824 |         criteria."""
 825 |         r = None
 826 |         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
 827 |         if l:
 828 |             r = l[0]
 829 |         return r
 830 |     findChild = find
 831 | 
 832 |     def findAll(self, name=None, attrs={}, recursive=True, text=None,
 833 |                 limit=None, **kwargs):
 834 |         """Extracts a list of Tag objects that match the given
 835 |         criteria.  You can specify the name of the Tag and any
 836 |         attributes you want the Tag to have.
 837 | 
 838 |         The value of a key-value pair in the 'attrs' map can be a
 839 |         string, a list of strings, a regular expression object, or a
 840 |         callable that takes a string and returns whether or not the
 841 |         string matches for some custom definition of 'matches'. The
 842 |         same is true of the tag name."""
 843 |         generator = self.recursiveChildGenerator
 844 |         if not recursive:
 845 |             generator = self.childGenerator
 846 |         return self._findAll(name, attrs, text, limit, generator, **kwargs)
 847 |     findChildren = findAll
 848 | 
    # Pre-3.x compatibility methods: `first` and `fetch` are plain
    # aliases bound to the same functions as find and findAll.
    first = find
    fetch = findAll
 852 | 
 853 |     def fetchText(self, text=None, recursive=True, limit=None):
 854 |         return self.findAll(text=text, recursive=recursive, limit=limit)
 855 | 
 856 |     def firstText(self, text=None, recursive=True):
 857 |         return self.find(text=text, recursive=recursive)
 858 | 
 859 |     #Private methods
 860 | 
 861 |     def _getAttrMap(self):
 862 |         """Initializes a map representation of this tag's attributes,
 863 |         if not already initialized."""
 864 |         if not getattr(self, 'attrMap'):
 865 |             self.attrMap = {}
 866 |             for (key, value) in self.attrs:
 867 |                 self.attrMap[key] = value
 868 |         return self.attrMap
 869 | 
 870 |     #Generator methods
 871 |     def childGenerator(self):
 872 |         # Just use the iterator from the contents
 873 |         return iter(self.contents)
 874 | 
 875 |     def recursiveChildGenerator(self):
 876 |         if not len(self.contents):
 877 |             raise StopIteration
 878 |         stopNode = self._lastRecursiveChild().next
 879 |         current = self.contents[0]
 880 |         while current is not stopNode:
 881 |             yield current
 882 |             current = current.next
 883 | 
 884 | 
 885 | # Next, a couple classes to represent queries and their results.
 886 | class SoupStrainer:
 887 |     """Encapsulates a number of ways of matching a markup element (tag or
 888 |     text)."""
 889 | 
 890 |     def __init__(self, name=None, attrs={}, text=None, **kwargs):
 891 |         self.name = name
 892 |         if isinstance(attrs, basestring):
 893 |             kwargs['class'] = _match_css_class(attrs)
 894 |             attrs = None
 895 |         if kwargs:
 896 |             if attrs:
 897 |                 attrs = attrs.copy()
 898 |                 attrs.update(kwargs)
 899 |             else:
 900 |                 attrs = kwargs
 901 |         self.attrs = attrs
 902 |         self.text = text
 903 | 
 904 |     def __str__(self):
 905 |         if self.text:
 906 |             return self.text
 907 |         else:
 908 |             return "%s|%s" % (self.name, self.attrs)
 909 | 
 910 |     def searchTag(self, markupName=None, markupAttrs={}):
 911 |         found = None
 912 |         markup = None
 913 |         if isinstance(markupName, Tag):
 914 |             markup = markupName
 915 |             markupAttrs = markup
 916 |         callFunctionWithTagData = callable(self.name) \
 917 |                                 and not isinstance(markupName, Tag)
 918 | 
 919 |         if (not self.name) \
 920 |                or callFunctionWithTagData \
 921 |                or (markup and self._matches(markup, self.name)) \
 922 |                or (not markup and self._matches(markupName, self.name)):
 923 |             if callFunctionWithTagData:
 924 |                 match = self.name(markupName, markupAttrs)
 925 |             else:
 926 |                 match = True
 927 |                 markupAttrMap = None
 928 |                 for attr, matchAgainst in self.attrs.items():
 929 |                     if not markupAttrMap:
 930 |                          if hasattr(markupAttrs, 'get'):
 931 |                             markupAttrMap = markupAttrs
 932 |                          else:
 933 |                             markupAttrMap = {}
 934 |                             for k,v in markupAttrs:
 935 |                                 markupAttrMap[k] = v
 936 |                     attrValue = markupAttrMap.get(attr)
 937 |                     if not self._matches(attrValue, matchAgainst):
 938 |                         match = False
 939 |                         break
 940 |             if match:
 941 |                 if markup:
 942 |                     found = markup
 943 |                 else:
 944 |                     found = markupName
 945 |         return found
 946 | 
 947 |     def search(self, markup):
 948 |         #print 'looking for %s in %s' % (self, markup)
 949 |         found = None
 950 |         # If given a list of items, scan it for a text element that
 951 |         # matches.
 952 |         if hasattr(markup, "__iter__") \
 953 |                 and not isinstance(markup, Tag):
 954 |             for element in markup:
 955 |                 if isinstance(element, NavigableString) \
 956 |                        and self.search(element):
 957 |                     found = element
 958 |                     break
 959 |         # If it's a Tag, make sure its name or attributes match.
 960 |         # Don't bother with Tags if we're searching for text.
 961 |         elif isinstance(markup, Tag):
 962 |             if not self.text:
 963 |                 found = self.searchTag(markup)
 964 |         # If it's text, make sure the text matches.
 965 |         elif isinstance(markup, NavigableString) or \
 966 |                  isinstance(markup, basestring):
 967 |             if self._matches(markup, self.text):
 968 |                 found = markup
 969 |         else:
 970 |             raise Exception, "I don't know how to match against a %s" \
 971 |                   % markup.__class__
 972 |         return found
 973 | 
 974 |     def _matches(self, markup, matchAgainst):
 975 |         #print "Matching %s against %s" % (markup, matchAgainst)
 976 |         result = False
 977 |         if matchAgainst is True:
 978 |             result = markup is not None
 979 |         elif callable(matchAgainst):
 980 |             result = matchAgainst(markup)
 981 |         else:
 982 |             #Custom match methods take the tag as an argument, but all
 983 |             #other ways of matching match the tag name as a string.
 984 |             if isinstance(markup, Tag):
 985 |                 markup = markup.name
 986 |             if markup and not isinstance(markup, basestring):
 987 |                 markup = unicode(markup)
 988 |             #Now we know that chunk is either a string, or None.
 989 |             if hasattr(matchAgainst, 'match'):
 990 |                 # It's a regexp object.
 991 |                 result = markup and matchAgainst.search(markup)
 992 |             elif hasattr(matchAgainst, '__iter__'): # list-like
 993 |                 result = markup in matchAgainst
 994 |             elif hasattr(matchAgainst, 'items'):
 995 |                 result = markup.has_key(matchAgainst)
 996 |             elif matchAgainst and isinstance(markup, basestring):
 997 |                 if isinstance(markup, unicode):
 998 |                     matchAgainst = unicode(matchAgainst)
 999 |                 else:
1000 |                     matchAgainst = str(matchAgainst)
1001 | 
1002 |             if not result:
1003 |                 result = matchAgainst == markup
1004 |         return result
1005 | 
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        """Creates an empty result list and records `source` on it."""
        # Initialise this instance; the previous list.__init__([])
        # initialised a throwaway list object instead of self.
        list.__init__(self)
        self.source = source
1012 | 
1013 | # Now, some helper functions.
1014 | 
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps.

    default -- value assigned to keys that come from a list or scalar.
    args    -- each argument is merged in order: a map (anything with
               items()) contributes its own key/value pairs, any other
               iterable maps each item to `default`, and everything
               else (including strings) is a single key mapped to
               `default`.

    Returns the combined dictionary."""
    # Strings are always scalars, even on interpreters where they
    # define __iter__ and would otherwise be split into characters.
    stringTypes = (str, type(u''))
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k, v in portion.items():
                built[k] = v
        elif hasattr(portion, '__iter__') \
                and not isinstance(portion, stringTypes):
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built
1033 | 
1034 | # Now, the parser classes.
1035 | 
1036 | class BeautifulStoneSoup(Tag, SGMLParser):
1037 | 
1038 |     """This class contains the basic parser and search code. It defines
1039 |     a parser that knows nothing about tag behavior except for the
1040 |     following:
1041 | 
1042 |       You can't close a tag without closing all the tags it encloses.
1043 |       That is, "<foo><bar></foo>" actually means
1044 |       "<foo><bar></bar></foo>".
1045 | 
1046 |     [Another possible explanation is "<foo><bar /></foo>", but since
1047 |     this class defines no SELF_CLOSING_TAGS, it will never use that
1048 |     explanation.]
1049 | 
1050 |     This class is useful for parsing XML or made-up markup languages,
1051 |     or when BeautifulSoup makes an assumption counter to what you were
1052 |     expecting."""
1053 | 
    # Tag-behavior tables. All of them are empty in this class.
    # SELF_CLOSING_TAGS is consulted by isSelfClosingTag();
    # NESTABLE_TAGS and RESET_NESTING_TAGS by _smartPop(); QUOTE_TAGS
    # by unknown_starttag(); PRESERVE_WHITESPACE_TAGS by endData().
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Default (regex, replacement function) pairs that _feed() applies
    # to the markup when markupMassage is enabled. The first inserts a
    # space before a "/>" tag ending; the second removes whitespace
    # that directly follows "<!".
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name given to the root tag in reset(); _feed() pops the tag
    # stack until the current tag has this name.
    ROOT_TAG_NAME = u'[document]'

    # Accepted values for the convertEntities constructor argument
    # (XML_ENTITIES is also the default for smartQuotesTo).
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone. This is a translate() table that deletes tab, LF, FF, CR
    # and space.
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1079 | 
1080 |     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1081 |                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
1082 |                  convertEntities=None, selfClosingTags=None, isHTML=False):
1083 |         """The Soup object is initialized as the 'root tag', and the
1084 |         provided markup (which can be a string or a file-like object)
1085 |         is fed into the underlying parser.
1086 | 
1087 |         sgmllib will process most bad HTML, and the BeautifulSoup
1088 |         class has some tricks for dealing with some HTML that kills
1089 |         sgmllib, but Beautiful Soup can nonetheless choke or lose data
1090 |         if your data uses self-closing tags or declarations
1091 |         incorrectly.
1092 | 
1093 |         By default, Beautiful Soup uses regexes to sanitize input,
1094 |         avoiding the vast majority of these problems. If the problems
1095 |         don't apply to you, pass in False for markupMassage, and
1096 |         you'll get better performance.
1097 | 
1098 |         The default parser massage techniques fix the two most common
1099 |         instances of invalid HTML that choke sgmllib:
1100 | 
1101 |          <br/> (No space between name of closing tag and tag close)
1102 |          <! --Comment--> (Extraneous whitespace in declaration)
1103 | 
1104 |         You can pass in a custom list of (RE object, replace method)
1105 |         tuples to get Beautiful Soup to scrub your input the way you
1106 |         want."""
1107 | 
1108 |         self.parseOnlyThese = parseOnlyThese
1109 |         self.fromEncoding = fromEncoding
1110 |         self.smartQuotesTo = smartQuotesTo
1111 |         self.convertEntities = convertEntities
1112 |         # Set the rules for how we'll deal with the entities we
1113 |         # encounter
1114 |         if self.convertEntities:
1115 |             # It doesn't make sense to convert encoded characters to
1116 |             # entities even while you're converting entities to Unicode.
1117 |             # Just convert it all to Unicode.
1118 |             self.smartQuotesTo = None
1119 |             if convertEntities == self.HTML_ENTITIES:
1120 |                 self.convertXMLEntities = False
1121 |                 self.convertHTMLEntities = True
1122 |                 self.escapeUnrecognizedEntities = True
1123 |             elif convertEntities == self.XHTML_ENTITIES:
1124 |                 self.convertXMLEntities = True
1125 |                 self.convertHTMLEntities = True
1126 |                 self.escapeUnrecognizedEntities = False
1127 |             elif convertEntities == self.XML_ENTITIES:
1128 |                 self.convertXMLEntities = True
1129 |                 self.convertHTMLEntities = False
1130 |                 self.escapeUnrecognizedEntities = False
1131 |         else:
1132 |             self.convertXMLEntities = False
1133 |             self.convertHTMLEntities = False
1134 |             self.escapeUnrecognizedEntities = False
1135 | 
1136 |         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1137 |         SGMLParser.__init__(self)
1138 | 
1139 |         if hasattr(markup, 'read'):        # It's a file-type object.
1140 |             markup = markup.read()
1141 |         self.markup = markup
1142 |         self.markupMassage = markupMassage
1143 |         try:
1144 |             self._feed(isHTML=isHTML)
1145 |         except StopParsing:
1146 |             pass
1147 |         self.markup = None                 # The markup can now be GCed
1148 | 
1149 |     def convert_charref(self, name):
1150 |         """This method fixes a bug in Python's SGMLParser."""
1151 |         try:
1152 |             n = int(name)
1153 |         except ValueError:
1154 |             return
1155 |         if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1156 |             return
1157 |         return self.convert_codepoint(n)
1158 | 
1159 |     def _feed(self, inDocumentEncoding=None, isHTML=False):
1160 |         # Convert the document to Unicode.
1161 |         markup = self.markup
1162 |         if isinstance(markup, unicode):
1163 |             if not hasattr(self, 'originalEncoding'):
1164 |                 self.originalEncoding = None
1165 |         else:
1166 |             dammit = UnicodeDammit\
1167 |                      (markup, [self.fromEncoding, inDocumentEncoding],
1168 |                       smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
1169 |             markup = dammit.unicode
1170 |             self.originalEncoding = dammit.originalEncoding
1171 |             self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
1172 |         if markup:
1173 |             if self.markupMassage:
1174 |                 if not hasattr(self.markupMassage, "__iter__"):
1175 |                     self.markupMassage = self.MARKUP_MASSAGE
1176 |                 for fix, m in self.markupMassage:
1177 |                     markup = fix.sub(m, markup)
1178 |                 # TODO: We get rid of markupMassage so that the
1179 |                 # soup object can be deepcopied later on. Some
1180 |                 # Python installations can't copy regexes. If anyone
1181 |                 # was relying on the existence of markupMassage, this
1182 |                 # might cause problems.
1183 |                 del(self.markupMassage)
1184 |         self.reset()
1185 | 
1186 |         SGMLParser.feed(self, markup)
1187 |         # Close out any unfinished strings and close all the open tags.
1188 |         self.endData()
1189 |         while self.currentTag.name != self.ROOT_TAG_NAME:
1190 |             self.popTag()
1191 | 
1192 |     def __getattr__(self, methodName):
1193 |         """This method routes method call requests to either the SGMLParser
1194 |         superclass or the Tag superclass, depending on the method name."""
1195 |         #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1196 | 
1197 |         if methodName.startswith('start_') or methodName.startswith('end_') \
1198 |                or methodName.startswith('do_'):
1199 |             return SGMLParser.__getattr__(self, methodName)
1200 |         elif not methodName.startswith('__'):
1201 |             return Tag.__getattr__(self, methodName)
1202 |         else:
1203 |             raise AttributeError
1204 | 
1205 |     def isSelfClosingTag(self, name):
1206 |         """Returns true iff the given string is the name of a
1207 |         self-closing tag according to this parser."""
1208 |         return self.SELF_CLOSING_TAGS.has_key(name) \
1209 |                or self.instanceSelfClosingTags.has_key(name)
1210 | 
    def reset(self):
        """Reinitializes this object as an empty root tag and clears
        the parser state. Called by _feed() before the markup is fed
        to SGMLParser."""
        # Re-run the Tag constructor with this object as its own
        # parser argument and ROOT_TAG_NAME as its name.
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        # __str__ renders only the contents of a hidden tag.
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        # currentTag must be cleared before pushTag() so that the root
        # is not appended to a previous tag's contents.
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)
1220 | 
1221 |     def popTag(self):
1222 |         tag = self.tagStack.pop()
1223 | 
1224 |         #print "Pop", tag.name
1225 |         if self.tagStack:
1226 |             self.currentTag = self.tagStack[-1]
1227 |         return self.currentTag
1228 | 
1229 |     def pushTag(self, tag):
1230 |         #print "Push", tag.name
1231 |         if self.currentTag:
1232 |             self.currentTag.contents.append(tag)
1233 |         self.tagStack.append(tag)
1234 |         self.currentTag = self.tagStack[-1]
1235 | 
1236 |     def endData(self, containerClass=NavigableString):
1237 |         if self.currentData:
1238 |             currentData = u''.join(self.currentData)
1239 |             if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
1240 |                 not set([tag.name for tag in self.tagStack]).intersection(
1241 |                     self.PRESERVE_WHITESPACE_TAGS)):
1242 |                 if '\n' in currentData:
1243 |                     currentData = '\n'
1244 |                 else:
1245 |                     currentData = ' '
1246 |             self.currentData = []
1247 |             if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1248 |                    (not self.parseOnlyThese.text or \
1249 |                     not self.parseOnlyThese.search(currentData)):
1250 |                 return
1251 |             o = containerClass(currentData)
1252 |             o.setup(self.currentTag, self.previous)
1253 |             if self.previous:
1254 |                 self.previous.next = o
1255 |             self.previous = o
1256 |             self.currentTag.contents.append(o)
1257 | 
1258 | 
1259 |     def _popToTag(self, name, inclusivePop=True):
1260 |         """Pops the tag stack up to and including the most recent
1261 |         instance of the given tag. If inclusivePop is false, pops the tag
1262 |         stack up to but *not* including the most recent instqance of
1263 |         the given tag."""
1264 |         #print "Popping to %s" % name
1265 |         if name == self.ROOT_TAG_NAME:
1266 |             return
1267 | 
1268 |         numPops = 0
1269 |         mostRecentTag = None
1270 |         for i in range(len(self.tagStack)-1, 0, -1):
1271 |             if name == self.tagStack[i].name:
1272 |                 numPops = len(self.tagStack)-i
1273 |                 break
1274 |         if not inclusivePop:
1275 |             numPops = numPops - 1
1276 | 
1277 |         for i in range(0, numPops):
1278 |             mostRecentTag = self.popTag()
1279 |         return mostRecentTag
1280 | 
1281 |     def _smartPop(self, name):
1282 | 
1283 |         """We need to pop up to the previous tag of this type, unless
1284 |         one of this tag's nesting reset triggers comes between this
1285 |         tag and the previous tag of this type, OR unless this tag is a
1286 |         generic nesting trigger and another generic nesting trigger
1287 |         comes between this tag and the previous tag of this type.
1288 | 
1289 |         Examples:
1290 |          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1291 |          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1292 |          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1293 | 
1294 |          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1295 |          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1296 |          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1297 |         """
1298 | 
1299 |         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1300 |         isNestable = nestingResetTriggers != None
1301 |         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1302 |         popTo = None
1303 |         inclusive = True
1304 |         for i in range(len(self.tagStack)-1, 0, -1):
1305 |             p = self.tagStack[i]
1306 |             if (not p or p.name == name) and not isNestable:
1307 |                 #Non-nestable tags get popped to the top or to their
1308 |                 #last occurance.
1309 |                 popTo = name
1310 |                 break
1311 |             if (nestingResetTriggers is not None
1312 |                 and p.name in nestingResetTriggers) \
1313 |                 or (nestingResetTriggers is None and isResetNesting
1314 |                     and self.RESET_NESTING_TAGS.has_key(p.name)):
1315 | 
1316 |                 #If we encounter one of the nesting reset triggers
1317 |                 #peculiar to this tag, or we encounter another tag
1318 |                 #that causes nesting to reset, pop up to but not
1319 |                 #including that tag.
1320 |                 popTo = p.name
1321 |                 inclusive = False
1322 |                 break
1323 |             p = p.parent
1324 |         if popTo:
1325 |             self._popToTag(popTo, inclusive)
1326 | 
    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handles an opening tag.

        name        -- tag name.
        attrs       -- iterable of (attribute, value) pairs.
        selfClosing -- true to treat the tag as self-closing even if
                       isSelfClosingTag(name) is false.

        Returns the new Tag, or None when the tag was turned into text
        (inside a quoted section) or rejected by parseOnlyThese."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            # Rebuild the tag's source text and buffer it as data.
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        # Flush any pending text before touching the tag stack.
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # With a strainer in effect and at most the root on the stack,
        # skip the tag if the strainer has a text criterion or does
        # not match this tag.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        # Maintain the previous/next chain of parsed elements.
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        # A self-closing tag is popped again immediately.
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            # While quoteStack is non-empty, start tags are buffered as
            # text (see the top of this method).
            self.quoteStack.append(name)
            self.literal = 1
        return tag
1356 | 
1357 |     def unknown_endtag(self, name):
1358 |         #print "End tag %s" % name
1359 |         if self.quoteStack and self.quoteStack[-1] != name:
1360 |             #This is not a real end tag.
1361 |             #print "</%s> is not real!" % name
1362 |             self.handle_data('</%s>' % name)
1363 |             return
1364 |         self.endData()
1365 |         self._popToTag(name)
1366 |         if self.quoteStack and self.quoteStack[-1] == name:
1367 |             self.quoteStack.pop()
1368 |             self.literal = (len(self.quoteStack) > 0)
1369 | 
1370 |     def handle_data(self, data):
1371 |         self.currentData.append(data)
1372 | 
1373 |     def _toStringSubclass(self, text, subclass):
1374 |         """Adds a certain piece of text to the tree as a NavigableString
1375 |         subclass."""
1376 |         self.endData()
1377 |         self.handle_data(text)
1378 |         self.endData(subclass)
1379 | 
1380 |     def handle_pi(self, text):
1381 |         """Handle a processing instruction as a ProcessingInstruction
1382 |         object, possibly one with a %SOUP-ENCODING% slot into which an
1383 |         encoding will be plugged later."""
1384 |         if text[:3] == "xml":
1385 |             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1386 |         self._toStringSubclass(text, ProcessingInstruction)
1387 | 
1388 |     def handle_comment(self, text):
1389 |         "Handle comments as Comment objects."
1390 |         self._toStringSubclass(text, Comment)
1391 | 
1392 |     def handle_charref(self, ref):
1393 |         "Handle character references as data."
1394 |         if self.convertEntities:
1395 |             data = unichr(int(ref))
1396 |         else:
1397 |             data = '&#%s;' % ref
1398 |         self.handle_data(data)
1399 | 
1400 |     def handle_entityref(self, ref):
1401 |         """Handle entity references as data, possibly converting known
1402 |         HTML and/or XML entity references to the corresponding Unicode
1403 |         characters."""
1404 |         data = None
1405 |         if self.convertHTMLEntities:
1406 |             try:
1407 |                 data = unichr(name2codepoint[ref])
1408 |             except KeyError:
1409 |                 pass
1410 | 
1411 |         if not data and self.convertXMLEntities:
1412 |                 data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1413 | 
1414 |         if not data and self.convertHTMLEntities and \
1415 |             not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1416 |                 # TODO: We've got a problem here. We're told this is
1417 |                 # an entity reference, but it's not an XML entity
1418 |                 # reference or an HTML entity reference. Nonetheless,
1419 |                 # the logical thing to do is to pass it through as an
1420 |                 # unrecognized entity reference.
1421 |                 #
1422 |                 # Except: when the input is "&carol;" this function
1423 |                 # will be called with input "carol". When the input is
1424 |                 # "AT&T", this function will be called with input
1425 |                 # "T". We have no way of knowing whether a semicolon
1426 |                 # was present originally, so we don't know whether
1427 |                 # this is an unknown entity or just a misplaced
1428 |                 # ampersand.
1429 |                 #
1430 |                 # The more common case is a misplaced ampersand, so I
1431 |                 # escape the ampersand and omit the trailing semicolon.
1432 |                 data = "&amp;%s" % ref
1433 |         if not data:
1434 |             # This case is different from the one above, because we
1435 |             # haven't already gone through a supposedly comprehensive
1436 |             # mapping of entities to Unicode characters. We might not
1437 |             # have gone through any mapping at all. So the chances are
1438 |             # very high that this is a real entity, and not a
1439 |             # misplaced ampersand.
1440 |             data = "&%s;" % ref
1441 |         self.handle_data(data)
1442 | 
1443 |     def handle_decl(self, data):
1444 |         "Handle DOCTYPEs and the like as Declaration objects."
1445 |         self._toStringSubclass(data, Declaration)
1446 | 
1447 |     def parse_declaration(self, i):
1448 |         """Treat a bogus SGML declaration as raw data. Treat a CDATA
1449 |         declaration as a CData object."""
1450 |         j = None
1451 |         if self.rawdata[i:i+9] == '<![CDATA[':
1452 |              k = self.rawdata.find(']]>', i)
1453 |              if k == -1:
1454 |                  k = len(self.rawdata)
1455 |              data = self.rawdata[i+9:k]
1456 |              j = k+3
1457 |              self._toStringSubclass(data, CData)
1458 |         else:
1459 |             try:
1460 |                 j = SGMLParser.parse_declaration(self, i)
1461 |             except SGMLParseError:
1462 |                 toHandle = self.rawdata[i:]
1463 |                 self.handle_data(toHandle)
1464 |                 j = i + len(toHandle)
1465 |         return j
1466 | 
class BeautifulSoup(BeautifulStoneSoup):

    """A parser for run-of-the-mill HTML. It knows the following facts
    about HTML:

    * Some tags have no closing tag and should be treated as closed
      the moment they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want that text parsed as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurrence
      of a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the
      occurrence of a <blockquote> tag should _not_ implicitly close
      the previous <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup does
    not treat as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Set up the parser in HTML mode.

        smartQuotesTo falls back to self.HTML_ENTITIES when the caller
        does not pass it, and isHTML is always forced to True; every
        other argument goes to BeautifulStoneSoup.__init__ untouched."""
        if 'smartQuotesTo' not in kwargs:
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(
        None,
        ('br' , 'hr', 'input', 'img', 'meta',
         'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    # Per the HTML standard, each of these inline tags may contain
    # another tag of the same type, and in practice they are commonly
    # used that way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub',
                            'sup', 'center')

    # Per the HTML standard, these block tags may contain another tag
    # of the same type, and in practice they are commonly used that
    # way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    # Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = {
        'ol' : [],
        'ul' : [],
        'li' : ['ul', 'ol'],
        'dl' : [],
        'dd' : ['dl'],
        'dt' : ['dl'],
    }

    # Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {
        'table' : [],
        'tr' : ['table', 'tbody', 'tfoot', 'thead'],
        'td' : ['tr'],
        'th' : ['tr'],
        'thead' : ['table'],
        'tbody' : ['table'],
        'tfoot' : ['table'],
    }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    # If one of these tags is encountered, all tags up to the next tag
    # of this type are popped.
    RESET_NESTING_TAGS = buildTagMap(
        None,
        NESTABLE_BLOCK_TAGS,
        'noscript',
        NON_NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap(
        [],
        NESTABLE_INLINE_TAGS,
        NESTABLE_BLOCK_TAGS,
        NESTABLE_LIST_TAGS,
        NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Handle a META start tag, watching for a declared charset.

        Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning (signalled by raising
        StopParsing). When the encoding is already settled, the
        charset in the tag is replaced by a %SOUP-ENCODING% slot
        instead."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        for position, (key, value) in enumerate(attrs):
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = position

        match = None
        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)

        if match:
            encodingSettled = (self.declaredHTMLEncoding is not None or
                               self.originalEncoding == self.fromEncoding)
            if encodingSettled:
                # An HTML encoding was sniffed while converting the
                # document to Unicode, or during a previous pass
                # through the document, or an encoding was specified
                # explicitly and it worked. Rewrite the meta tag.
                newAttr = self.CHARSET_RE.sub(
                    lambda m: m.group(1) + "%SOUP-ENCODING%", contentType)
                originalKey = attrs[contentTypeIndex][0]
                attrs[contentTypeIndex] = (originalKey, newAttr)
                tagNeedsEncodingSubstitution = True
            else:
                # This is our first pass through the document.
                # Go through it again with the encoding information.
                newCharset = match.group(3)
                if newCharset and newCharset != self.originalEncoding:
                    self.declaredHTMLEncoding = newCharset
                    self._feed(self.declaredHTMLEncoding)
                    raise StopParsing

        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
1619 | 
class StopParsing(Exception):
    """Raised by BeautifulSoup.start_meta once the document has been
    re-fed with a newly declared encoding, to abandon the current
    parsing pass."""
1622 | 
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """A BeautifulSoup variant that treats more tags as nestable.

    The BeautifulSoup class is oriented towards skipping over common
    HTML errors like unclosed tags. However, sometimes it makes errors
    of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = (
        'em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
        'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
        'big',
    )

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    # Everything BeautifulSoup already nests, plus the tags above.
    NESTABLE_TAGS = buildTagMap(
        [],
        BeautifulSoup.NESTABLE_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
        I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1658 | 
class MinimalSoup(BeautifulSoup):
    """A parser for HTML that contains pathologically bad markup.

    It makes no assumptions about tag nesting, but it does know which
    tags are self-closing, that <script> tags contain Javascript and
    should not be parsed, that META tags may contain encoding
    information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # NOTE(review): the other buildTagMap calls visible nearby pass a
    # default value as the first argument, so 'noscript' here is
    # presumably taken as the default rather than as a tag name --
    # confirm against buildTagMap before relying on this map.
    RESET_NESTING_TAGS = buildTagMap('noscript')

    # No tag is treated as nestable.
    NESTABLE_TAGS = {}
1671 | 
class BeautifulSOAP(BeautifulStoneSoup):
    """A parser that copies single-string child tags into their
    parent's attributes.

    A tag with only a single string child is pushed into the tag's
    parent as an attribute. The attribute's name is the tag name, and
    the value is the string child. An example should give the flavor
    of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Pop the current tag, first mirroring it onto its parent as
        an attribute when it holds exactly one string and the parent
        has no attribute of that name yet."""
        stack = self.tagStack
        if len(stack) > 1:
            tag, parent = stack[-1], stack[-2]
            parent._getAttrMap()
            if isinstance(tag, Tag) and len(tag.contents) == 1:
                onlyChild = tag.contents[0]
                if (isinstance(onlyChild, NavigableString) and
                    tag.name not in parent.attrMap):
                    parent[tag.name] = onlyChild
        BeautifulStoneSoup.popTag(self)
1702 | 
1703 | #Enterprise class names! It has come to our attention that some people
1704 | #think the names of the Beautiful Soup parser classes are too silly
1705 | #and "unprofessional" for use in enterprise screen-scraping. We feel
1706 | #your pain! For such-minded folk, the Beautiful Soup Consortium And
1707 | #All-Night Kosher Bakery recommends renaming this file to
1708 | #"RobustParser.py" (or, in cases of extreme enterprisiness,
1709 | #"RobustParserBeanInterface.class") and using the following
1710 | #enterprise-friendly class aliases:
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""

class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""

class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly alias for ICantBelieveItsBeautifulSoup."""

class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly alias for MinimalSoup."""

class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
1721 | 
1722 | ######################################################
1723 | #
1724 | # Bonus library: Unicode, Dammit
1725 | #
1726 | # This class forces XML data into a standard format (usually to UTF-8
1727 | # or Unicode).  It is heavily based on code from Mark Pilgrim's
1728 | # Universal Feed Parser. It does not rewrite the XML or HTML to
1729 | # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1730 | # (XML) and BeautifulSoup.start_meta (HTML).
1731 | 
1732 | # Autodetects character encodings.
1733 | # Download from http://chardet.feedparser.org/
1734 | try:
1735 |     import chardet
1736 | #    import chardet.constants
1737 | #    chardet.constants._debug = 1
1738 | except ImportError:
1739 |     chardet = None
1740 | 
1741 | # cjkcodecs and iconv_codec make Python know about more character encodings.
1742 | # Both are available from http://cjkpython.i18n.org/
1743 | # They're built in if you use Python 2.4.
1744 | try:
1745 |     import cjkcodecs.aliases
1746 | except ImportError:
1747 |     pass
1748 | try:
1749 |     import iconv_codec
1750 | except ImportError:
1751 |     pass
1752 | 
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    After construction:
      self.unicode          -- the converted document, or None if no
                               candidate encoding worked.
      self.originalEncoding -- the codec that worked, or None.
      self.triedEncodings   -- the codecs attempted, in order.
      self.declaredHTMLEncoding -- encoding declared inside the
                               document when isHTML is true, else None.
    """

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        """Convert markup to Unicode, trying candidate encodings in
        order until one succeeds.

        markup            -- the document, as a byte string or unicode.
        overrideEncodings -- encodings to try before anything detected
                             in the document. The default list is only
                             iterated, never mutated.
        smartQuotesTo     -- 'xml' to turn MS smart quotes into numeric
                             references, any other true value for named
                             entities, or a false value to leave them.
        isHTML            -- also look for a charset in a META tag.
        """
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        # Explicitly requested encodings take priority.
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        # Then whatever the document declares, then what was sniffed.
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity.

        MS_CHARS maps the character either to a plain replacement
        string, returned as-is, or to an (entity name, hex code) tuple.
        For a tuple, the numeric reference is built when
        self.smartQuotesTo is 'xml' and the named entity otherwise."""
        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the proposed encoding.

        Returns the decoded document on success (also stored in
        self.markup, with self.originalEncoding set). Returns None if
        the encoding is empty, was already tried, or fails."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub(
                lambda match: self._subMSChar(match.group(1)),
                markup)

        try:
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this
            # candidate encoding does not fit the document.
            return None
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases

        A byte order mark at the start of the data overrides the
        requested encoding and is stripped before decoding.'''

        # strip Byte Order Mark (if present)
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns a 3-tuple (xml_data, xml_encoding,
        sniffed_xml_encoding): the document (re-encoded as UTF-8 when a
        UTF-16/UTF-32 signature was recognised and decoded), the
        encoding declared inside the document or None, and the
        encoding guessed from the leading bytes or None. When isHTML is
        true, a declared encoding is also stored in
        self.declaredHTMLEncoding."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
        except Exception:
            # Best effort: if the sniffed signature turns out to be
            # wrong (or its codec is unavailable), carry on with the
            # data as it was and whatever encoding was sniffed.
            pass
        xml_encoding_match = re.compile(
            r'''^<\?.*encoding=['"](.*?)['"].*\?>''').match(xml_data)
        if not xml_encoding_match and isHTML:
            regexp = re.compile(
                r'''<\s*meta[^>]+charset=([^>]*?)[;'">]''', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            # A declared multi-byte family name is replaced by the
            # specific encoding sniffed from the leading bytes.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a charset name to a codec name Python can look up.

        Tries, in order: the CHARSET_ALIASES entry (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores. Falls back to returning
        charset unchanged."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return charset if codecs.lookup() accepts it, else None.
        A false charset is returned unchanged."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Translation table built on first use by _ebcdic_to_ascii and
    # cached on the class.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate the byte string s through the EBCDIC-to-ASCII
        table, building and caching the table on first use."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Replacements for the bytes 0x80-0x9f. A tuple is
    # (HTML entity name, hex code point) and is turned into a reference
    # by _subMSChar; a plain string is substituted as-is.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 # 0x9f is U+0178; an empty code here used to yield the
                 # invalid reference '&#x;'.
                 '\x9f' : ('Yuml', '178'),}
2006 | 
2007 | #######################################################################
2008 | 
2009 | 
#By default, act as an HTML pretty-printer.
# When this file is run as a script, the markup is read from standard
# input, parsed with the BeautifulSoup class, and the result of
# prettify() is printed to standard output.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
2015 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Google Search API
  2 | =====
  3 | 
Google Search API is a Python-based library for querying Google services such as web search, the calculator, and image search.  It uses screen scraping to retrieve the results, and thus may stop working if the format of Google's web pages changes in the future.
  5 | 
*Disclaimer: This software uses screen scraping to retrieve search results from google.com, and therefore this software may stop working at any given time.  Use this software at your own risk. I assume no responsibility for how this software API is used by others.*
  7 | 
  8 | ## Google Web Search
  9 | You can search google web in the following way:
 10 | 
 11 | ```python
 12 | search_results = Google.search("This is my query")
 13 | ```
 14 | 
 15 | `search_results` will contain a list of `GoogleResult` objects
 16 | 
 17 | ```python
 18 | GoogleResult:
 19 |     self.name # The title of the link
 20 |     self.link # The link url
 21 |     self.description # The description of the link
 22 |     self.thumb # The link to a thumbnail of the website (not implemented yet)
 23 |     self.cached # A link to the cached version of the page
 24 |     self.page # What page this result was on (When searching more than one page)
 25 |     self.index # What index on this page it was on
 26 | ```
 27 |     
 28 | 
 29 | ## Google Calculator
 30 | Attempts to search google calculator for the result of an expression. Returns a `CalculatorResult` if successful or `None` if it fails.
 31 | 
 32 | ```python
 33 | Google.calculate("157.3kg in grams")
 34 | ```
 35 | 
 36 | ```python
 37 | {'expr': u'157.3 kilograms',
 38 |  'fullstring': u'157.3 kilograms = 157\xa0300 grams',
 39 |  'result': u'157 300 grams',
 40 |  'unit': u'grams',
 41 |  'value': u'157300'}
 42 | ```
 43 | 
 44 |     
 45 | ```python
 46 | Google.calculate("cos(25 pi) / 17.4")
 47 | ```
 48 | 
 49 | ```python
 50 | {'expr': u'cos(25 * pi) / 17.4',
 51 |  'fullstring': u'cos(25 * pi) / 17.4 = -0.0574712644',
 52 |  'result': u'-0.0574712644',
 53 |  'unit': None,
 54 |  'value': u'-0.0574712644'}
 55 | ```
 56 |     
 57 | ## Google Image Search
 58 | Searches google images for a list of images.  Image searches can be filtered to produce better results.
 59 | 
 60 | Perform a google image search on "banana" and filter it:
 61 | 
 62 | ```python
 63 | options = ImageOptions()
 64 | options.image_type = ImageType.CLIPART
 65 | options.larger_than = LargerThan.MP_4
 66 | options.color = "green"
 67 | results = Google.search_images("banana", options)
 68 | ```
 69 |     
 70 | Sample Result:
 71 | 
 72 | ```python
 73 | {'domain': u'exitrealworld.com',
 74 |  'filesize': u'4054k',
 75 |  'format': u'jpg',
 76 |  'height': u'3103',
 77 |  'index': 0,
 78 |  'link': u'http://www.exitrealworld.com/tools_v2/resources/9e55471ba84686ade677ffe595c45992/upload_images/YELLOW_BANANA.jpg',
 79 |  'name': u'Lib Tech Skate Banana BTX',
 80 |  'page': 0,
 81 |  'thumb': u'http://t3.gstatic.com/images?q=tbn:ANd9GcRzvAUW0en9eZTag3giWelcQ_xbrnBMXVChb3RU3v4HtEgxN3RMS0bSdidf',
 82 |  'width': u'3104'}
 83 | ```
 84 |      
 85 | Filter options:
 86 | 
 87 | ```python        
 88 | ImageOptions:
 89 |     image_type # face, body, clipart, line drawing
 90 |     size_category # large, small, icon
 91 |     larger_than # the well known name of the smallest image size you want
 92 |     exact_width # the exact width of the image you want
 93 |     exact_height # the exact height of the image you want
 94 |     color_type # color, b&w, specific
 95 |     color # blue, green, red
 96 | ```
 97 |         
 98 | Enums of values that can be used to filter image searches:
 99 | 
100 | ```python
101 | class ImageType:
102 |     NONE = None
103 |     FACE = "face"
104 |     PHOTO = "photo"
105 |     CLIPART = "clipart"
106 |     LINE_DRAWING = "lineart"
107 |     
108 | class SizeCategory:
109 |     NONE = None
110 |     ICON = "i"
111 |     LARGE = "l"
112 |     MEDIUM = "m"
113 |     SMALL = "s"
114 |     LARGER_THAN = "lt"
115 |     EXACTLY = "ex"
116 |     
117 | class LargerThan:
118 |     NONE = None
119 |     QSVGA = "qsvga" # 400 x 300
120 |     VGA = "vga"     # 640 x 480
121 |     SVGA = "svga"   # 800 x 600
122 |     XGA = "xga"     # 1024 x 768
123 |     MP_2 = "2mp"    # 2 MP (1600 x 1200)
124 |     MP_4 = "4mp"    # 4 MP (2272 x 1704)
125 |     MP_6 = "6mp"    # 6 MP (2816 x 2112)
126 |     MP_8 = "8mp"    # 8 MP (3264 x 2448)
127 |     MP_10 = "10mp"  # 10 MP (3648 x 2736)
128 |     MP_12 = "12mp"  # 12 MP (4096 x 3072)
129 |     MP_15 = "15mp"  # 15 MP (4480 x 3360)
130 |     MP_20 = "20mp"  # 20 MP (5120 x 3840)
131 |     MP_40 = "40mp"  # 40 MP (7216 x 5412)
132 |     MP_70 = "70mp"  # 70 MP (9600 x 7200)
133 | 
134 | class ColorType:
135 |     NONE = None
136 |     COLOR = "color"
137 |     BLACK_WHITE = "gray"
138 |     SPECIFIC = "specific"
139 | ```
140 | 
141 | ## Google Currency Converter (Exchange Rates)
142 | Convert between one currency and another using google calculator. Results are real time and can change at any time based on the current exchange rate according to google.
143 | 
144 | Convert 5 US Dollars to Euros using the official 3 letter currency acronym:
145 | 
146 | ```python
147 | euros = Google.convert_currency(5.0, "USD", "EUR")
148 | print "5.0 USD = {0} EUR".format(euros)
149 | ```
150 | 
151 | ```python
152 | 5.0 USD = 3.82350692 EUR
153 | ```
154 | 
155 | Convert 1000 Japanese Yen to US Dollars:
156 | 
157 | ```python
158 | yen = Google.convert_currency(1000, "yen", "us dollars")
159 | print "1000 yen = {0} us dollars".format(yen)
160 | ```
161 | 
162 | ```python
163 | 1000 yen = 12.379 us dollars
164 | ```
165 | 
166 | Instead you can get the exchange rate which returns what 1 `from_currency` equals in `to_currency` and do your own math:
167 | 
168 | ```python
169 | rate = Google.exchange_rate("dollars", "pesos")
170 | print "dollars -> pesos exchange rate = {0}".format(rate)
171 | ```
172 | 
173 | ```python
174 | dollars -> pesos exchange rate = 13.1580679
175 | ```
176 | 
177 | Perform your own math. The following 2 statements are equal:
178 | 
179 | ```python
180 | 5.0 * Google.exchange_rate("USD", "EUR")
181 | ```
182 | 
183 | ```python
184 | Google.convert_currency(5.0, "USD", "EUR")
185 | ```
186 | 
187 | As a side note, `convert_currency` is always more accurate than performing your own math on `exchange_rate` because of possible rounding errors. However if you have more than one value to convert it is best to call `exchange_rate` and cache the result to use for multiple calculations instead of querying the google server for each one.
188 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | 


--------------------------------------------------------------------------------
/google.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | from BeautifulSoup import BeautifulSoup
  4 | from pprint import pprint
  5 | import os
  6 | import threading
  7 | import httplib
  8 | import urllib
  9 | import urllib2
 10 | import sys
 11 | import re
 12 | try:
 13 |     import json
 14 | except ImportError:
 15 |     import simplejson as json
 16 | 
 17 | __author__ = "Anthony Casagrande <birdapi@gmail.com>"
 18 | __version__ = "0.9"
 19 | 
 20 | """
 21 | Represents a standard google search result
 22 | """
 23 | class GoogleResult:
 24 |     def __init__(self):
 25 |         self.name = None
 26 |         self.link = None
 27 |         self.description = None
 28 |         self.thumb = None
 29 |         self.cached = None
 30 |         self.page = None
 31 |         self.index = None
 32 |         
 33 | """
 34 | Represents a result returned from google calculator
 35 | """        
 36 | class CalculatorResult:
 37 |     def __init__(self):
 38 |         self.value = None
 39 |         self.unit = None
 40 |         self.expr = None
 41 |         self.result = None
 42 |         self.fullstring = None
 43 | 
 44 | class ShoppingResult:
 45 |     def __init__(self):
 46 |         self.name = None
 47 |         self.link = None
 48 |         self.thumb = None
 49 |         self.subtext = None
 50 |         self.description = None
 51 |         self.compare_url = None
 52 |         self.store_count = None
 53 |         self.min_price = None
 54 | 
 55 | """
 56 | Represents a google image search result
 57 | """
 58 | class ImageResult:  
 59 |     def __init__(self):
 60 |         self.name = None
 61 |         self.link = None
 62 |         self.thumb = None
 63 |         self.thumb_width = None
 64 |         self.thumb_height = None
 65 |         self.width = None
 66 |         self.height = None
 67 |         self.filesize = None
 68 |         self.format = None
 69 |         self.domain = None
 70 |         self.page = None
 71 |         self.index = None
 72 |         
 73 | class ImageOptions:
 74 |     def __init__(self):
 75 |         self.image_type = None
 76 |         self.size_category = None
 77 |         self.larger_than = None
 78 |         self.exact_width = None
 79 |         self.exact_height = None
 80 |         self.color_type = None
 81 |         self.color = None
 82 |         
 83 |     def get_tbs(self):
 84 |         tbs = None
 85 |         if self.image_type:
 86 |             # clipart
 87 |             tbs = add_to_tbs(tbs, "itp", self.image_type)
 88 |         if self.size_category and not (self.larger_than or (self.exact_width and self.exact_height)): 
 89 |             # i = icon, l = large, m = medium, lt = larger than, ex = exact
 90 |             tbs = add_to_tbs(tbs, "isz", self.size_category)
 91 |         if self.larger_than:   
 92 |             # qsvga,4mp
 93 |             tbs = add_to_tbs(tbs, "isz", SizeCategory.LARGER_THAN)
 94 |             tbs = add_to_tbs(tbs, "islt", self.larger_than)
 95 |         if self.exact_width and self.exact_height:
 96 |             tbs = add_to_tbs(tbs, "isz", SizeCategory.EXACTLY)
 97 |             tbs = add_to_tbs(tbs, "iszw", self.exact_width)
 98 |             tbs = add_to_tbs(tbs, "iszh", self.exact_height)
 99 |         if self.color_type and not self.color:
100 |             # color = color, gray = black and white, specific = user defined
101 |             tbs = add_to_tbs(tbs, "ic", self.color_type)
102 |         if self.color:
103 |             tbs = add_to_tbs(tbs, "ic", ColorType.SPECIFIC)
104 |             tbs = add_to_tbs(tbs, "isc", self.color)
105 |         return tbs
106 |         
107 | """
108 | Defines the public static api methods
109 | """
110 | class Google:
111 |     DEBUG_MODE = False
112 | 
113 |     """
114 |     Returns a list of GoogleResult
115 |     """
116 |     @staticmethod
117 |     def search(query, pages = 1):
118 |         results = []
119 |         for i in range(pages):
120 |             url = get_search_url(query, i)
121 |             html = get_html(url)
122 |             if html:
123 |                 if Google.DEBUG_MODE:
124 |                     write_html_to_file(html, "{0}_{1}.html".format(query.replace(" ", "_"), i))
125 |                 soup = BeautifulSoup(html)
126 |                 lis = soup.findAll("li", attrs = { "class" : "g" })
127 |                 j = 0
128 |                 for li in lis:
129 |                     res = GoogleResult()
130 |                     res.page = i
131 |                     res.index = j
132 |                     a = li.find("a")
133 |                     res.name = a.text.strip()
134 |                     res.link = a["href"]
135 |                     if res.link.startswith("/search?"):
136 |                         # this is not an external link, so skip it
137 |                         continue
138 |                     sdiv = li.find("div", attrs = { "class" : "s" })
139 |                     if sdiv:
140 |                         res.description = sdiv.text.strip()
141 |                     results.append(res)
142 |                     j = j + 1
143 |         return results
144 |     
145 |     """
146 |     OLD WAY OF DOING THIS. Attempts to use google calculator to calculate the result of expr
147 |     """
148 |     @staticmethod
149 |     def calculate_old(expr):
150 |         url = get_search_url(expr)
151 |         html = get_html(url)
152 |         if html:
153 |             soup = BeautifulSoup(html)
154 |             topstuff = soup.find("div", id="topstuff")
155 |             if topstuff:
156 |                 a = topstuff.find("a")
157 |                 if a and a["href"].find("calculator") != -1:
158 |                     h2 = topstuff.find("h2")
159 |                     if h2:
160 |                         return parse_calc_result(h2.text)
161 |         return None 
162 | 
    @staticmethod
    def search_images_old(query, image_options = None, pages = 1):
        """Image search that parses the dyn.setResults(...) script payload.

        query         -- the search text
        image_options -- optional ImageOptions used to build the url
        pages         -- number of result pages to fetch (default 1)

        Returns a list of ImageResult. Each record of the payload is
        split on commas and read by fixed position, so the extraction
        depends on the exact layout google returned at the time.
        """
        results = []
        for i in range(pages):
            url = get_image_search_url(query, image_options, i)
            html = get_html(url)
            if html:
                if Google.DEBUG_MODE:
                    write_html_to_file(html, "images_{0}_{1}.html".format(query.replace(" ", "_"), i))
                j = 0
                # NOTE: soup is built but not used below; parsing works on the raw html.
                soup = BeautifulSoup(html)
                # Grab the argument of the dyn.setResults(...) call embedded in a script tag.
                match = re.search("dyn.setResults\((.+)\);</script>", html)
                if match:
                    init = unicode(match.group(1), errors="ignore")
                    # Records are separated by "],[" in the payload.
                    tokens = init.split('],[')
                    for token in tokens:
                        res = ImageResult()
                        res.page = i
                        res.index = j
                        toks = token.split(",")
                        
                        # should be 32 or 33, but seems to change, so just make sure no exceptions
                        # will be thrown by the indexing
                        if (len(toks) > 22):
                            # Undo the \x escapes for <b>, </b>, "=" and "&" in every field.
                            for t in range(len(toks)):
                                toks[t] = toks[t].replace('\\x3cb\\x3e','').replace('\\x3c/b\\x3e','').replace('\\x3d','=').replace('\\x26','&')
                            match = re.search("imgurl=(?P<link>[^&]+)&imgrefurl", toks[0])
                            if match:
                                res.link = match.group("link")
                            # Positional fields: 6 = name, 21 = thumb, 10 = format, 11 = domain.
                            res.name = toks[6].replace('"', '')
                            res.thumb = toks[21].replace('"', '')
                            res.format = toks[10].replace('"', '')
                            res.domain = toks[11].replace('"', '')
                            # Field 9 looks like "<width> &times; <height> - <size>".
                            match = re.search("(?P<width>[0-9]+) &times; (?P<height>[0-9]+) - (?P<size>[^ ]+)", toks[9].replace('"', ''))
                            if match:
                                res.width = match.group("width")
                                res.height = match.group("height")
                                res.filesize = match.group("size")        
                            results.append(res)
                            j = j + 1
        return results
204 |     
205 |     @staticmethod
206 |     def search_images(query, image_options = None, pages = 1):
207 |         results = []
208 |         for i in range(pages):
209 |             url = get_image_search_url(query, image_options, i)
210 |             html = get_html(url)
211 |             if html:
212 |                 if Google.DEBUG_MODE:
213 |                     write_html_to_file(html, "images_{0}_{1}.html".format(query.replace(" ", "_"), i))
214 |                 soup = BeautifulSoup(html)
215 |                 j = 0
216 |                 tds = soup.findAll("td")
217 |                 for td in tds:
218 |                     a = td.find("a")
219 |                     if a and a["href"].find("imgurl") != -1:
220 |                         res = ImageResult()
221 |                         res.page = i
222 |                         res.index = j
223 |                         tokens = a["href"].split("&")
224 |                         match = re.search("imgurl=(?P<link>[^&]+)", tokens[0])
225 |                         if match:
226 |                             res.link = match.group("link")
227 |                             res.format = res.link[res.link.rfind(".") + 1:]
228 |                         img = td.find("img")
229 |                         if img:
230 |                             res.thumb = img["src"]
231 |                             res.thumb_width = img["width"]
232 |                             res.thumb_height = img["height"]
233 |                         match = re.search("(?P<width>[0-9]+) &times; (?P<height>[0-9]+) - (?P<size>[^&]+)", td.text)
234 |                         if match:
235 |                             res.width = match.group("width")
236 |                             res.name = td.text[:td.text.find(res.width)]
237 |                             res.height = match.group("height")
238 |                             res.filesize = match.group("size")
239 |                         cite = td.find("cite")
240 |                         if cite:
241 |                             res.domain = cite["title"]
242 |                         results.append(res)
243 |                         j = j + 1
244 |         return results
245 |     
246 |     @staticmethod
247 |     def shopping(query, pages=1):
248 |         results = []
249 |         for i in range(pages):
250 |             url = get_shopping_url(query, i)
251 |             html = get_html(url)
252 |             if html:
253 |                 if Google.DEBUG_MODE:
254 |                     write_html_to_file(html, "shopping_{0}_{1}.html".format(query.replace(" ", "_"), i))
255 |                 j = 0
256 |                 soup = BeautifulSoup(html)
257 |                 
258 |                 products = soup.findAll("li", "g")
259 |                 for prod in products:
260 |                     res = ShoppingResult()
261 |                     
262 |                     divs = prod.findAll("div")
263 |                     for div in divs:
264 |                         match = re.search("from (?P<count>[0-9]+) stores", div.text.strip())
265 |                         if match:
266 |                             res.store_count = match.group("count")
267 |                             break
268 |                     
269 |                     h3 = prod.find("h3", "r")
270 |                     if h3:
271 |                         a = h3.find("a")
272 |                         if a:
273 |                             res.compare_url = a["href"]
274 |                         res.name = h3.text.strip()
275 |                     
276 |                     psliimg = prod.find("div", "psliimg")
277 |                     if psliimg:
278 |                         img = psliimg.find("img")
279 |                         if img:
280 |                             res.thumb = img["src"]
281 |                     
282 |                     f = prod.find("div", "f")
283 |                     if f:
284 |                         res.subtext = f.text.strip()
285 |                         
286 |                     price = prod.find("div", "psliprice")
287 |                     if price:
288 |                         res.min_price = price.text.strip()
289 |                     
290 |                     results.append(res)
291 |                     j = j + 1
292 |         return results
293 |     
294 |     """
295 |     Converts one currency to another.
296 |     [amount] from_curreny = [return_value] to_currency
297 |     """
298 |     @staticmethod
299 |     def convert_currency(amount, from_currency, to_currency):
300 |         if from_currency == to_currency:
301 |             return 1.0
302 |         conn = httplib.HTTPSConnection("www.google.com")
303 |         req_url = "/ig/calculator?hl=en&q={0}{1}=?{2}".format(amount, from_currency.replace(" ", "%20"), to_currency.replace(" ", "%20"))
304 |         headers = { "User-Agent": "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" }
305 |         conn.request("GET", req_url, "", headers)
306 |         response = conn.getresponse()
307 |         rval = response.read().decode("utf-8").replace(u"\xa0", "")
308 |         conn.close()
309 |         rhs = rval.split(",")[1].strip()
310 |         s = rhs[rhs.find('"')+1:]
311 |         rate = s[:s.find(" ")]
312 |         return float(rate)
313 |         
314 |     """
315 |     Gets the exchange rate of one currency to another.
316 |     1 from_curreny = [return_value] to_currency
317 |     """
318 |     @staticmethod
319 |     def exchange_rate(from_currency, to_currency):
320 |         return Google.convert_currency(1, from_currency, to_currency)
321 |  
322 |     """
323 |     Attempts to use google calculator to calculate the result of expr
324 |     """
325 |     @staticmethod
326 |     def calculate(expr):
327 |         conn = httplib.HTTPSConnection("www.google.com")
328 |         req_url = "/ig/calculator?hl=en&q={0}".format(expr.replace(" ", "%20"))
329 |         headers = { "User-Agent": "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" }
330 |         conn.request("GET", req_url, "", headers)
331 |         response = conn.getresponse()
332 |         j = response.read().decode("utf-8").replace(u"\xa0", "")
333 |         conn.close()
334 |         j = re.sub(r"{\s*'?(\w)", r'{"\1', j)
335 |         j = re.sub(r",\s*'?(\w)", r',"\1', j)
336 |         j = re.sub(r"(\w)'?\s*:", r'\1":', j)
337 |         j = re.sub(r":\s*'(\w)'\s*([,}])", r':"\1"\2', j)
338 |         js = json.loads(j)
339 |         return parse_calc_result(js["lhs"] + " = " + js["rhs"])
340 |  
def normalize_query(query):
    """Return query escaped for use as the q= value of a google search url.

    Surrounding whitespace is removed, the characters that would change
    the meaning of the url (%, :, +, &, #) are percent-encoded, and
    spaces become "+".
    """
    query = query.strip()
    # "%" must be escaped first so the escapes added below are not
    # themselves re-escaped.
    for char, escaped in (("%", "%25"), (":", "%3A"), ("+", "%2B"), ("&", "%26"), ("#", "%23")):
        query = query.replace(char, escaped)
    return query.replace(" ", "+")
343 |  
def get_search_url(query, page = 0, per_page = 10):
    """Build the google web search url for the given query and zero-based page."""
    # note: num per page might not be supported by google anymore (because of google instant)
    escaped_query = normalize_query(query)
    first_result = page * per_page
    url_params = (escaped_query, first_result, per_page)
    return "http://www.google.com/search?hl=en&q=%s&start=%i&num=%i" % url_params
347 | 
def get_shopping_url(query, page=0, per_page=10):
    """Build the google shopping search url for the given query and zero-based page."""
    escaped_query = normalize_query(query)
    first_result = page * per_page
    template = "http://www.google.com/search?hl=en&q={0}&tbm=shop&start={1}&num={2}"
    return template.format(escaped_query, first_result, per_page)
350 |     
class ImageType:
    """Values for ImageOptions.image_type (sent as the "itp" tbs entry)."""
    NONE = None  # no image type filter
    FACE = "face"
    PHOTO = "photo"
    CLIPART = "clipart"
    LINE_DRAWING = "lineart"
357 |     
class SizeCategory:
    """Values for ImageOptions.size_category (sent as the "isz" tbs entry).

    LARGER_THAN and EXACTLY are also written by ImageOptions.get_tbs
    itself when larger_than or an exact width/height pair is set.
    """
    NONE = None  # no size filter
    ICON = "i"
    LARGE = "l"
    MEDIUM = "m"
    SMALL = "s"
    LARGER_THAN = "lt"
    EXACTLY = "ex"
366 |     
class LargerThan:
    """Values for ImageOptions.larger_than (sent as the "islt" tbs entry).

    Each constant names a minimum image size; the trailing comment gives
    the corresponding dimensions.
    """
    NONE = None     # no minimum size
    QSVGA = "qsvga" # 400 x 300
    VGA = "vga"     # 640 x 480
    SVGA = "svga"   # 800 x 600
    XGA = "xga"     # 1024 x 768
    MP_2 = "2mp"    # 2 MP (1600 x 1200)
    MP_4 = "4mp"    # 4 MP (2272 x 1704)
    MP_6 = "6mp"    # 6 MP (2816 x 2112)
    MP_8 = "8mp"    # 8 MP (3264 x 2448)
    MP_10 = "10mp"  # 10 MP (3648 x 2736)
    MP_12 = "12mp"  # 12 MP (4096 x 3072)
    MP_15 = "15mp"  # 15 MP (4480 x 3360)
    MP_20 = "20mp"  # 20 MP (5120 x 3840)
    MP_40 = "40mp"  # 40 MP (7216 x 5412)
    MP_70 = "70mp"  # 70 MP (9600 x 7200)
383 | 
class ColorType:
    """Values for ImageOptions.color_type (sent as the "ic" tbs entry).

    SPECIFIC is written by ImageOptions.get_tbs itself whenever
    ImageOptions.color is set.
    """
    NONE = None  # no color filter
    COLOR = "color"
    BLACK_WHITE = "gray"
    SPECIFIC = "specific"
389 |     
def get_image_search_url(query, image_options=None, page=0, per_page=20):
    """Build the google image search url for the given query and zero-based page.

    query         -- the search text
    image_options -- optional ImageOptions; its get_tbs() fragment, if
                     any, is appended to the url
    page          -- zero-based page number
    per_page      -- number of results requested per page
    """
    # Use the shared helper so web, shopping and image searches escape
    # the query the same way.
    query = normalize_query(query)
    url = "http://images.google.com/images?q=%s&sa=N&start=%i&ndsp=%i&sout=1" % (query, page * per_page, per_page)
    if image_options:
        tbs = image_options.get_tbs()
        if tbs:
            url = url + tbs
    return url
398 |     
def add_to_tbs(tbs, name, value):
    """Append a name:value entry to a tbs url fragment and return the result.

    An empty or None tbs starts a new "&tbs=" fragment; otherwise the
    entry is added after a comma.
    """
    if not tbs:
        return "&tbs=%s:%s" % (name, value)
    return "%s,%s:%s" % (tbs, name, value)
404 |     
def parse_calc_result(string):
    """Split a calculator answer such as "1 kg = 1000 grams" into a CalculatorResult.

    fullstring keeps the text exactly as passed in. If the text has an
    "=", expr is the part before the last one and result the part after
    it. The result text is then split on spaces: numeric tokens are
    joined into value (so "157 300" becomes "157300") and the remaining
    tokens are joined with spaces into unit.
    """
    result = CalculatorResult()
    result.fullstring = string
    string = string.strip().replace(u"\xa0", " ")
    equals_at = string.rfind("=")
    if equals_at != -1:
        result.expr = string[:equals_at].strip()
        # Strip instead of skipping a fixed number of characters, so the
        # answer survives zero or several spaces after the "=".
        string = string[equals_at + 1:].strip()
        result.result = string
    # str.split always yields at least one token, so a result is always returned.
    result.value = ""
    for token in string.split(" "):
        if is_number(token):
            result.value = result.value + token
        elif result.unit:
            result.unit = result.unit + " " + token
        else:
            result.unit = token
    return result
426 |         
def is_number(s):
    """Return True if float() accepts s, False if it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True
433 |     
def get_html(url):
    """Download url with a browser-like User-Agent and return the response body.

    Returns None (after printing a message) when the request fails for
    any ordinary reason. KeyboardInterrupt and SystemExit are not
    swallowed, so the program can still be interrupted during a fetch.
    """
    try:
        request = urllib2.Request(url)
        request.add_header("User-Agent", "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
        response = urllib2.urlopen(request)
        try:
            return response.read()
        finally:
            # Release the connection even if reading fails part-way.
            response.close()
    except Exception:
        print("Error accessing: %s" % (url,))
        return None
443 | 
def write_html_to_file(html, filename):
    """Write html to filename, replacing any existing file.

    The file is closed even when the write raises.
    """
    with open(filename, "w") as of:
        of.write(html)
449 |         
def test():
    """Run one live request per public api call and print a PASSED/ERROR line for each."""
    search = Google.search("github")
    if search:
        print("PASSED: {0} Search Results".format(len(search)))
    else:
        print("ERROR: No Search Results!")

    shop = Google.shopping("Disgaea 4")
    if shop:
        print("PASSED: {0} Shopping Results".format(len(shop)))
    else:
        print("ERROR: No Shopping Results!")

    # Image search with a few filters applied.
    options = ImageOptions()
    options.image_type = ImageType.CLIPART
    options.larger_than = LargerThan.MP_4
    options.color = "green"
    images = Google.search_images("banana", options)
    if images:
        print("PASSED: {0} Image Results".format(len(images)))
    else:
        print("ERROR: No Image Results!")

    calc = Google.calculate("157.3kg in grams")
    calc_ok = calc is not None and int(calc.value) == 157300
    if calc_ok:
        print("PASSED: Calculator passed")
    else:
        print("ERROR: Calculator failed!")

    euros = Google.convert_currency(5.0, "USD", "EUR")
    currency_ok = euros is not None and euros > 0.0
    if currency_ok:
        print("PASSED: Currency convert passed")
    else:
        print("ERROR: Currency convert failed!")
484 |         
def main():
    """Command line entry point: enable debug mode on "--debug", then run test()."""
    debug_requested = sys.argv[1:2] == ["--debug"]
    if debug_requested:
        Google.DEBUG_MODE = True
        print("DEBUG_MODE ENABLED")
    test()
490 |         
# Run the live smoke tests only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
493 |     


--------------------------------------------------------------------------------