├── BeautifulSoup.py ├── README.md ├── __init__.py └── google.py /BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | """Beautiful Soup 2 | Elixir and Tonic 3 | "The Screen-Scraper's Friend" 4 | http://www.crummy.com/software/BeautifulSoup/ 5 | 6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a 7 | tree representation. It provides methods and Pythonic idioms that make 8 | it easy to navigate, search, and modify the tree. 9 | 10 | A well-formed XML/HTML document yields a well-formed data 11 | structure. An ill-formed XML/HTML document yields a correspondingly 12 | ill-formed data structure. If your document is only locally 13 | well-formed, you can use this library to find and process the 14 | well-formed part of it. 15 | 16 | Beautiful Soup works with Python 2.2 and up. It has no external 17 | dependencies, but you'll have more success at converting data to UTF-8 18 | if you also install these three packages: 19 | 20 | * chardet, for auto-detecting character encodings 21 | http://chardet.feedparser.org/ 22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported 23 | by stock Python. 24 | http://cjkpython.i18n.org/ 25 | 26 | Beautiful Soup defines classes for two main parsing strategies: 27 | 28 | * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific 29 | language that kind of looks like XML. 30 | 31 | * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid 32 | or invalid. This class has web browser-like heuristics for 33 | obtaining a sensible parse tree in the face of common HTML errors. 34 | 35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting 36 | the encoding of an HTML or XML document, and converting it to 37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. 38 | 39 | For more than you ever wanted to know about Beautiful Soup, see the 40 | documentation: 41 | http://www.crummy.com/software/BeautifulSoup/documentation.html 42 | 43 | Here, have some legalese: 44 | 45 | Copyright (c) 2004-2010, Leonard Richardson 46 | 47 | All rights reserved. 48 | 49 | Redistribution and use in source and binary forms, with or without 50 | modification, are permitted provided that the following conditions are 51 | met: 52 | 53 | * Redistributions of source code must retain the above copyright 54 | notice, this list of conditions and the following disclaimer. 55 | 56 | * Redistributions in binary form must reproduce the above 57 | copyright notice, this list of conditions and the following 58 | disclaimer in the documentation and/or other materials provided 59 | with the distribution. 60 | 61 | * Neither the name of the the Beautiful Soup Consortium and All 62 | Night Kosher Bakery nor the names of its contributors may be 63 | used to endorse or promote products derived from this software 64 | without specific prior written permission. 65 | 66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 69 | A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. 77 | 78 | """ 79 | from __future__ import generators 80 | 81 | __author__ = "Leonard Richardson (leonardr@segfault.org)" 82 | __version__ = "3.2.0" 83 | __copyright__ = "Copyright (c) 2004-2010 Leonard Richardson" 84 | __license__ = "New-style BSD" 85 | 86 | from sgmllib import SGMLParser, SGMLParseError 87 | import codecs 88 | import markupbase 89 | import types 90 | import re 91 | import sgmllib 92 | try: 93 | from htmlentitydefs import name2codepoint 94 | except ImportError: 95 | name2codepoint = {} 96 | try: 97 | set 98 | except NameError: 99 | from sets import Set as set 100 | 101 | #These hacks make Beautiful Soup able to parse XML with namespaces 102 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*') 103 | markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match 104 | 105 | DEFAULT_OUTPUT_ENCODING = "utf-8" 106 | 107 | def _match_css_class(str): 108 | """Build a RE to match the given CSS class.""" 109 | return re.compile(r"(^|.*\s)%s($|\s)" % str) 110 | 111 | # First, the classes that represent markup elements. 112 | 113 | class PageElement(object): 114 | """Contains the navigational information for some part of the page 115 | (either a tag or a piece of text)""" 116 | 117 | def setup(self, parent=None, previous=None): 118 | """Sets up the initial relations between this element and 119 | other elements.""" 120 | self.parent = parent 121 | self.previous = previous 122 | self.next = None 123 | self.previousSibling = None 124 | self.nextSibling = None 125 | if self.parent and self.parent.contents: 126 | self.previousSibling = self.parent.contents[-1] 127 | self.previousSibling.nextSibling = self 128 | 129 | def replaceWith(self, replaceWith): 130 | oldParent = self.parent 131 | myIndex = self.parent.index(self) 132 | if hasattr(replaceWith, "parent")\ 133 | and replaceWith.parent is self.parent: 134 | # We're replacing this element with one of its siblings. 135 | index = replaceWith.parent.index(replaceWith) 136 | if index and index < myIndex: 137 | # Furthermore, it comes before this element. That 138 | # means that when we extract it, the index of this 139 | # element will change. 140 | myIndex = myIndex - 1 141 | self.extract() 142 | oldParent.insert(myIndex, replaceWith) 143 | 144 | def replaceWithChildren(self): 145 | myParent = self.parent 146 | myIndex = self.parent.index(self) 147 | self.extract() 148 | reversedChildren = list(self.contents) 149 | reversedChildren.reverse() 150 | for child in reversedChildren: 151 | myParent.insert(myIndex, child) 152 | 153 | def extract(self): 154 | """Destructively rips this element out of the tree.""" 155 | if self.parent: 156 | try: 157 | del self.parent.contents[self.parent.index(self)] 158 | except ValueError: 159 | pass 160 | 161 | #Find the two elements that would be next to each other if 162 | #this element (and any children) hadn't been parsed. Connect 163 | #the two. 
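        # (Added illustration, not part of the original source: given
        # soup = BeautifulSoup('<a>1<b>2</b>3</a>'), soup.b.extract()
        # must leave the "1" NavigableString's .next pointing at "3",
        # while "2" stays reachable inside the extracted <b> Tag. The
        # pointer surgery below does exactly that.)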
164 | lastChild = self._lastRecursiveChild() 165 | nextElement = lastChild.next 166 | 167 | if self.previous: 168 | self.previous.next = nextElement 169 | if nextElement: 170 | nextElement.previous = self.previous 171 | self.previous = None 172 | lastChild.next = None 173 | 174 | self.parent = None 175 | if self.previousSibling: 176 | self.previousSibling.nextSibling = self.nextSibling 177 | if self.nextSibling: 178 | self.nextSibling.previousSibling = self.previousSibling 179 | self.previousSibling = self.nextSibling = None 180 | return self 181 | 182 | def _lastRecursiveChild(self): 183 | "Finds the last element beneath this object to be parsed." 184 | lastChild = self 185 | while hasattr(lastChild, 'contents') and lastChild.contents: 186 | lastChild = lastChild.contents[-1] 187 | return lastChild 188 | 189 | def insert(self, position, newChild): 190 | if isinstance(newChild, basestring) \ 191 | and not isinstance(newChild, NavigableString): 192 | newChild = NavigableString(newChild) 193 | 194 | position = min(position, len(self.contents)) 195 | if hasattr(newChild, 'parent') and newChild.parent is not None: 196 | # We're 'inserting' an element that's already one 197 | # of this object's children. 198 | if newChild.parent is self: 199 | index = self.index(newChild) 200 | if index > position: 201 | # Furthermore we're moving it further down the 202 | # list of this object's children. That means that 203 | # when we extract this element, our target index 204 | # will jump down one. 205 | position = position - 1 206 | newChild.extract() 207 | 208 | newChild.parent = self 209 | previousChild = None 210 | if position == 0: 211 | newChild.previousSibling = None 212 | newChild.previous = self 213 | else: 214 | previousChild = self.contents[position-1] 215 | newChild.previousSibling = previousChild 216 | newChild.previousSibling.nextSibling = newChild 217 | newChild.previous = previousChild._lastRecursiveChild() 218 | if newChild.previous: 219 | newChild.previous.next = newChild 220 | 221 | newChildsLastElement = newChild._lastRecursiveChild() 222 | 223 | if position >= len(self.contents): 224 | newChild.nextSibling = None 225 | 226 | parent = self 227 | parentsNextSibling = None 228 | while not parentsNextSibling: 229 | parentsNextSibling = parent.nextSibling 230 | parent = parent.parent 231 | if not parent: # This is the last element in the document. 
232 | break 233 | if parentsNextSibling: 234 | newChildsLastElement.next = parentsNextSibling 235 | else: 236 | newChildsLastElement.next = None 237 | else: 238 | nextChild = self.contents[position] 239 | newChild.nextSibling = nextChild 240 | if newChild.nextSibling: 241 | newChild.nextSibling.previousSibling = newChild 242 | newChildsLastElement.next = nextChild 243 | 244 | if newChildsLastElement.next: 245 | newChildsLastElement.next.previous = newChildsLastElement 246 | self.contents.insert(position, newChild) 247 | 248 | def append(self, tag): 249 | """Appends the given tag to the contents of this tag.""" 250 | self.insert(len(self.contents), tag) 251 | 252 | def findNext(self, name=None, attrs={}, text=None, **kwargs): 253 | """Returns the first item that matches the given criteria and 254 | appears after this Tag in the document.""" 255 | return self._findOne(self.findAllNext, name, attrs, text, **kwargs) 256 | 257 | def findAllNext(self, name=None, attrs={}, text=None, limit=None, 258 | **kwargs): 259 | """Returns all items that match the given criteria and appear 260 | after this Tag in the document.""" 261 | return self._findAll(name, attrs, text, limit, self.nextGenerator, 262 | **kwargs) 263 | 264 | def findNextSibling(self, name=None, attrs={}, text=None, **kwargs): 265 | """Returns the closest sibling to this Tag that matches the 266 | given criteria and appears after this Tag in the document.""" 267 | return self._findOne(self.findNextSiblings, name, attrs, text, 268 | **kwargs) 269 | 270 | def findNextSiblings(self, name=None, attrs={}, text=None, limit=None, 271 | **kwargs): 272 | """Returns the siblings of this Tag that match the given 273 | criteria and appear after this Tag in the document.""" 274 | return self._findAll(name, attrs, text, limit, 275 | self.nextSiblingGenerator, **kwargs) 276 | fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x 277 | 278 | def findPrevious(self, name=None, attrs={}, text=None, **kwargs): 279 | """Returns the first item that matches the given criteria and 280 | appears before this Tag in the document.""" 281 | return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs) 282 | 283 | def findAllPrevious(self, name=None, attrs={}, text=None, limit=None, 284 | **kwargs): 285 | """Returns all items that match the given criteria and appear 286 | before this Tag in the document.""" 287 | return self._findAll(name, attrs, text, limit, self.previousGenerator, 288 | **kwargs) 289 | fetchPrevious = findAllPrevious # Compatibility with pre-3.x 290 | 291 | def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs): 292 | """Returns the closest sibling to this Tag that matches the 293 | given criteria and appears before this Tag in the document.""" 294 | return self._findOne(self.findPreviousSiblings, name, attrs, text, 295 | **kwargs) 296 | 297 | def findPreviousSiblings(self, name=None, attrs={}, text=None, 298 | limit=None, **kwargs): 299 | """Returns the siblings of this Tag that match the given 300 | criteria and appear before this Tag in the document.""" 301 | return self._findAll(name, attrs, text, limit, 302 | self.previousSiblingGenerator, **kwargs) 303 | fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x 304 | 305 | def findParent(self, name=None, attrs={}, **kwargs): 306 | """Returns the closest parent of this Tag that matches the given 307 | criteria.""" 308 | # NOTE: We can't use _findOne because findParents takes a different 309 | # set of arguments. 
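        # (Added note: _findOne would call findParents(name, attrs, text, 1),
        # but findParents has no 'text' parameter; its third positional
        # argument is 'limit', so the generic helper cannot be reused here.)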
310 | r = None 311 | l = self.findParents(name, attrs, 1) 312 | if l: 313 | r = l[0] 314 | return r 315 | 316 | def findParents(self, name=None, attrs={}, limit=None, **kwargs): 317 | """Returns the parents of this Tag that match the given 318 | criteria.""" 319 | 320 | return self._findAll(name, attrs, None, limit, self.parentGenerator, 321 | **kwargs) 322 | fetchParents = findParents # Compatibility with pre-3.x 323 | 324 | #These methods do the real heavy lifting. 325 | 326 | def _findOne(self, method, name, attrs, text, **kwargs): 327 | r = None 328 | l = method(name, attrs, text, 1, **kwargs) 329 | if l: 330 | r = l[0] 331 | return r 332 | 333 | def _findAll(self, name, attrs, text, limit, generator, **kwargs): 334 | "Iterates over a generator looking for things that match." 335 | 336 | if isinstance(name, SoupStrainer): 337 | strainer = name 338 | # (Possibly) special case some findAll*(...) searches 339 | elif text is None and not limit and not attrs and not kwargs: 340 | # findAll*(True) 341 | if name is True: 342 | return [element for element in generator() 343 | if isinstance(element, Tag)] 344 | # findAll*('tag-name') 345 | elif isinstance(name, basestring): 346 | return [element for element in generator() 347 | if isinstance(element, Tag) and 348 | element.name == name] 349 | else: 350 | strainer = SoupStrainer(name, attrs, text, **kwargs) 351 | # Build a SoupStrainer 352 | else: 353 | strainer = SoupStrainer(name, attrs, text, **kwargs) 354 | results = ResultSet(strainer) 355 | g = generator() 356 | while True: 357 | try: 358 | i = g.next() 359 | except StopIteration: 360 | break 361 | if i: 362 | found = strainer.search(i) 363 | if found: 364 | results.append(found) 365 | if limit and len(results) >= limit: 366 | break 367 | return results 368 | 369 | #These Generators can be used to navigate starting from both 370 | #NavigableStrings and Tags. 371 | def nextGenerator(self): 372 | i = self 373 | while i is not None: 374 | i = i.next 375 | yield i 376 | 377 | def nextSiblingGenerator(self): 378 | i = self 379 | while i is not None: 380 | i = i.nextSibling 381 | yield i 382 | 383 | def previousGenerator(self): 384 | i = self 385 | while i is not None: 386 | i = i.previous 387 | yield i 388 | 389 | def previousSiblingGenerator(self): 390 | i = self 391 | while i is not None: 392 | i = i.previousSibling 393 | yield i 394 | 395 | def parentGenerator(self): 396 | i = self 397 | while i is not None: 398 | i = i.parent 399 | yield i 400 | 401 | # Utility methods 402 | def substituteEncoding(self, str, encoding=None): 403 | encoding = encoding or "utf-8" 404 | return str.replace("%SOUP-ENCODING%", encoding) 405 | 406 | def toEncoding(self, s, encoding=None): 407 | """Encodes an object to a string in some encoding, or to Unicode. 408 | .""" 409 | if isinstance(s, unicode): 410 | if encoding: 411 | s = s.encode(encoding) 412 | elif isinstance(s, str): 413 | if encoding: 414 | s = s.encode(encoding) 415 | else: 416 | s = unicode(s) 417 | else: 418 | if encoding: 419 | s = self.toEncoding(str(s), encoding) 420 | else: 421 | s = unicode(s) 422 | return s 423 | 424 | class NavigableString(unicode, PageElement): 425 | 426 | def __new__(cls, value): 427 | """Create a new NavigableString. 428 | 429 | When unpickling a NavigableString, this method is called with 430 | the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be 431 | passed in to the superclass's __new__ or the superclass won't know 432 | how to handle non-ASCII characters. 
433 | """ 434 | if isinstance(value, unicode): 435 | return unicode.__new__(cls, value) 436 | return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) 437 | 438 | def __getnewargs__(self): 439 | return (NavigableString.__str__(self),) 440 | 441 | def __getattr__(self, attr): 442 | """text.string gives you text. This is for backwards 443 | compatibility for Navigable*String, but for CData* it lets you 444 | get the string without the CData wrapper.""" 445 | if attr == 'string': 446 | return self 447 | else: 448 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr) 449 | 450 | def __unicode__(self): 451 | return str(self).decode(DEFAULT_OUTPUT_ENCODING) 452 | 453 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 454 | if encoding: 455 | return self.encode(encoding) 456 | else: 457 | return self 458 | 459 | class CData(NavigableString): 460 | 461 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 462 | return "" % NavigableString.__str__(self, encoding) 463 | 464 | class ProcessingInstruction(NavigableString): 465 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 466 | output = self 467 | if "%SOUP-ENCODING%" in output: 468 | output = self.substituteEncoding(output, encoding) 469 | return "" % self.toEncoding(output, encoding) 470 | 471 | class Comment(NavigableString): 472 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 473 | return "" % NavigableString.__str__(self, encoding) 474 | 475 | class Declaration(NavigableString): 476 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING): 477 | return "" % NavigableString.__str__(self, encoding) 478 | 479 | class Tag(PageElement): 480 | 481 | """Represents a found HTML tag with its attributes and contents.""" 482 | 483 | def _invert(h): 484 | "Cheap function to invert a hash." 485 | i = {} 486 | for k,v in h.items(): 487 | i[v] = k 488 | return i 489 | 490 | XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'", 491 | "quot" : '"', 492 | "amp" : "&", 493 | "lt" : "<", 494 | "gt" : ">" } 495 | 496 | XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS) 497 | 498 | def _convertEntities(self, match): 499 | """Used in a call to re.sub to replace HTML, XML, and numeric 500 | entities with the appropriate Unicode characters. If HTML 501 | entities are being converted, any unrecognized entities are 502 | escaped.""" 503 | x = match.group(1) 504 | if self.convertHTMLEntities and x in name2codepoint: 505 | return unichr(name2codepoint[x]) 506 | elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: 507 | if self.convertXMLEntities: 508 | return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] 509 | else: 510 | return u'&%s;' % x 511 | elif len(x) > 0 and x[0] == '#': 512 | # Handle numeric entities 513 | if len(x) > 1 and x[1] == 'x': 514 | return unichr(int(x[2:], 16)) 515 | else: 516 | return unichr(int(x[1:])) 517 | 518 | elif self.escapeUnrecognizedEntities: 519 | return u'&%s;' % x 520 | else: 521 | return u'&%s;' % x 522 | 523 | def __init__(self, parser, name, attrs=None, parent=None, 524 | previous=None): 525 | "Basic constructor." 
526 | 527 | # We don't actually store the parser object: that lets extracted 528 | # chunks be garbage-collected 529 | self.parserClass = parser.__class__ 530 | self.isSelfClosing = parser.isSelfClosingTag(name) 531 | self.name = name 532 | if attrs is None: 533 | attrs = [] 534 | elif isinstance(attrs, dict): 535 | attrs = attrs.items() 536 | self.attrs = attrs 537 | self.contents = [] 538 | self.setup(parent, previous) 539 | self.hidden = False 540 | self.containsSubstitutions = False 541 | self.convertHTMLEntities = parser.convertHTMLEntities 542 | self.convertXMLEntities = parser.convertXMLEntities 543 | self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities 544 | 545 | # Convert any HTML, XML, or numeric entities in the attribute values. 546 | convert = lambda(k, val): (k, 547 | re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);", 548 | self._convertEntities, 549 | val)) 550 | self.attrs = map(convert, self.attrs) 551 | 552 | def getString(self): 553 | if (len(self.contents) == 1 554 | and isinstance(self.contents[0], NavigableString)): 555 | return self.contents[0] 556 | 557 | def setString(self, string): 558 | """Replace the contents of the tag with a string""" 559 | self.clear() 560 | self.append(string) 561 | 562 | string = property(getString, setString) 563 | 564 | def getText(self, separator=u""): 565 | if not len(self.contents): 566 | return u"" 567 | stopNode = self._lastRecursiveChild().next 568 | strings = [] 569 | current = self.contents[0] 570 | while current is not stopNode: 571 | if isinstance(current, NavigableString): 572 | strings.append(current.strip()) 573 | current = current.next 574 | return separator.join(strings) 575 | 576 | text = property(getText) 577 | 578 | def get(self, key, default=None): 579 | """Returns the value of the 'key' attribute for the tag, or 580 | the value given for 'default' if it doesn't have that 581 | attribute.""" 582 | return self._getAttrMap().get(key, default) 583 | 584 | def clear(self): 585 | """Extract all children.""" 586 | for child in self.contents[:]: 587 | child.extract() 588 | 589 | def index(self, element): 590 | for i, child in enumerate(self.contents): 591 | if child is element: 592 | return i 593 | raise ValueError("Tag.index: element not in tag") 594 | 595 | def has_key(self, key): 596 | return self._getAttrMap().has_key(key) 597 | 598 | def __getitem__(self, key): 599 | """tag[key] returns the value of the 'key' attribute for the tag, 600 | and throws an exception if it's not there.""" 601 | return self._getAttrMap()[key] 602 | 603 | def __iter__(self): 604 | "Iterating over a tag iterates over its contents." 605 | return iter(self.contents) 606 | 607 | def __len__(self): 608 | "The length of a tag is the length of its list of contents." 609 | return len(self.contents) 610 | 611 | def __contains__(self, x): 612 | return x in self.contents 613 | 614 | def __nonzero__(self): 615 | "A tag is non-None even if it has no contents." 616 | return True 617 | 618 | def __setitem__(self, key, value): 619 | """Setting tag[key] sets the value of the 'key' attribute for the 620 | tag.""" 621 | self._getAttrMap() 622 | self.attrMap[key] = value 623 | found = False 624 | for i in range(0, len(self.attrs)): 625 | if self.attrs[i][0] == key: 626 | self.attrs[i] = (key, value) 627 | found = True 628 | if not found: 629 | self.attrs.append((key, value)) 630 | self._getAttrMap()[key] = value 631 | 632 | def __delitem__(self, key): 633 | "Deleting tag[key] deletes all 'key' attributes for the tag." 
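        # (Added commentary: bad HTML can repeat an attribute, as in
        # <p class="a" class="b">, so self.attrs may hold several pairs
        # with the same key; the loop below removes every one of them.)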
634 |         for item in self.attrs:
635 |             if item[0] == key:
636 |                 self.attrs.remove(item)
637 |                 #We don't break because bad HTML can define the same
638 |                 #attribute multiple times.
639 |         self._getAttrMap()
640 |         if self.attrMap.has_key(key):
641 |             del self.attrMap[key]
642 | 
643 |     def __call__(self, *args, **kwargs):
644 |         """Calling a tag like a function is the same as calling its
645 |         findAll() method. Eg. tag('a') returns a list of all the A tags
646 |         found within this tag."""
647 |         return apply(self.findAll, args, kwargs)
648 | 
649 |     def __getattr__(self, tag):
650 |         #print "Getattr %s.%s" % (self.__class__, tag)
651 |         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
652 |             return self.find(tag[:-3])
653 |         elif tag.find('__') != 0:
654 |             return self.find(tag)
655 |         raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
656 | 
657 |     def __eq__(self, other):
658 |         """Returns true iff this tag has the same name, the same attributes,
659 |         and the same contents (recursively) as the given tag.
660 | 
661 |         NOTE: right now this will return false if two tags have the
662 |         same attributes in a different order. Should this be fixed?"""
663 |         if other is self:
664 |             return True
665 |         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
666 |             return False
667 |         for i in range(0, len(self.contents)):
668 |             if self.contents[i] != other.contents[i]:
669 |                 return False
670 |         return True
671 | 
672 |     def __ne__(self, other):
673 |         """Returns true iff this tag is not identical to the other tag,
674 |         as defined in __eq__."""
675 |         return not self == other
676 | 
677 |     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
678 |         """Renders this tag as a string."""
679 |         return self.__str__(encoding)
680 | 
681 |     def __unicode__(self):
682 |         return self.__str__(None)
683 | 
684 |     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
685 |                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
686 |                                            + ")")
687 | 
688 |     def _sub_entity(self, x):
689 |         """Used with a regular expression to substitute the
690 |         appropriate XML entity for an XML special character."""
691 |         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
692 | 
693 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
694 |                 prettyPrint=False, indentLevel=0):
695 |         """Returns a string or Unicode representation of this tag and
696 |         its contents. To get Unicode, pass None for encoding.
697 | 
698 |         NOTE: since Python's HTML parser consumes whitespace, this
699 |         method is not certain to reproduce the whitespace present in
700 |         the original string."""
701 | 
702 |         encodedName = self.toEncoding(self.name, encoding)
703 | 
704 |         attrs = []
705 |         if self.attrs:
706 |             for key, val in self.attrs:
707 |                 fmt = '%s="%s"'
708 |                 if isinstance(val, basestring):
709 |                     if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
710 |                         val = self.substituteEncoding(val, encoding)
711 | 
712 |                     # The attribute value either:
713 |                     #
714 |                     # * Contains no embedded double quotes or single quotes.
715 |                     #   No problem: we enclose it in double quotes.
716 |                     # * Contains embedded single quotes. No problem:
717 |                     #   double quotes work here too.
718 |                     # * Contains embedded double quotes. No problem:
719 |                     #   we enclose it in single quotes.
720 |                     # * Embeds both single _and_ double quotes. This
721 |                     #   can't happen naturally, but it can happen if
722 |                     #   you modify an attribute value after parsing
723 |                     #   the document. Now we have a bit of a
724 |                     #   problem. We solve it by enclosing the
725 |                     #   attribute in single quotes, and escaping any
726 |                     #   embedded single quotes to XML entities.
727 |                     if '"' in val:
728 |                         fmt = "%s='%s'"
729 |                         if "'" in val:
730 |                             # TODO: replace with apos when
731 |                             # appropriate.
732 |                             val = val.replace("'", "&squot;")
733 | 
734 |                     # Now we're okay w/r/t quotes. But the attribute
735 |                     # value might also contain angle brackets, or
736 |                     # ampersands that aren't part of entities. We need
737 |                     # to escape those to XML entities too.
738 |                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
739 | 
740 |                 attrs.append(fmt % (self.toEncoding(key, encoding),
741 |                                     self.toEncoding(val, encoding)))
742 |         close = ''
743 |         closeTag = ''
744 |         if self.isSelfClosing:
745 |             close = ' /'
746 |         else:
747 |             closeTag = '</%s>' % encodedName
748 | 
749 |         indentTag, indentContents = 0, 0
750 |         if prettyPrint:
751 |             indentTag = indentLevel
752 |             space = (' ' * (indentTag-1))
753 |             indentContents = indentTag + 1
754 |         contents = self.renderContents(encoding, prettyPrint, indentContents)
755 |         if self.hidden:
756 |             s = contents
757 |         else:
758 |             s = []
759 |             attributeString = ''
760 |             if attrs:
761 |                 attributeString = ' ' + ' '.join(attrs)
762 |             if prettyPrint:
763 |                 s.append(space)
764 |             s.append('<%s%s%s>' % (encodedName, attributeString, close))
765 |             if prettyPrint:
766 |                 s.append("\n")
767 |             s.append(contents)
768 |             if prettyPrint and contents and contents[-1] != "\n":
769 |                 s.append("\n")
770 |             if prettyPrint and closeTag:
771 |                 s.append(space)
772 |             s.append(closeTag)
773 |             if prettyPrint and closeTag and self.nextSibling:
774 |                 s.append("\n")
775 |             s = ''.join(s)
776 |         return s
777 | 
778 |     def decompose(self):
779 |         """Recursively destroys the contents of this tree."""
780 |         self.extract()
781 |         if len(self.contents) == 0:
782 |             return
783 |         current = self.contents[0]
784 |         while current is not None:
785 |             next = current.next
786 |             if isinstance(current, Tag):
787 |                 del current.contents[:]
788 |             current.parent = None
789 |             current.previous = None
790 |             current.previousSibling = None
791 |             current.next = None
792 |             current.nextSibling = None
793 |             current = next
794 | 
795 |     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
796 |         return self.__str__(encoding, True)
797 | 
798 |     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
799 |                        prettyPrint=False, indentLevel=0):
800 |         """Renders the contents of this tag as a string in the given
801 |         encoding. If encoding is None, returns a Unicode string."""
802 |         s=[]
803 |         for c in self:
804 |             text = None
805 |             if isinstance(c, NavigableString):
806 |                 text = c.__str__(encoding)
807 |             elif isinstance(c, Tag):
808 |                 s.append(c.__str__(encoding, prettyPrint, indentLevel))
809 |             if text and prettyPrint:
810 |                 text = text.strip()
811 |             if text:
812 |                 if prettyPrint:
813 |                     s.append(" " * (indentLevel-1))
814 |                 s.append(text)
815 |                 if prettyPrint:
816 |                     s.append("\n")
817 |         return ''.join(s)
818 | 
819 |     #Soup methods
820 | 
821 |     def find(self, name=None, attrs={}, recursive=True, text=None,
822 |              **kwargs):
823 |         """Return only the first child of this Tag matching the given
824 |         criteria."""
825 |         r = None
826 |         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
827 |         if l:
828 |             r = l[0]
829 |         return r
830 |     findChild = find
831 | 
832 |     def findAll(self, name=None, attrs={}, recursive=True, text=None,
833 |                 limit=None, **kwargs):
834 |         """Extracts a list of Tag objects that match the given
835 |         criteria.
You can specify the name of the Tag and any 836 | attributes you want the Tag to have. 837 | 838 | The value of a key-value pair in the 'attrs' map can be a 839 | string, a list of strings, a regular expression object, or a 840 | callable that takes a string and returns whether or not the 841 | string matches for some custom definition of 'matches'. The 842 | same is true of the tag name.""" 843 | generator = self.recursiveChildGenerator 844 | if not recursive: 845 | generator = self.childGenerator 846 | return self._findAll(name, attrs, text, limit, generator, **kwargs) 847 | findChildren = findAll 848 | 849 | # Pre-3.x compatibility methods 850 | first = find 851 | fetch = findAll 852 | 853 | def fetchText(self, text=None, recursive=True, limit=None): 854 | return self.findAll(text=text, recursive=recursive, limit=limit) 855 | 856 | def firstText(self, text=None, recursive=True): 857 | return self.find(text=text, recursive=recursive) 858 | 859 | #Private methods 860 | 861 | def _getAttrMap(self): 862 | """Initializes a map representation of this tag's attributes, 863 | if not already initialized.""" 864 | if not getattr(self, 'attrMap'): 865 | self.attrMap = {} 866 | for (key, value) in self.attrs: 867 | self.attrMap[key] = value 868 | return self.attrMap 869 | 870 | #Generator methods 871 | def childGenerator(self): 872 | # Just use the iterator from the contents 873 | return iter(self.contents) 874 | 875 | def recursiveChildGenerator(self): 876 | if not len(self.contents): 877 | raise StopIteration 878 | stopNode = self._lastRecursiveChild().next 879 | current = self.contents[0] 880 | while current is not stopNode: 881 | yield current 882 | current = current.next 883 | 884 | 885 | # Next, a couple classes to represent queries and their results. 
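# (Added sketch, not part of the original file: how the search API above
# and the SoupStrainer class below fit together. Doctest-style, assuming
# the BeautifulSoup class defined later in this module:
#
#     >>> soup = BeautifulSoup('<ul><li id="a">One</li><li>Two</li></ul>')
#     >>> soup.find('li', id='a').text
#     u'One'
#     >>> [li.text for li in soup.findAll('li')]
#     [u'One', u'Two']
#
# Internally, findAll() wraps its arguments in a SoupStrainer and tests
# each element produced by recursiveChildGenerator() against it.)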
886 | class SoupStrainer: 887 | """Encapsulates a number of ways of matching a markup element (tag or 888 | text).""" 889 | 890 | def __init__(self, name=None, attrs={}, text=None, **kwargs): 891 | self.name = name 892 | if isinstance(attrs, basestring): 893 | kwargs['class'] = _match_css_class(attrs) 894 | attrs = None 895 | if kwargs: 896 | if attrs: 897 | attrs = attrs.copy() 898 | attrs.update(kwargs) 899 | else: 900 | attrs = kwargs 901 | self.attrs = attrs 902 | self.text = text 903 | 904 | def __str__(self): 905 | if self.text: 906 | return self.text 907 | else: 908 | return "%s|%s" % (self.name, self.attrs) 909 | 910 | def searchTag(self, markupName=None, markupAttrs={}): 911 | found = None 912 | markup = None 913 | if isinstance(markupName, Tag): 914 | markup = markupName 915 | markupAttrs = markup 916 | callFunctionWithTagData = callable(self.name) \ 917 | and not isinstance(markupName, Tag) 918 | 919 | if (not self.name) \ 920 | or callFunctionWithTagData \ 921 | or (markup and self._matches(markup, self.name)) \ 922 | or (not markup and self._matches(markupName, self.name)): 923 | if callFunctionWithTagData: 924 | match = self.name(markupName, markupAttrs) 925 | else: 926 | match = True 927 | markupAttrMap = None 928 | for attr, matchAgainst in self.attrs.items(): 929 | if not markupAttrMap: 930 | if hasattr(markupAttrs, 'get'): 931 | markupAttrMap = markupAttrs 932 | else: 933 | markupAttrMap = {} 934 | for k,v in markupAttrs: 935 | markupAttrMap[k] = v 936 | attrValue = markupAttrMap.get(attr) 937 | if not self._matches(attrValue, matchAgainst): 938 | match = False 939 | break 940 | if match: 941 | if markup: 942 | found = markup 943 | else: 944 | found = markupName 945 | return found 946 | 947 | def search(self, markup): 948 | #print 'looking for %s in %s' % (self, markup) 949 | found = None 950 | # If given a list of items, scan it for a text element that 951 | # matches. 952 | if hasattr(markup, "__iter__") \ 953 | and not isinstance(markup, Tag): 954 | for element in markup: 955 | if isinstance(element, NavigableString) \ 956 | and self.search(element): 957 | found = element 958 | break 959 | # If it's a Tag, make sure its name or attributes match. 960 | # Don't bother with Tags if we're searching for text. 961 | elif isinstance(markup, Tag): 962 | if not self.text: 963 | found = self.searchTag(markup) 964 | # If it's text, make sure the text matches. 965 | elif isinstance(markup, NavigableString) or \ 966 | isinstance(markup, basestring): 967 | if self._matches(markup, self.text): 968 | found = markup 969 | else: 970 | raise Exception, "I don't know how to match against a %s" \ 971 | % markup.__class__ 972 | return found 973 | 974 | def _matches(self, markup, matchAgainst): 975 | #print "Matching %s against %s" % (markup, matchAgainst) 976 | result = False 977 | if matchAgainst is True: 978 | result = markup is not None 979 | elif callable(matchAgainst): 980 | result = matchAgainst(markup) 981 | else: 982 | #Custom match methods take the tag as an argument, but all 983 | #other ways of matching match the tag name as a string. 984 | if isinstance(markup, Tag): 985 | markup = markup.name 986 | if markup and not isinstance(markup, basestring): 987 | markup = unicode(markup) 988 | #Now we know that chunk is either a string, or None. 989 | if hasattr(matchAgainst, 'match'): 990 | # It's a regexp object. 
991 |                 result = markup and matchAgainst.search(markup)
992 |             elif hasattr(matchAgainst, '__iter__'): # list-like
993 |                 result = markup in matchAgainst
994 |             elif hasattr(matchAgainst, 'items'):
995 |                 result = markup.has_key(matchAgainst)
996 |             elif matchAgainst and isinstance(markup, basestring):
997 |                 if isinstance(markup, unicode):
998 |                     matchAgainst = unicode(matchAgainst)
999 |                 else:
1000 |                     matchAgainst = str(matchAgainst)
1001 | 
1002 |         if not result:
1003 |             result = matchAgainst == markup
1004 |         return result
1005 | 
1006 | class ResultSet(list):
1007 |     """A ResultSet is just a list that keeps track of the SoupStrainer
1008 |     that created it."""
1009 |     def __init__(self, source):
1010 |         list.__init__(self)
1011 |         self.source = source
1012 | 
1013 | # Now, some helper functions.
1014 | 
1015 | def buildTagMap(default, *args):
1016 |     """Turns a list of maps, lists, or scalars into a single map.
1017 |     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
1018 |     NESTING_RESET_TAGS maps out of lists and partial maps."""
1019 |     built = {}
1020 |     for portion in args:
1021 |         if hasattr(portion, 'items'):
1022 |             #It's a map. Merge it.
1023 |             for k,v in portion.items():
1024 |                 built[k] = v
1025 |         elif hasattr(portion, '__iter__'): # is a list
1026 |             #It's a list. Map each item to the default.
1027 |             for k in portion:
1028 |                 built[k] = default
1029 |         else:
1030 |             #It's a scalar. Map it to the default.
1031 |             built[portion] = default
1032 |     return built
1033 | 
1034 | # Now, the parser classes.
1035 | 
1036 | class BeautifulStoneSoup(Tag, SGMLParser):
1037 | 
1038 |     """This class contains the basic parser and search code. It defines
1039 |     a parser that knows nothing about tag behavior except for the
1040 |     following:
1041 | 
1042 |     You can't close a tag without closing all the tags it encloses.
1043 |     That is, "<foo><bar></foo>" actually means
1044 |     "<foo><bar></bar></foo>".
1045 | 
1046 |     [Another possible explanation is "<foo><bar /></foo>", but since
1047 |     this class defines no SELF_CLOSING_TAGS, it will never use that
1048 |     explanation.]
1049 | 
1050 |     This class is useful for parsing XML or made-up markup languages,
1051 |     or when BeautifulSoup makes an assumption counter to what you were
1052 |     expecting."""
1053 | 
1054 |     SELF_CLOSING_TAGS = {}
1055 |     NESTABLE_TAGS = {}
1056 |     RESET_NESTING_TAGS = {}
1057 |     QUOTE_TAGS = {}
1058 |     PRESERVE_WHITESPACE_TAGS = []
1059 | 
1060 |     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
1061 |                        lambda x: x.group(1) + ' />'),
1062 |                       (re.compile('<!\s+([^<>]*)>'),
1063 |                        lambda x: '<!' + x.group(1) + '>')
1064 |                      ]
1065 | 
1066 |     ROOT_TAG_NAME = u'[document]'
1067 | 
1068 |     HTML_ENTITIES = "html"
1069 |     XML_ENTITIES = "xml"
1070 |     XHTML_ENTITIES = "xhtml"
1071 |     # TODO: This only exists for backwards-compatibility
1072 |     ALL_ENTITIES = XHTML_ENTITIES
1073 | 
1074 |     # Used when determining whether a text node is all whitespace and
1075 |     # can be replaced with a single space. A text node that contains
1076 |     # fancy Unicode spaces (usually non-breaking) should be left
1077 |     # alone.
1078 |     STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1079 | 
1080 |     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1081 |                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
1082 |                  convertEntities=None, selfClosingTags=None, isHTML=False):
1083 |         """The Soup object is initialized as the 'root tag', and the
1084 |         provided markup (which can be a string or a file-like object)
1085 |         is fed into the underlying parser.
1086 | 1087 | sgmllib will process most bad HTML, and the BeautifulSoup 1088 | class has some tricks for dealing with some HTML that kills 1089 | sgmllib, but Beautiful Soup can nonetheless choke or lose data 1090 | if your data uses self-closing tags or declarations 1091 | incorrectly. 1092 | 1093 | By default, Beautiful Soup uses regexes to sanitize input, 1094 | avoiding the vast majority of these problems. If the problems 1095 | don't apply to you, pass in False for markupMassage, and 1096 | you'll get better performance. 1097 | 1098 | The default parser massage techniques fix the two most common 1099 | instances of invalid HTML that choke sgmllib: 1100 | 1101 |
<br/> (No space between name of closing tag and tag close)
1102 |          <! --Comment--> (Extraneous whitespace in declaration)
1103 | 
1104 |         You can pass in a custom list of (RE object, replace method)
1105 |         tuples to get Beautiful Soup to scrub your input the way you
1106 |         want."""
1107 | 
1108 |         self.parseOnlyThese = parseOnlyThese
1109 |         self.fromEncoding = fromEncoding
1110 |         self.smartQuotesTo = smartQuotesTo
1111 |         self.convertEntities = convertEntities
1112 |         # Set the rules for how we'll deal with the entities we
1113 |         # encounter
1114 |         if self.convertEntities:
1115 |             # It doesn't make sense to convert encoded characters to
1116 |             # entities even while you're converting entities to Unicode.
1117 |             # Just convert it all to Unicode.
1118 |             self.smartQuotesTo = None
1119 |             if convertEntities == self.HTML_ENTITIES:
1120 |                 self.convertXMLEntities = False
1121 |                 self.convertHTMLEntities = True
1122 |                 self.escapeUnrecognizedEntities = True
1123 |             elif convertEntities == self.XHTML_ENTITIES:
1124 |                 self.convertXMLEntities = True
1125 |                 self.convertHTMLEntities = True
1126 |                 self.escapeUnrecognizedEntities = False
1127 |             elif convertEntities == self.XML_ENTITIES:
1128 |                 self.convertXMLEntities = True
1129 |                 self.convertHTMLEntities = False
1130 |                 self.escapeUnrecognizedEntities = False
1131 |         else:
1132 |             self.convertXMLEntities = False
1133 |             self.convertHTMLEntities = False
1134 |             self.escapeUnrecognizedEntities = False
1135 | 
1136 |         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1137 |         SGMLParser.__init__(self)
1138 | 
1139 |         if hasattr(markup, 'read'):        # It's a file-type object.
1140 |             markup = markup.read()
1141 |         self.markup = markup
1142 |         self.markupMassage = markupMassage
1143 |         try:
1144 |             self._feed(isHTML=isHTML)
1145 |         except StopParsing:
1146 |             pass
1147 |         self.markup = None                 # The markup can now be GCed
1148 | 
1149 |     def convert_charref(self, name):
1150 |         """This method fixes a bug in Python's SGMLParser."""
1151 |         try:
1152 |             n = int(name)
1153 |         except ValueError:
1154 |             return
1155 |         if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1156 |             return
1157 |         return self.convert_codepoint(n)
1158 | 
1159 |     def _feed(self, inDocumentEncoding=None, isHTML=False):
1160 |         # Convert the document to Unicode.
1161 |         markup = self.markup
1162 |         if isinstance(markup, unicode):
1163 |             if not hasattr(self, 'originalEncoding'):
1164 |                 self.originalEncoding = None
1165 |         else:
1166 |             dammit = UnicodeDammit\
1167 |                      (markup, [self.fromEncoding, inDocumentEncoding],
1168 |                       smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
1169 |             markup = dammit.unicode
1170 |             self.originalEncoding = dammit.originalEncoding
1171 |             self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
1172 |         if markup:
1173 |             if self.markupMassage:
1174 |                 if not hasattr(self.markupMassage, "__iter__"):
1175 |                     self.markupMassage = self.MARKUP_MASSAGE
1176 |                 for fix, m in self.markupMassage:
1177 |                     markup = fix.sub(m, markup)
1178 |                 # TODO: We get rid of markupMassage so that the
1179 |                 # soup object can be deepcopied later on. Some
1180 |                 # Python installations can't copy regexes. If anyone
1181 |                 # was relying on the existence of markupMassage, this
1182 |                 # might cause problems.
1183 |                 del(self.markupMassage)
1184 |         self.reset()
1185 | 
1186 |         SGMLParser.feed(self, markup)
1187 |         # Close out any unfinished strings and close all the open tags.
1188 | self.endData() 1189 | while self.currentTag.name != self.ROOT_TAG_NAME: 1190 | self.popTag() 1191 | 1192 | def __getattr__(self, methodName): 1193 | """This method routes method call requests to either the SGMLParser 1194 | superclass or the Tag superclass, depending on the method name.""" 1195 | #print "__getattr__ called on %s.%s" % (self.__class__, methodName) 1196 | 1197 | if methodName.startswith('start_') or methodName.startswith('end_') \ 1198 | or methodName.startswith('do_'): 1199 | return SGMLParser.__getattr__(self, methodName) 1200 | elif not methodName.startswith('__'): 1201 | return Tag.__getattr__(self, methodName) 1202 | else: 1203 | raise AttributeError 1204 | 1205 | def isSelfClosingTag(self, name): 1206 | """Returns true iff the given string is the name of a 1207 | self-closing tag according to this parser.""" 1208 | return self.SELF_CLOSING_TAGS.has_key(name) \ 1209 | or self.instanceSelfClosingTags.has_key(name) 1210 | 1211 | def reset(self): 1212 | Tag.__init__(self, self, self.ROOT_TAG_NAME) 1213 | self.hidden = 1 1214 | SGMLParser.reset(self) 1215 | self.currentData = [] 1216 | self.currentTag = None 1217 | self.tagStack = [] 1218 | self.quoteStack = [] 1219 | self.pushTag(self) 1220 | 1221 | def popTag(self): 1222 | tag = self.tagStack.pop() 1223 | 1224 | #print "Pop", tag.name 1225 | if self.tagStack: 1226 | self.currentTag = self.tagStack[-1] 1227 | return self.currentTag 1228 | 1229 | def pushTag(self, tag): 1230 | #print "Push", tag.name 1231 | if self.currentTag: 1232 | self.currentTag.contents.append(tag) 1233 | self.tagStack.append(tag) 1234 | self.currentTag = self.tagStack[-1] 1235 | 1236 | def endData(self, containerClass=NavigableString): 1237 | if self.currentData: 1238 | currentData = u''.join(self.currentData) 1239 | if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and 1240 | not set([tag.name for tag in self.tagStack]).intersection( 1241 | self.PRESERVE_WHITESPACE_TAGS)): 1242 | if '\n' in currentData: 1243 | currentData = '\n' 1244 | else: 1245 | currentData = ' ' 1246 | self.currentData = [] 1247 | if self.parseOnlyThese and len(self.tagStack) <= 1 and \ 1248 | (not self.parseOnlyThese.text or \ 1249 | not self.parseOnlyThese.search(currentData)): 1250 | return 1251 | o = containerClass(currentData) 1252 | o.setup(self.currentTag, self.previous) 1253 | if self.previous: 1254 | self.previous.next = o 1255 | self.previous = o 1256 | self.currentTag.contents.append(o) 1257 | 1258 | 1259 | def _popToTag(self, name, inclusivePop=True): 1260 | """Pops the tag stack up to and including the most recent 1261 | instance of the given tag. 
If inclusivePop is false, pops the tag
1262 |         stack up to but *not* including the most recent instance of
1263 |         the given tag."""
1264 |         #print "Popping to %s" % name
1265 |         if name == self.ROOT_TAG_NAME:
1266 |             return
1267 | 
1268 |         numPops = 0
1269 |         mostRecentTag = None
1270 |         for i in range(len(self.tagStack)-1, 0, -1):
1271 |             if name == self.tagStack[i].name:
1272 |                 numPops = len(self.tagStack)-i
1273 |                 break
1274 |         if not inclusivePop:
1275 |             numPops = numPops - 1
1276 | 
1277 |         for i in range(0, numPops):
1278 |             mostRecentTag = self.popTag()
1279 |         return mostRecentTag
1280 | 
1281 |     def _smartPop(self, name):
1282 | 
1283 |         """We need to pop up to the previous tag of this type, unless
1284 |         one of this tag's nesting reset triggers comes between this
1285 |         tag and the previous tag of this type, OR unless this tag is a
1286 |         generic nesting trigger and another generic nesting trigger
1287 |         comes between this tag and the previous tag of this type.
1288 | 
1289 |         Examples:
1290 |          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1291 |          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1292 |          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1293 | 
1294 |          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1295 |          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1296 |          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1297 |         """
1298 | 
1299 |         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1300 |         isNestable = nestingResetTriggers != None
1301 |         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1302 |         popTo = None
1303 |         inclusive = True
1304 |         for i in range(len(self.tagStack)-1, 0, -1):
1305 |             p = self.tagStack[i]
1306 |             if (not p or p.name == name) and not isNestable:
1307 |                 #Non-nestable tags get popped to the top or to their
1308 |                 #last occurrence.
1309 |                 popTo = name
1310 |                 break
1311 |             if (nestingResetTriggers is not None
1312 |                 and p.name in nestingResetTriggers) \
1313 |                 or (nestingResetTriggers is None and isResetNesting
1314 |                     and self.RESET_NESTING_TAGS.has_key(p.name)):
1315 | 
1316 |                 #If we encounter one of the nesting reset triggers
1317 |                 #peculiar to this tag, or we encounter another tag
1318 |                 #that causes nesting to reset, pop up to but not
1319 |                 #including that tag.
1320 |                 popTo = p.name
1321 |                 inclusive = False
1322 |                 break
1323 |             p = p.parent
1324 |         if popTo:
1325 |             self._popToTag(popTo, inclusive)
1326 | 
1327 |     def unknown_starttag(self, name, attrs, selfClosing=0):
1328 |         #print "Start tag %s: %s" % (name, attrs)
1329 |         if self.quoteStack:
1330 |             #This is not a real tag.
1331 |             #print "<%s> is not real!" % name
1332 |             attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
1333 |             self.handle_data('<%s%s>' % (name, attrs))
1334 |             return
1335 |         self.endData()
1336 | 
1337 |         if not self.isSelfClosingTag(name) and not selfClosing:
1338 |             self._smartPop(name)
1339 | 
1340 |         if self.parseOnlyThese and len(self.tagStack) <= 1 \
1341 |                and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1342 |             return
1343 | 
1344 |         tag = Tag(self, name, attrs, self.currentTag, self.previous)
1345 |         if self.previous:
1346 |             self.previous.next = tag
1347 |         self.previous = tag
1348 |         self.pushTag(tag)
1349 |         if selfClosing or self.isSelfClosingTag(name):
1350 |             self.popTag()
1351 |         if name in self.QUOTE_TAGS:
1352 |             #print "Beginning quote (%s)" % name
1353 |             self.quoteStack.append(name)
1354 |             self.literal = 1
1355 |         return tag
1356 | 
1357 |     def unknown_endtag(self, name):
1358 |         #print "End tag %s" % name
1359 |         if self.quoteStack and self.quoteStack[-1] != name:
1360 |             #This is not a real end tag.
1361 |             #print "</%s> is not real!" % name
1362 |             self.handle_data('</%s>' % name)
1363 |             return
1364 |         self.endData()
1365 |         self._popToTag(name)
1366 |         if self.quoteStack and self.quoteStack[-1] == name:
1367 |             self.quoteStack.pop()
1368 |             self.literal = (len(self.quoteStack) > 0)
1369 | 
1370 |     def handle_data(self, data):
1371 |         self.currentData.append(data)
1372 | 
1373 |     def _toStringSubclass(self, text, subclass):
1374 |         """Adds a certain piece of text to the tree as a NavigableString
1375 |         subclass."""
1376 |         self.endData()
1377 |         self.handle_data(text)
1378 |         self.endData(subclass)
1379 | 
1380 |     def handle_pi(self, text):
1381 |         """Handle a processing instruction as a ProcessingInstruction
1382 |         object, possibly one with a %SOUP-ENCODING% slot into which an
1383 |         encoding will be plugged later."""
1384 |         if text[:3] == "xml":
1385 |             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1386 |         self._toStringSubclass(text, ProcessingInstruction)
1387 | 
1388 |     def handle_comment(self, text):
1389 |         "Handle comments as Comment objects."
1390 |         self._toStringSubclass(text, Comment)
1391 | 
1392 |     def handle_charref(self, ref):
1393 |         "Handle character references as data."
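        # (Added example: for markup like "&#8364;" sgmllib calls this
        # method with ref == "8364"; with convertEntities set the data
        # becomes u'\u20ac', otherwise the reference passes through
        # verbatim as "&#8364;".)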
1394 |         if self.convertEntities:
1395 |             data = unichr(int(ref))
1396 |         else:
1397 |             data = '&#%s;' % ref
1398 |         self.handle_data(data)
1399 | 
1400 |     def handle_entityref(self, ref):
1401 |         """Handle entity references as data, possibly converting known
1402 |         HTML and/or XML entity references to the corresponding Unicode
1403 |         characters."""
1404 |         data = None
1405 |         if self.convertHTMLEntities:
1406 |             try:
1407 |                 data = unichr(name2codepoint[ref])
1408 |             except KeyError:
1409 |                 pass
1410 | 
1411 |         if not data and self.convertXMLEntities:
1412 |             data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1413 | 
1414 |         if not data and self.convertHTMLEntities and \
1415 |             not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1416 |             # TODO: We've got a problem here. We're told this is
1417 |             # an entity reference, but it's not an XML entity
1418 |             # reference or an HTML entity reference. Nonetheless,
1419 |             # the logical thing to do is to pass it through as an
1420 |             # unrecognized entity reference.
1421 |             #
1422 |             # Except: when the input is "&carol;" this function
1423 |             # will be called with input "carol". When the input is
1424 |             # "AT&T", this function will be called with input
1425 |             # "T". We have no way of knowing whether a semicolon
1426 |             # was present originally, so we don't know whether
1427 |             # this is an unknown entity or just a misplaced
1428 |             # ampersand.
1429 |             #
1430 |             # The more common case is a misplaced ampersand, so I
1431 |             # escape the ampersand and omit the trailing semicolon.
1432 |             data = "&amp;%s" % ref
1433 |         if not data:
1434 |             # This case is different from the one above, because we
1435 |             # haven't already gone through a supposedly comprehensive
1436 |             # mapping of entities to Unicode characters. We might not
1437 |             # have gone through any mapping at all. So the chances are
1438 |             # very high that this is a real entity, and not a
1439 |             # misplaced ampersand.
1440 |             data = "&%s;" % ref
1441 |         self.handle_data(data)
1442 | 
1443 |     def handle_decl(self, data):
1444 |         "Handle DOCTYPEs and the like as Declaration objects."
1445 |         self._toStringSubclass(data, Declaration)
1446 | 
1447 |     def parse_declaration(self, i):
1448 |         """Treat a bogus SGML declaration as raw data. Treat a CDATA
1449 |         declaration as a CData object."""
1450 |         j = None
1451 |         if self.rawdata[i:i+9] == '<![CDATA[':
1452 |             k = self.rawdata.find(']]>', i)
1453 |             if k == -1:
1454 |                 k = len(self.rawdata)
1455 |             data = self.rawdata[i+9:k]
1456 |             j = k+3
1457 |             self._toStringSubclass(data, CData)
1458 |         else:
1459 |             try:
1460 |                 j = SGMLParser.parse_declaration(self, i)
1461 |             except SGMLParseError:
1462 |                 toHandle = self.rawdata[i:]
1463 |                 self.handle_data(toHandle)
1464 |                 j = i + len(toHandle)
1465 |         return j
1466 | 
1467 | class BeautifulSoup(BeautifulStoneSoup):
1468 | 
1469 |     """This parser knows the following facts about HTML:
1470 | 
1471 |     * Some tags have no closing tag and should be interpreted as being
1472 |       closed as soon as they are encountered.
1473 | 
1474 |     * The text inside some tags (ie. 'script') may contain tags which
1475 |       are not really part of the document and which should be parsed
1476 |       as text, not tags. If you want to parse the text as tags, you can
1477 |       always fetch it and parse it explicitly.
1478 | 
1479 |     * Tag nesting rules:
1480 | 
1481 |       Most tags can't be nested at all. For instance, the occurrence of
1482 |       a <p> tag should implicitly close the previous <p> tag.
1483 | 
1484 |        <p>Para1<p>Para2
1485 |         should be transformed into:
1486 |        <p>Para1</p><p>Para2
1487 | 
1488 |       Some tags can be nested arbitrarily. For instance, the occurrence
1489 |       of a <blockquote> tag should _not_ implicitly close the previous
1490 |       <blockquote> tag.
1491 | 
1492 |        Alice said: <blockquote>Bob said: <blockquote>Blah
1493 |         should NOT be transformed into:
1494 |        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1495 | 
1496 |       Some tags can be nested, but the nesting is reset by the
1497 |       interposition of other tags. For instance, a <tr> tag should
1498 |       implicitly close the previous <tr> tag within the same <table>,
1499 |       but not close a <tr> tag in another table.
1500 | 
1501 |        <table><tr>Blah<tr>Blah
1502 |         should be transformed into:
1503 |        <table><tr>Blah</tr><tr>Blah
1504 |        but,
1505 |        <tr>Blah<table><tr>Blah
1506 |         should NOT be transformed into
1507 |        <tr>Blah<table></tr><tr>Blah
1508 | 
1509 |       Differing assumptions about tag nesting rules are a major source
1510 |       of problems with the BeautifulSoup class. If BeautifulSoup is not
1511 |       treating as nestable a tag your page author treats as nestable,
1512 |       try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1513 |       BeautifulStoneSoup before writing your own subclass."""
1514 | 
1515 |     def __init__(self, *args, **kwargs):
1516 |         if not kwargs.has_key('smartQuotesTo'):
1517 |             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1518 |         kwargs['isHTML'] = True
1519 |         BeautifulStoneSoup.__init__(self, *args, **kwargs)
1520 | 
1521 |     SELF_CLOSING_TAGS = buildTagMap(None,
1522 |                                     ('br' , 'hr', 'input', 'img', 'meta',
1523 |                                      'spacer', 'link', 'frame', 'base', 'col'))
1524 | 
1525 |     PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
1526 | 
1527 |     QUOTE_TAGS = {'script' : None, 'textarea' : None}
1528 | 
1529 |     #According to the HTML standard, each of these inline tags can
1530 |     #contain another tag of the same type. Furthermore, it's common
1531 |     #to actually use these tags this way.
1532 |     NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1533 |                             'center')
1534 | 
1535 |     #According to the HTML standard, these block tags can contain
1536 |     #another tag of the same type. Furthermore, it's common
1537 |     #to actually use these tags this way.
1538 |     NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')
1539 | 
1540 |     #Lists can contain other lists, but there are restrictions.
1541 |     NESTABLE_LIST_TAGS = { 'ol' : [],
1542 |                            'ul' : [],
1543 |                            'li' : ['ul', 'ol'],
1544 |                            'dl' : [],
1545 |                            'dd' : ['dl'],
1546 |                            'dt' : ['dl'] }
1547 | 
1548 |     #Tables can contain other tables, but there are restrictions.
1549 |     NESTABLE_TABLE_TAGS = {'table' : [],
1550 |                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1551 |                            'td' : ['tr'],
1552 |                            'th' : ['tr'],
1553 |                            'thead' : ['table'],
1554 |                            'tbody' : ['table'],
1555 |                            'tfoot' : ['table'],
1556 |                            }
1557 | 
1558 |     NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')
1559 | 
1560 |     #If one of these tags is encountered, all tags up to the next tag of
1561 |     #this type are popped.
1562 |     RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1563 |                                      NON_NESTABLE_BLOCK_TAGS,
1564 |                                      NESTABLE_LIST_TAGS,
1565 |                                      NESTABLE_TABLE_TAGS)
1566 | 
1567 |     NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1568 |                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1569 | 
1570 |     # Used to detect the charset in a META tag; see start_meta
1571 |     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
1572 | 
1573 |     def start_meta(self, attrs):
1574 |         """Beautiful Soup can detect a charset included in a META tag,
1575 |         try to convert the document to that charset, and re-parse the
1576 |         document from the beginning."""
1577 |         httpEquiv = None
1578 |         contentType = None
1579 |         contentTypeIndex = None
1580 |         tagNeedsEncodingSubstitution = False
1581 | 
1582 |         for i in range(0, len(attrs)):
1583 |             key, value = attrs[i]
1584 |             key = key.lower()
1585 |             if key == 'http-equiv':
1586 |                 httpEquiv = value
1587 |             elif key == 'content':
1588 |                 contentType = value
1589 |                 contentTypeIndex = i
1590 | 
1591 |         if httpEquiv and contentType: # It's an interesting meta tag.
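            # (Added commentary on the two-pass trick below: on the first
            # pass, a charset found here triggers StopParsing and a
            # re-parse of the whole document in that encoding; on a later
            # pass the charset is rewritten to %SOUP-ENCODING% so the tag
            # re-serializes with whatever encoding the tree is eventually
            # output in.)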
1592 |             match = self.CHARSET_RE.search(contentType)
1593 |             if match:
1594 |                 if (self.declaredHTMLEncoding is not None or
1595 |                     self.originalEncoding == self.fromEncoding):
1596 |                     # An HTML encoding was sniffed while converting
1597 |                     # the document to Unicode, or an HTML encoding was
1598 |                     # sniffed during a previous pass through the
1599 |                     # document, or an encoding was specified
1600 |                     # explicitly and it worked. Rewrite the meta tag.
1601 |                     def rewrite(match):
1602 |                         return match.group(1) + "%SOUP-ENCODING%"
1603 |                     newAttr = self.CHARSET_RE.sub(rewrite, contentType)
1604 |                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1605 |                                                newAttr)
1606 |                     tagNeedsEncodingSubstitution = True
1607 |                 else:
1608 |                     # This is our first pass through the document.
1609 |                     # Go through it again with the encoding information.
1610 |                     newCharset = match.group(3)
1611 |                     if newCharset and newCharset != self.originalEncoding:
1612 |                         self.declaredHTMLEncoding = newCharset
1613 |                         self._feed(self.declaredHTMLEncoding)
1614 |                         raise StopParsing
1615 | 
1616 |         tag = self.unknown_starttag("meta", attrs)
1617 |         if tag and tagNeedsEncodingSubstitution:
1618 |             tag.containsSubstitutions = True
1619 | 
1620 | class StopParsing(Exception):
1621 |     pass
1622 | 
1623 | class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1624 | 
1625 |     """The BeautifulSoup class is oriented towards skipping over
1626 |     common HTML errors like unclosed tags. However, sometimes it makes
1627 |     errors of its own. For instance, consider this fragment:
1628 | 
1629 |      <b>Foo<b>Bar</b></b>
1630 | 
1631 |     This is perfectly valid (if bizarre) HTML. However, the
1632 |     BeautifulSoup class will implicitly close the first b tag when it
1633 |     encounters the second 'b'. It will think the author wrote
1634 |     "<b>Foo<b>Bar</b>", and didn't close the first 'b' tag, because
1635 |     there's no real-world reason to bold something that's already
1636 |     bold. When it encounters '</b></b>' it will close two more 'b'
1637 |     tags, for a grand total of three tags closed instead of two. This
1638 |     can throw off the rest of your document structure. The same is
1639 |     true of a number of other tags, listed below.
1640 | 
1641 |     It's much more common for someone to forget to close a 'b' tag
1642 |     than to actually use nested 'b' tags, and the BeautifulSoup class
1643 |     handles the common case. This class handles the not-so-common
1644 |     case: where you can't believe someone wrote what they did, but
1645 |     it's valid HTML and BeautifulSoup screwed up by assuming it
1646 |     wouldn't be."""
1647 | 
1648 |     I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1649 |         ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1650 |          'cite', 'code', 'dfn', 'kbd', 'samp', 'var',
1651 |          'b')
1652 | 
1653 |     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)
1654 | 
1655 |     NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1656 |                                 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1657 |                                 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
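# Editor's note: an illustrative sketch, not part of the original module;
# the function name is hypothetical and nothing calls it automatically.
# It contrasts the two nesting policies described in the docstrings above.
def _demo_nesting_policies():
    # BeautifulSoup treats 'b' as non-nestable: the second <b> implicitly
    # closes the first, and the trailing </b></b> then close still more
    # open tags, as the ICantBelieveItsBeautifulSoup docstring warns.
    print BeautifulSoup("<b>Foo<b>Bar</b></b>").prettify()
    # ICantBelieveItsBeautifulSoup keeps the author's nested 'b' tags.
    print ICantBelieveItsBeautifulSoup("<b>Foo<b>Bar</b></b>").prettify()
    # The reset rule from the BeautifulSoup docstring: a new <tr> closes
    # the previous <tr> within the same <table>.
    print BeautifulSoup("<table><tr>Blah<tr>Blah</table>").prettify()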
1658 | 
1659 | class MinimalSoup(BeautifulSoup):
1660 |     """The MinimalSoup class is for parsing HTML that contains
1661 |     pathologically bad markup. It makes no assumptions about tag
1662 |     nesting, but it does know which tags are self-closing, that
1663 |     <script> tags contain Javascript and should not be parsed, that
1664 |     META tags may contain encoding information, and so on.
1665 | 
1666 |     This also makes it better for subclassing than BeautifulStoneSoup
1667 |     or BeautifulSoup."""
/google.py:
--------------------------------------------------------------------------------
", html)
175 |         if match:
176 |             init = unicode(match.group(1), errors="ignore")
177 |             tokens = init.split('],[')
178 |             for token in tokens:
179 |                 res = ImageResult()
180 |                 res.page = i
181 |                 res.index = j
182 |                 toks = token.split(",")
183 | 
184 |                 # should be 32 or 33, but seems to change, so just make
185 |                 # sure no exceptions will be thrown by the indexing
186 |                 if len(toks) > 22:
187 |                     for t in range(len(toks)):
188 |                         toks[t] = toks[t].replace('\\x3cb\\x3e', '').replace('\\x3c/b\\x3e', '').replace('\\x3d', '=').replace('\\x26', '&')
189 |                     match = re.search("imgurl=(?P<link>[^&]+)&imgrefurl", toks[0])
190 |                     if match:
191 |                         res.link = match.group("link")
192 |                     res.name = toks[6].replace('"', '')
193 |                     res.thumb = toks[21].replace('"', '')
194 |                     res.format = toks[10].replace('"', '')
195 |                     res.domain = toks[11].replace('"', '')
196 |                     match = re.search("(?P<width>[0-9]+) × (?P<height>[0-9]+) - (?P<size>[^ ]+)", toks[9].replace('"', ''))
197 |                     if match:
198 |                         res.width = match.group("width")
199 |                         res.height = match.group("height")
200 |                         res.filesize = match.group("size")
201 |                     results.append(res)
202 |                     j = j + 1
203 |         return results
204 | 
205 |     @staticmethod
206 |     def search_images(query, image_options=None, pages=1):
207 |         results = []
208 |         for i in range(pages):
209 |             url = get_image_search_url(query, image_options, i)
210 |             html = get_html(url)
211 |             if html:
212 |                 if Google.DEBUG_MODE:
213 |                     write_html_to_file(html, "images_{0}_{1}.html".format(query.replace(" ", "_"), i))
214 |                 soup = BeautifulSoup(html)
215 |                 j = 0
216 |                 tds = soup.findAll("td")
217 |                 for td in tds:
218 |                     a = td.find("a")
219 |                     if a and a["href"].find("imgurl") != -1:
220 |                         res = ImageResult()
221 |                         res.page = i
222 |                         res.index = j
223 |                         tokens = a["href"].split("&")
224 |                         match = re.search("imgurl=(?P<link>[^&]+)", tokens[0])
225 |                         if match:
226 |                             res.link = match.group("link")
227 |                             res.format = res.link[res.link.rfind(".") + 1:]
228 |                         img = td.find("img")
229 |                         if img:
230 |                             res.thumb = img["src"]
231 |                             res.thumb_width = img["width"]
232 |                             res.thumb_height = img["height"]
233 |                         match = re.search("(?P<width>[0-9]+) × (?P<height>[0-9]+) - (?P<size>[^&]+)", td.text)
234 |                         if match:
235 |                             res.width = match.group("width")
236 |                             res.name = td.text[:td.text.find(res.width)]
237 |                             res.height = match.group("height")
238 |                             res.filesize = match.group("size")
239 |                         cite = td.find("cite")
240 |                         if cite:
241 |                             res.domain = cite["title"]
242 |                         results.append(res)
243 |                         j = j + 1
244 |         return results
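    # Editor's usage sketch, not part of the original source. ImageOptions
    # is the options object consumed by get_image_search_url and is assumed
    # to be defined in the portion of this module not shown here; the field
    # values below are only examples. A result carries a field only when
    # the matching markup was actually found on the page.
    #
    #     options = ImageOptions()
    #     options.image_type = ImageType.PHOTO
    #     for res in Google.search_images("kittens", options, pages=2):
    #         print res.link, res.width, res.height, res.domain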
"f") 283 | if f: 284 | res.subtext = f.text.strip() 285 | 286 | price = prod.find("div", "psliprice") 287 | if price: 288 | res.min_price = price.text.strip() 289 | 290 | results.append(res) 291 | j = j + 1 292 | return results 293 | 294 | """ 295 | Converts one currency to another. 296 | [amount] from_curreny = [return_value] to_currency 297 | """ 298 | @staticmethod 299 | def convert_currency(amount, from_currency, to_currency): 300 | if from_currency == to_currency: 301 | return 1.0 302 | conn = httplib.HTTPSConnection("www.google.com") 303 | req_url = "/ig/calculator?hl=en&q={0}{1}=?{2}".format(amount, from_currency.replace(" ", "%20"), to_currency.replace(" ", "%20")) 304 | headers = { "User-Agent": "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" } 305 | conn.request("GET", req_url, "", headers) 306 | response = conn.getresponse() 307 | rval = response.read().decode("utf-8").replace(u"\xa0", "") 308 | conn.close() 309 | rhs = rval.split(",")[1].strip() 310 | s = rhs[rhs.find('"')+1:] 311 | rate = s[:s.find(" ")] 312 | return float(rate) 313 | 314 | """ 315 | Gets the exchange rate of one currency to another. 316 | 1 from_curreny = [return_value] to_currency 317 | """ 318 | @staticmethod 319 | def exchange_rate(from_currency, to_currency): 320 | return Google.convert_currency(1, from_currency, to_currency) 321 | 322 | """ 323 | Attempts to use google calculator to calculate the result of expr 324 | """ 325 | @staticmethod 326 | def calculate(expr): 327 | conn = httplib.HTTPSConnection("www.google.com") 328 | req_url = "/ig/calculator?hl=en&q={0}".format(expr.replace(" ", "%20")) 329 | headers = { "User-Agent": "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101" } 330 | conn.request("GET", req_url, "", headers) 331 | response = conn.getresponse() 332 | j = response.read().decode("utf-8").replace(u"\xa0", "") 333 | conn.close() 334 | j = re.sub(r"{\s*'?(\w)", r'{"\1', j) 335 | j = re.sub(r",\s*'?(\w)", r',"\1', j) 336 | j = re.sub(r"(\w)'?\s*:", r'\1":', j) 337 | j = re.sub(r":\s*'(\w)'\s*([,}])", r':"\1"\2', j) 338 | js = json.loads(j) 339 | return parse_calc_result(js["lhs"] + " = " + js["rhs"]) 340 | 341 | def normalize_query(query): 342 | return query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+") 343 | 344 | def get_search_url(query, page = 0, per_page = 10): 345 | # note: num per page might not be supported by google anymore (because of google instant) 346 | return "http://www.google.com/search?hl=en&q=%s&start=%i&num=%i" % (normalize_query(query), page * per_page, per_page) 347 | 348 | def get_shopping_url(query, page=0, per_page=10): 349 | return "http://www.google.com/search?hl=en&q={0}&tbm=shop&start={1}&num={2}".format(normalize_query(query), page * per_page, per_page) 350 | 351 | class ImageType: 352 | NONE = None 353 | FACE = "face" 354 | PHOTO = "photo" 355 | CLIPART = "clipart" 356 | LINE_DRAWING = "lineart" 357 | 358 | class SizeCategory: 359 | NONE = None 360 | ICON = "i" 361 | LARGE = "l" 362 | MEDIUM = "m" 363 | SMALL = "s" 364 | LARGER_THAN = "lt" 365 | EXACTLY = "ex" 366 | 367 | class LargerThan: 368 | NONE = None 369 | QSVGA = "qsvga" # 400 x 300 370 | VGA = "vga" # 640 x 480 371 | SVGA = "svga" # 800 x 600 372 | XGA = "xga" # 1024 x 768 373 | MP_2 = "2mp" # 2 MP (1600 x 1200) 374 | MP_4 = "4mp" # 4 MP (2272 x 1704) 375 | MP_6 = "6mp" # 6 MP (2816 x 2112) 376 | MP_8 = "8mp" # 8 MP (3264 x 2448) 377 | MP_10 = "10mp" # 10 MP (3648 x 2736) 378 | MP_12 = "12mp" # 12 MP (4096 x 3072) 379 | MP_15 = 
"15mp" # 15 MP (4480 x 3360) 380 | MP_20 = "20mp" # 20 MP (5120 x 3840) 381 | MP_40 = "40mp" # 40 MP (7216 x 5412) 382 | MP_70 = "70mp" # 70 MP (9600 x 7200) 383 | 384 | class ColorType: 385 | NONE = None 386 | COLOR = "color" 387 | BLACK_WHITE = "gray" 388 | SPECIFIC = "specific" 389 | 390 | def get_image_search_url(query, image_options=None, page=0, per_page=20): 391 | query = query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+") 392 | url = "http://images.google.com/images?q=%s&sa=N&start=%i&ndsp=%i&sout=1" % (query, page * per_page, per_page) 393 | if image_options: 394 | tbs = image_options.get_tbs() 395 | if tbs: 396 | url = url + tbs 397 | return url 398 | 399 | def add_to_tbs(tbs, name, value): 400 | if tbs: 401 | return "%s,%s:%s" % (tbs, name, value) 402 | else: 403 | return "&tbs=%s:%s" % (name, value) 404 | 405 | def parse_calc_result(string): 406 | result = CalculatorResult() 407 | result.fullstring = string 408 | string = string.strip().replace(u"\xa0", " ") 409 | if string.find("=") != -1: 410 | result.expr = string[:string.rfind("=")].strip() 411 | string = string[string.rfind("=") + 2:] 412 | result.result = string 413 | tokens = string.split(" ") 414 | if len(tokens) > 0: 415 | result.value = "" 416 | for token in tokens: 417 | if is_number(token): 418 | result.value = result.value + token 419 | else: 420 | if result.unit: 421 | result.unit = result.unit + " " + token 422 | else: 423 | result.unit = token 424 | return result 425 | return None 426 | 427 | def is_number(s): 428 | try: 429 | float(s) 430 | return True 431 | except ValueError: 432 | return False 433 | 434 | def get_html(url): 435 | try: 436 | request = urllib2.Request(url) 437 | request.add_header("User-Agent", "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101") 438 | html = urllib2.urlopen(request).read() 439 | return html 440 | except: 441 | print "Error accessing:", url 442 | return None 443 | 444 | def write_html_to_file(html, filename): 445 | of = open(filename, "w") 446 | of.write(html) 447 | of.flush() 448 | of.close() 449 | 450 | def test(): 451 | search = Google.search("github") 452 | if search is None or len(search) == 0: 453 | print "ERROR: No Search Results!" 454 | else: 455 | print "PASSED: {0} Search Results".format(len(search)) 456 | 457 | shop = Google.shopping("Disgaea 4") 458 | if shop is None or len(shop) == 0: 459 | print "ERROR: No Shopping Results!" 460 | else: 461 | print "PASSED: {0} Shopping Results".format(len(shop)) 462 | 463 | options = ImageOptions() 464 | options.image_type = ImageType.CLIPART 465 | options.larger_than = LargerThan.MP_4 466 | options.color = "green" 467 | images = Google.search_images("banana", options) 468 | if images is None or len(images) == 0: 469 | print "ERROR: No Image Results!" 470 | else: 471 | print "PASSED: {0} Image Results".format(len(images)) 472 | 473 | calc = Google.calculate("157.3kg in grams") 474 | if calc is not None and int(calc.value) == 157300: 475 | print "PASSED: Calculator passed" 476 | else: 477 | print "ERROR: Calculator failed!" 478 | 479 | euros = Google.convert_currency(5.0, "USD", "EUR") 480 | if euros is not None and euros > 0.0: 481 | print "PASSED: Currency convert passed" 482 | else: 483 | print "ERROR: Currency convert failed!" 
484 | 485 | def main(): 486 | if len(sys.argv) > 1 and sys.argv[1] == "--debug": 487 | Google.DEBUG_MODE = True 488 | print "DEBUG_MODE ENABLED" 489 | test() 490 | 491 | if __name__ == "__main__": 492 | main() 493 | --------------------------------------------------------------------------------