├── .gitignore
├── README.md
├── emails.csv
├── main.py
└── xgoogle
    ├── BeautifulSoup.py
    ├── BeautifulSoup.pyc
    ├── __init__.py
    ├── __init__.pyc
    ├── browser.py
    ├── browser.pyc
    ├── googlesets.py
    ├── search.py
    ├── search.pyc
    ├── sponsoredlinks.py
    └── translate.py


/.gitignore:
--------------------------------------------------------------------------------
1 | .pyc
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Google-EmailScraper
 2 | ===================
 3 | 
 4 | This is a scraper that searches Google based on a query and scrapes all
 5 | emails found on each page Google finds.
 6 | 
 7 | Requirements
 8 | ------------
 9 | * Python 2.6+
10 | 
11 | Instructions
12 | ------------
13 | To use this scraper, you'll need to run main.py with Python and pass in
14 | the following arguments
15 | 
16 | * -query (this is what we're telling Google to search for)
17 | * -pages (number of Google search results pages we should scrape)
18 | * -o     (output filename) 
19 | 
20 | Example
21 | -------
22 | ```
23 | python main.py -query "adoption agency email" -pages 10 -o emails.csv
24 | ```
25 | 


--------------------------------------------------------------------------------
/emails.csv:
--------------------------------------------------------------------------------
 1 | India - Panel Physicians,http://www.immi.gov.au/contacts/overseas/i/india/panel-doctors.htm,healthcheck@apolloahd.com
 2 | India - Panel Physicians,http://www.immi.gov.au/contacts/overseas/i/india/panel-doctors.htm,immigration_cni@apollohospitals.com
 3 | India - Panel Physicians,http://www.immi.gov.au/contacts/overseas/i/india/panel-doctors.htm,malathi_r@apollohospitals.com
 4 | India - Panel Physicians,http://www.immi.gov.au/contacts/overseas/i/india/panel-doctors.htm,pulsevisamedicals@gmail.com
 5 | India - Panel Physicians,http://www.immi.gov.au/contacts/overseas/i/india/panel-doctors.htm,pulsecal@gmail.com
 6 | India - Panel Physicians,http://www.immi.gov.au/contacts/overseas/i/india/panel-doctors.htm,cheemayv@yahoo.com
 7 | India - Panel Physicians,http://www.immi.gov.au/contacts/overseas/i/india/panel-doctors.htm,management@cdcuniverse.com
 8 | India - Panel Physicians,http://www.immi.gov.au/contacts/overseas/i/india/panel-doctors.htm,jayant.rele@releclinic.com
 9 | Indian Doctors | LinkedIn,http://www.linkedin.com/groups/Indian-Doctors-1884718,satish.boyana@harneedi.com.
10 | Indian Doctors | LinkedIn,http://www.linkedin.com/groups/Indian-Doctors-1884718,vaibhavchordia@hotmail.com
11 | Indian Doctors | LinkedIn,http://www.linkedin.com/groups/Indian-Doctors-1884718,satish.boyana@harneedi.com
12 | Indian Doctors | LinkedIn,http://www.linkedin.com/groups/Indian-Doctors-1884718,partimer@lakshmillc.com
13 | Indian Doctors | LinkedIn,http://www.linkedin.com/groups/Indian-Doctors-1884718,doctors@kaizenmr.com
14 | Indian Doctors | LinkedIn,http://www.linkedin.com/groups/Indian-Doctors-1884718,MSINGHSHASHI@GMAIL.COM
15 | Common menu bar links - Panel Physician,http://www.cic.gc.ca/dmp-md/medicalinfo.aspx?CountryID=1995&CountryName=India,vinod.beedwal@maxhealthcare.com
16 | When Email Is Part of the Doctor's Treatment - WSJ.com,http://online.wsj.com/article/SB10001424127887324373204578376863506224702.html,sumathi.reddy@wsj.com
17 | When Email Is Part of the Doctor's Treatment - WSJ.com,http://online.wsj.com/article/SB10001424127887324373204578376863506224702.html,TBD@wsj.com
18 | "India Needs Doctors, Nurses, and Health Insurance - Businessweek",http://www.businessweek.com/articles/2012-05-30/india-needs-doctors-nurses-and-health-insurance,beinhorn1@bloomberg.net
19 | Consult the Best Doctors in India - Best Doctors & Hospitals in India,http://www.healthinindia.com/consult_doctor.html,info@healthinindia.com
20 | Consult the Best Doctors in India - Best Doctors & Hospitals in India,http://www.healthinindia.com/consult_doctor.html,md@healthinindia.com
21 | Consult the Best Doctors in India - Best Doctors & Hospitals in India,http://www.healthinindia.com/consult_doctor.html,harcourt@healthinindia.com
22 | Doctors - Just Dial Homepage,http://www.justdial.com/Vadodara/doctors,abc@xyz.com
23 | The Association of Physicians of India,http://www.apiindia.org/,a.muruganathan@gmail.com
24 | The Association of Physicians of India,http://www.apiindia.org/,shashank.sr@gmail.com
25 | The Association of Physicians of India,http://www.apiindia.org/,drsbgupta@gmail.com
26 | The Association of Physicians of India,http://www.apiindia.org/,api_ho@vsnl.com
27 | The Association of Physicians of India,http://www.apiindia.org/,contact@apicon2014.com
28 | The Association of Physicians of India,http://www.apiindia.org/,api.hdo@gmail.com
29 | India Physician Email List- FreelanceFree Open Projects,http://www.freelancefree.com/project.php?id=1312997172,rrspindia@gmail.com
30 | Global Future Physician - Health Careers Center - University of ...,http://www.healthcareers.umn.edu/courses/global-future-physician/index.htm,todd0002@umn.edu
31 | Global Future Physician - Health Careers Center - University of ...,http://www.healthcareers.umn.edu/courses/global-future-physician/index.htm,HCC@umn.edu
32 | U.S.-India Physician Exchange Program 'Ready to Go' - Indiawest.com,http://www.indiawest.com/news/9044-u-s-india-physician-exchange-program-ready-to-go.html,web@indiawest.com
33 | U.S.-India Physician Exchange Program 'Ready to Go' - Indiawest.com,http://www.indiawest.com/news/9044-u-s-india-physician-exchange-program-ready-to-go.html,shwetaverma@4cplus.com
34 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Copyright 2013 Kendrick Ledet
 3 | 
 4 | This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
 5 | the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 6 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
 7 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 8 | You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 9 | 
10 | Google-EmailScraper
11 | 
12 | Purpose: Scraper that searches Google based on a query and scrapes all emails found on each page.
13 | Output files are saved as csv.
14 | 
15 | Date: 5/26/13
16 | '''
17 | from xgoogle.search import GoogleSearch
18 | import urllib2, re, csv, os
19 | import argparse
20 | 
21 | class ScrapeProcess(object):
22 |     emails = []  # for duplication prevention
23 | 
24 |     def __init__(self, filename):
25 |         self.filename  = filename
26 |         self.csvfile   = open(filename, 'wb+')
27 |         self.csvwriter = csv.writer(self.csvfile)
28 | 
29 |     def go(self, query, pages):
30 |         search = GoogleSearch(query)
31 |         search.results_per_page = 10
32 | 
33 |         for i in range(pages):
34 |             search.page = i
35 |             results = search.get_results()
36 |             for page in results:
37 |                 self.scrape(page)
38 |             
39 |     def scrape(self, page):
40 |         try:
41 |             request = urllib2.Request(page.url.encode("utf8"))
42 |             html    = urllib2.urlopen(request).read()
43 |         except Exception, e:
44 |             return
45 | 
46 |         emails = re.findall(r'([A-Za-z0-9\.\+_-]+@[A-Za-z0-9\._-]+\.[a-zA-Z]*)', html)
47 | 
48 |         for email in emails:
49 |             if email not in self.emails:  # if not a duplicate
50 |                 self.csvwriter.writerow([page.title.encode('utf8'), page.url.encode("utf8"), email])
51 |                 self.emails.append(email)
52 | 
53 | parser = argparse.ArgumentParser(description='Scrape Google results for emails')
54 | parser.add_argument('-query', type=str, default='test', help='a query to use for the Google search')
55 | parser.add_argument('-pages', type=int, default=10, help='number of Google results pages to scrape')
56 | parser.add_argument('-o', type=str, default='emails.csv', help='output filename')
57 | 
58 | args   = parser.parse_args()
59 | args.o = args.o+'.csv' if '.csv' not in args.o else args.o  # make sure filename has .csv extension
60 | 
61 | s = ScrapeProcess(args.o)
62 | s.go(args.query, args.pages)


--------------------------------------------------------------------------------
/xgoogle/BeautifulSoup.py:
--------------------------------------------------------------------------------
   1 | """Beautiful Soup
   2 | Elixir and Tonic
   3 | "The Screen-Scraper's Friend"
   4 | http://www.crummy.com/software/BeautifulSoup/
   5 | 
   6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a
   7 | tree representation. It provides methods and Pythonic idioms that make
   8 | it easy to navigate, search, and modify the tree.
   9 | 
  10 | A well-formed XML/HTML document yields a well-formed data
  11 | structure. An ill-formed XML/HTML document yields a correspondingly
  12 | ill-formed data structure. If your document is only locally
  13 | well-formed, you can use this library to find and process the
  14 | well-formed part of it.
  15 | 
  16 | Beautiful Soup works with Python 2.2 and up. It has no external
  17 | dependencies, but you'll have more success at converting data to UTF-8
  18 | if you also install these three packages:
  19 | 
  20 | * chardet, for auto-detecting character encodings
  21 |   http://chardet.feedparser.org/
  22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported
  23 |   by stock Python.
  24 |   http://cjkpython.i18n.org/
  25 | 
  26 | Beautiful Soup defines classes for two main parsing strategies:
  27 | 
  28 |  * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
  29 |    language that kind of looks like XML.
  30 | 
  31 |  * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
  32 |    or invalid. This class has web browser-like heuristics for
  33 |    obtaining a sensible parse tree in the face of common HTML errors.
  34 | 
  35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
  36 | the encoding of an HTML or XML document, and converting it to
  37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
  38 | 
  39 | For more than you ever wanted to know about Beautiful Soup, see the
  40 | documentation:
  41 | http://www.crummy.com/software/BeautifulSoup/documentation.html
  42 | 
  43 | Here, have some legalese:
  44 | 
  45 | Copyright (c) 2004-2007, Leonard Richardson
  46 | 
  47 | All rights reserved.
  48 | 
  49 | Redistribution and use in source and binary forms, with or without
  50 | modification, are permitted provided that the following conditions are
  51 | met:
  52 | 
  53 |   * Redistributions of source code must retain the above copyright
  54 |     notice, this list of conditions and the following disclaimer.
  55 | 
  56 |   * Redistributions in binary form must reproduce the above
  57 |     copyright notice, this list of conditions and the following
  58 |     disclaimer in the documentation and/or other materials provided
  59 |     with the distribution.
  60 | 
  61 |   * Neither the name of the the Beautiful Soup Consortium and All
  62 |     Night Kosher Bakery nor the names of its contributors may be
  63 |     used to endorse or promote products derived from this software
  64 |     without specific prior written permission.
  65 | 
  66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
  77 | 
  78 | """
  79 | from __future__ import generators
  80 | 
  81 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
  82 | __version__ = "3.0.6"
  83 | __copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
  84 | __license__ = "New-style BSD"
  85 | 
  86 | from sgmllib import SGMLParser, SGMLParseError
  87 | import codecs
  88 | import types
  89 | import re
  90 | import sgmllib
  91 | try:
  92 |   from htmlentitydefs import name2codepoint
  93 | except ImportError:
  94 |   name2codepoint = {}
  95 | 
  96 | #This hack makes Beautiful Soup able to parse XML with namespaces
  97 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
  98 | 
  99 | DEFAULT_OUTPUT_ENCODING = "utf-8"
 100 | 
 101 | # First, the classes that represent markup elements.
 102 | 
 103 | class PageElement:
 104 |     """Contains the navigational information for some part of the page
 105 |     (either a tag or a piece of text)"""
 106 | 
 107 |     def setup(self, parent=None, previous=None):
 108 |         """Sets up the initial relations between this element and
 109 |         other elements."""
 110 |         self.parent = parent
 111 |         self.previous = previous
 112 |         self.next = None
 113 |         self.previousSibling = None
 114 |         self.nextSibling = None
 115 |         if self.parent and self.parent.contents:
 116 |             self.previousSibling = self.parent.contents[-1]
 117 |             self.previousSibling.nextSibling = self
 118 | 
 119 |     def replaceWith(self, replaceWith):
 120 |         oldParent = self.parent
 121 |         myIndex = self.parent.contents.index(self)
 122 |         if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
 123 |             # We're replacing this element with one of its siblings.
 124 |             index = self.parent.contents.index(replaceWith)
 125 |             if index and index < myIndex:
 126 |                 # Furthermore, it comes before this element. That
 127 |                 # means that when we extract it, the index of this
 128 |                 # element will change.
 129 |                 myIndex = myIndex - 1
 130 |         self.extract()
 131 |         oldParent.insert(myIndex, replaceWith)
 132 | 
 133 |     def extract(self):
 134 |         """Destructively rips this element out of the tree."""
 135 |         if self.parent:
 136 |             try:
 137 |                 self.parent.contents.remove(self)
 138 |             except ValueError:
 139 |                 pass
 140 | 
 141 |         #Find the two elements that would be next to each other if
 142 |         #this element (and any children) hadn't been parsed. Connect
 143 |         #the two.
 144 |         lastChild = self._lastRecursiveChild()
 145 |         nextElement = lastChild.next
 146 | 
 147 |         if self.previous:
 148 |             self.previous.next = nextElement
 149 |         if nextElement:
 150 |             nextElement.previous = self.previous
 151 |         self.previous = None
 152 |         lastChild.next = None
 153 | 
 154 |         self.parent = None
 155 |         if self.previousSibling:
 156 |             self.previousSibling.nextSibling = self.nextSibling
 157 |         if self.nextSibling:
 158 |             self.nextSibling.previousSibling = self.previousSibling
 159 |         self.previousSibling = self.nextSibling = None
 160 |         return self
 161 | 
 162 |     def _lastRecursiveChild(self):
 163 |         "Finds the last element beneath this object to be parsed."
 164 |         lastChild = self
 165 |         while hasattr(lastChild, 'contents') and lastChild.contents:
 166 |             lastChild = lastChild.contents[-1]
 167 |         return lastChild
 168 | 
 169 |     def insert(self, position, newChild):
 170 |         if (isinstance(newChild, basestring)
 171 |             or isinstance(newChild, unicode)) \
 172 |             and not isinstance(newChild, NavigableString):
 173 |             newChild = NavigableString(newChild)
 174 | 
 175 |         position =  min(position, len(self.contents))
 176 |         if hasattr(newChild, 'parent') and newChild.parent != None:
 177 |             # We're 'inserting' an element that's already one
 178 |             # of this object's children.
 179 |             if newChild.parent == self:
 180 |                 index = self.find(newChild)
 181 |                 if index and index < position:
 182 |                     # Furthermore we're moving it further down the
 183 |                     # list of this object's children. That means that
 184 |                     # when we extract this element, our target index
 185 |                     # will jump down one.
 186 |                     position = position - 1
 187 |             newChild.extract()
 188 | 
 189 |         newChild.parent = self
 190 |         previousChild = None
 191 |         if position == 0:
 192 |             newChild.previousSibling = None
 193 |             newChild.previous = self
 194 |         else:
 195 |             previousChild = self.contents[position-1]
 196 |             newChild.previousSibling = previousChild
 197 |             newChild.previousSibling.nextSibling = newChild
 198 |             newChild.previous = previousChild._lastRecursiveChild()
 199 |         if newChild.previous:
 200 |             newChild.previous.next = newChild
 201 | 
 202 |         newChildsLastElement = newChild._lastRecursiveChild()
 203 | 
 204 |         if position >= len(self.contents):
 205 |             newChild.nextSibling = None
 206 | 
 207 |             parent = self
 208 |             parentsNextSibling = None
 209 |             while not parentsNextSibling:
 210 |                 parentsNextSibling = parent.nextSibling
 211 |                 parent = parent.parent
 212 |                 if not parent: # This is the last element in the document.
 213 |                     break
 214 |             if parentsNextSibling:
 215 |                 newChildsLastElement.next = parentsNextSibling
 216 |             else:
 217 |                 newChildsLastElement.next = None
 218 |         else:
 219 |             nextChild = self.contents[position]
 220 |             newChild.nextSibling = nextChild
 221 |             if newChild.nextSibling:
 222 |                 newChild.nextSibling.previousSibling = newChild
 223 |             newChildsLastElement.next = nextChild
 224 | 
 225 |         if newChildsLastElement.next:
 226 |             newChildsLastElement.next.previous = newChildsLastElement
 227 |         self.contents.insert(position, newChild)
 228 | 
 229 |     def append(self, tag):
 230 |         """Appends the given tag to the contents of this tag."""
 231 |         self.insert(len(self.contents), tag)
 232 | 
 233 |     def findNext(self, name=None, attrs={}, text=None, **kwargs):
 234 |         """Returns the first item that matches the given criteria and
 235 |         appears after this Tag in the document."""
 236 |         return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
 237 | 
 238 |     def findAllNext(self, name=None, attrs={}, text=None, limit=None,
 239 |                     **kwargs):
 240 |         """Returns all items that match the given criteria and appear
 241 |         after this Tag in the document."""
 242 |         return self._findAll(name, attrs, text, limit, self.nextGenerator,
 243 |                              **kwargs)
 244 | 
 245 |     def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
 246 |         """Returns the closest sibling to this Tag that matches the
 247 |         given criteria and appears after this Tag in the document."""
 248 |         return self._findOne(self.findNextSiblings, name, attrs, text,
 249 |                              **kwargs)
 250 | 
 251 |     def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
 252 |                          **kwargs):
 253 |         """Returns the siblings of this Tag that match the given
 254 |         criteria and appear after this Tag in the document."""
 255 |         return self._findAll(name, attrs, text, limit,
 256 |                              self.nextSiblingGenerator, **kwargs)
 257 |     fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
 258 | 
 259 |     def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
 260 |         """Returns the first item that matches the given criteria and
 261 |         appears before this Tag in the document."""
 262 |         return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
 263 | 
 264 |     def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
 265 |                         **kwargs):
 266 |         """Returns all items that match the given criteria and appear
 267 |         before this Tag in the document."""
 268 |         return self._findAll(name, attrs, text, limit, self.previousGenerator,
 269 |                            **kwargs)
 270 |     fetchPrevious = findAllPrevious # Compatibility with pre-3.x
 271 | 
 272 |     def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
 273 |         """Returns the closest sibling to this Tag that matches the
 274 |         given criteria and appears before this Tag in the document."""
 275 |         return self._findOne(self.findPreviousSiblings, name, attrs, text,
 276 |                              **kwargs)
 277 | 
 278 |     def findPreviousSiblings(self, name=None, attrs={}, text=None,
 279 |                              limit=None, **kwargs):
 280 |         """Returns the siblings of this Tag that match the given
 281 |         criteria and appear before this Tag in the document."""
 282 |         return self._findAll(name, attrs, text, limit,
 283 |                              self.previousSiblingGenerator, **kwargs)
 284 |     fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
 285 | 
 286 |     def findParent(self, name=None, attrs={}, **kwargs):
 287 |         """Returns the closest parent of this Tag that matches the given
 288 |         criteria."""
 289 |         # NOTE: We can't use _findOne because findParents takes a different
 290 |         # set of arguments.
 291 |         r = None
 292 |         l = self.findParents(name, attrs, 1)
 293 |         if l:
 294 |             r = l[0]
 295 |         return r
 296 | 
 297 |     def findParents(self, name=None, attrs={}, limit=None, **kwargs):
 298 |         """Returns the parents of this Tag that match the given
 299 |         criteria."""
 300 | 
 301 |         return self._findAll(name, attrs, None, limit, self.parentGenerator,
 302 |                              **kwargs)
 303 |     fetchParents = findParents # Compatibility with pre-3.x
 304 | 
 305 |     #These methods do the real heavy lifting.
 306 | 
 307 |     def _findOne(self, method, name, attrs, text, **kwargs):
 308 |         r = None
 309 |         l = method(name, attrs, text, 1, **kwargs)
 310 |         if l:
 311 |             r = l[0]
 312 |         return r
 313 | 
 314 |     def _findAll(self, name, attrs, text, limit, generator, **kwargs):
 315 |         "Iterates over a generator looking for things that match."
 316 | 
 317 |         if isinstance(name, SoupStrainer):
 318 |             strainer = name
 319 |         else:
 320 |             # Build a SoupStrainer
 321 |             strainer = SoupStrainer(name, attrs, text, **kwargs)
 322 |         results = ResultSet(strainer)
 323 |         g = generator()
 324 |         while True:
 325 |             try:
 326 |                 i = g.next()
 327 |             except StopIteration:
 328 |                 break
 329 |             if i:
 330 |                 found = strainer.search(i)
 331 |                 if found:
 332 |                     results.append(found)
 333 |                     if limit and len(results) >= limit:
 334 |                         break
 335 |         return results
 336 | 
 337 |     #These Generators can be used to navigate starting from both
 338 |     #NavigableStrings and Tags.
 339 |     def nextGenerator(self):
 340 |         i = self
 341 |         while i:
 342 |             i = i.next
 343 |             yield i
 344 | 
 345 |     def nextSiblingGenerator(self):
 346 |         i = self
 347 |         while i:
 348 |             i = i.nextSibling
 349 |             yield i
 350 | 
 351 |     def previousGenerator(self):
 352 |         i = self
 353 |         while i:
 354 |             i = i.previous
 355 |             yield i
 356 | 
 357 |     def previousSiblingGenerator(self):
 358 |         i = self
 359 |         while i:
 360 |             i = i.previousSibling
 361 |             yield i
 362 | 
 363 |     def parentGenerator(self):
 364 |         i = self
 365 |         while i:
 366 |             i = i.parent
 367 |             yield i
 368 | 
 369 |     # Utility methods
 370 |     def substituteEncoding(self, str, encoding=None):
 371 |         encoding = encoding or "utf-8"
 372 |         return str.replace("%SOUP-ENCODING%", encoding)
 373 | 
 374 |     def toEncoding(self, s, encoding=None):
 375 |         """Encodes an object to a string in some encoding, or to Unicode.
 376 |         ."""
 377 |         if isinstance(s, unicode):
 378 |             if encoding:
 379 |                 s = s.encode(encoding)
 380 |         elif isinstance(s, str):
 381 |             if encoding:
 382 |                 s = s.encode(encoding)
 383 |             else:
 384 |                 s = unicode(s)
 385 |         else:
 386 |             if encoding:
 387 |                 s  = self.toEncoding(str(s), encoding)
 388 |             else:
 389 |                 s = unicode(s)
 390 |         return s
 391 | 
 392 | class NavigableString(unicode, PageElement):
 393 | 
 394 |     def __getnewargs__(self):
 395 |         return (NavigableString.__str__(self),)
 396 | 
 397 |     def __getattr__(self, attr):
 398 |         """text.string gives you text. This is for backwards
 399 |         compatibility for Navigable*String, but for CData* it lets you
 400 |         get the string without the CData wrapper."""
 401 |         if attr == 'string':
 402 |             return self
 403 |         else:
 404 |             raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
 405 | 
 406 |     def __unicode__(self):
 407 |         return str(self).decode(DEFAULT_OUTPUT_ENCODING)
 408 | 
 409 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 410 |         if encoding:
 411 |             return self.encode(encoding)
 412 |         else:
 413 |             return self
 414 | 
 415 | class CData(NavigableString):
 416 | 
 417 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 418 |         return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
 419 | 
 420 | class ProcessingInstruction(NavigableString):
 421 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 422 |         output = self
 423 |         if "%SOUP-ENCODING%" in output:
 424 |             output = self.substituteEncoding(output, encoding)
 425 |         return "<?%s?>" % self.toEncoding(output, encoding)
 426 | 
 427 | class Comment(NavigableString):
 428 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 429 |         return "<!--%s-->" % NavigableString.__str__(self, encoding)
 430 | 
 431 | class Declaration(NavigableString):
 432 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 433 |         return "<!%s>" % NavigableString.__str__(self, encoding)
 434 | 
 435 | class Tag(PageElement):
 436 | 
 437 |     """Represents a found HTML tag with its attributes and contents."""
 438 | 
 439 |     def _invert(h):
 440 |         "Cheap function to invert a hash."
 441 |         i = {}
 442 |         for k,v in h.items():
 443 |             i[v] = k
 444 |         return i
 445 | 
 446 |     XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
 447 |                                       "quot" : '"',
 448 |                                       "amp" : "&",
 449 |                                       "lt" : "<",
 450 |                                       "gt" : ">" }
 451 | 
 452 |     XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
 453 | 
 454 |     def _convertEntities(self, match):
 455 |         """Used in a call to re.sub to replace HTML, XML, and numeric
 456 |         entities with the appropriate Unicode characters. If HTML
 457 |         entities are being converted, any unrecognized entities are
 458 |         escaped."""
 459 |         x = match.group(1)
 460 |         if self.convertHTMLEntities and x in name2codepoint:
 461 |             return unichr(name2codepoint[x])
 462 |         elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
 463 |             if self.convertXMLEntities:
 464 |                 return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
 465 |             else:
 466 |                 return u'&%s;' % x
 467 |         elif len(x) > 0 and x[0] == '#':
 468 |             # Handle numeric entities
 469 |             if len(x) > 1 and x[1] == 'x':
 470 |                 return unichr(int(x[2:], 16))
 471 |             else:
 472 |                 return unichr(int(x[1:]))
 473 | 
 474 |         elif self.escapeUnrecognizedEntities:
 475 |             return u'&amp;%s;' % x
 476 |         else:
 477 |             return u'&%s;' % x
 478 | 
 479 |     def __init__(self, parser, name, attrs=None, parent=None,
 480 |                  previous=None):
 481 |         "Basic constructor."
 482 | 
 483 |         # We don't actually store the parser object: that lets extracted
 484 |         # chunks be garbage-collected
 485 |         self.parserClass = parser.__class__
 486 |         self.isSelfClosing = parser.isSelfClosingTag(name)
 487 |         self.name = name
 488 |         if attrs == None:
 489 |             attrs = []
 490 |         self.attrs = attrs
 491 |         self.contents = []
 492 |         self.setup(parent, previous)
 493 |         self.hidden = False
 494 |         self.containsSubstitutions = False
 495 |         self.convertHTMLEntities = parser.convertHTMLEntities
 496 |         self.convertXMLEntities = parser.convertXMLEntities
 497 |         self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
 498 | 
 499 |         # Convert any HTML, XML, or numeric entities in the attribute values.
 500 |         convert = lambda(k, val): (k,
 501 |                                    re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
 502 |                                           self._convertEntities,
 503 |                                           val))
 504 |         self.attrs = map(convert, self.attrs)
 505 | 
 506 |     def get(self, key, default=None):
 507 |         """Returns the value of the 'key' attribute for the tag, or
 508 |         the value given for 'default' if it doesn't have that
 509 |         attribute."""
 510 |         return self._getAttrMap().get(key, default)
 511 | 
 512 |     def has_key(self, key):
 513 |         return self._getAttrMap().has_key(key)
 514 | 
 515 |     def __getitem__(self, key):
 516 |         """tag[key] returns the value of the 'key' attribute for the tag,
 517 |         and throws an exception if it's not there."""
 518 |         return self._getAttrMap()[key]
 519 | 
 520 |     def __iter__(self):
 521 |         "Iterating over a tag iterates over its contents."
 522 |         return iter(self.contents)
 523 | 
 524 |     def __len__(self):
 525 |         "The length of a tag is the length of its list of contents."
 526 |         return len(self.contents)
 527 | 
 528 |     def __contains__(self, x):
 529 |         return x in self.contents
 530 | 
 531 |     def __nonzero__(self):
 532 |         "A tag is non-None even if it has no contents."
 533 |         return True
 534 | 
 535 |     def __setitem__(self, key, value):
 536 |         """Setting tag[key] sets the value of the 'key' attribute for the
 537 |         tag."""
 538 |         self._getAttrMap()
 539 |         self.attrMap[key] = value
 540 |         found = False
 541 |         for i in range(0, len(self.attrs)):
 542 |             if self.attrs[i][0] == key:
 543 |                 self.attrs[i] = (key, value)
 544 |                 found = True
 545 |         if not found:
 546 |             self.attrs.append((key, value))
 547 |         self._getAttrMap()[key] = value
 548 | 
 549 |     def __delitem__(self, key):
 550 |         "Deleting tag[key] deletes all 'key' attributes for the tag."
 551 |         for item in self.attrs:
 552 |             if item[0] == key:
 553 |                 self.attrs.remove(item)
 554 |                 #We don't break because bad HTML can define the same
 555 |                 #attribute multiple times.
 556 |             self._getAttrMap()
 557 |             if self.attrMap.has_key(key):
 558 |                 del self.attrMap[key]
 559 | 
 560 |     def __call__(self, *args, **kwargs):
 561 |         """Calling a tag like a function is the same as calling its
 562 |         findAll() method. Eg. tag('a') returns a list of all the A tags
 563 |         found within this tag."""
 564 |         return apply(self.findAll, args, kwargs)
 565 | 
 566 |     def __getattr__(self, tag):
 567 |         #print "Getattr %s.%s" % (self.__class__, tag)
 568 |         if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
 569 |             return self.find(tag[:-3])
 570 |         elif tag.find('__') != 0:
 571 |             return self.find(tag)
 572 |         raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
 573 | 
 574 |     def __eq__(self, other):
 575 |         """Returns true iff this tag has the same name, the same attributes,
 576 |         and the same contents (recursively) as the given tag.
 577 | 
 578 |         NOTE: right now this will return false if two tags have the
 579 |         same attributes in a different order. Should this be fixed?"""
 580 |         if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
 581 |             return False
 582 |         for i in range(0, len(self.contents)):
 583 |             if self.contents[i] != other.contents[i]:
 584 |                 return False
 585 |         return True
 586 | 
 587 |     def __ne__(self, other):
 588 |         """Returns true iff this tag is not identical to the other tag,
 589 |         as defined in __eq__."""
 590 |         return not self == other
 591 | 
 592 |     def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
 593 |         """Renders this tag as a string."""
 594 |         return self.__str__(encoding)
 595 | 
 596 |     def __unicode__(self):
 597 |         return self.__str__(None)
 598 | 
 599 |     BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
 600 |                                            + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
 601 |                                            + ")")
 602 | 
 603 |     def _sub_entity(self, x):
 604 |         """Used with a regular expression to substitute the
 605 |         appropriate XML entity for an XML special character."""
 606 |         return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
 607 | 
 608 |     def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
 609 |                 prettyPrint=False, indentLevel=0):
 610 |         """Returns a string or Unicode representation of this tag and
 611 |         its contents. To get Unicode, pass None for encoding.
 612 | 
 613 |         NOTE: since Python's HTML parser consumes whitespace, this
 614 |         method is not certain to reproduce the whitespace present in
 615 |         the original string."""
 616 | 
 617 |         encodedName = self.toEncoding(self.name, encoding)
 618 | 
 619 |         attrs = []
 620 |         if self.attrs:
 621 |             for key, val in self.attrs:
 622 |                 fmt = '%s="%s"'
 623 |                 if isString(val):
 624 |                     if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
 625 |                         val = self.substituteEncoding(val, encoding)
 626 | 
 627 |                     # The attribute value either:
 628 |                     #
 629 |                     # * Contains no embedded double quotes or single quotes.
 630 |                     #   No problem: we enclose it in double quotes.
 631 |                     # * Contains embedded single quotes. No problem:
 632 |                     #   double quotes work here too.
 633 |                     # * Contains embedded double quotes. No problem:
 634 |                     #   we enclose it in single quotes.
 635 |                     # * Embeds both single _and_ double quotes. This
 636 |                     #   can't happen naturally, but it can happen if
 637 |                     #   you modify an attribute value after parsing
 638 |                     #   the document. Now we have a bit of a
 639 |                     #   problem. We solve it by enclosing the
 640 |                     #   attribute in single quotes, and escaping any
 641 |                     #   embedded single quotes to XML entities.
 642 |                     if '"' in val:
 643 |                         fmt = "%s='%s'"
 644 |                         if "'" in val:
 645 |                             # TODO: replace with apos when
 646 |                             # appropriate.
 647 |                             val = val.replace("'", "&squot;")
 648 | 
 649 |                     # Now we're okay w/r/t quotes. But the attribute
 650 |                     # value might also contain angle brackets, or
 651 |                     # ampersands that aren't part of entities. We need
 652 |                     # to escape those to XML entities too.
 653 |                     val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
 654 | 
 655 |                 attrs.append(fmt % (self.toEncoding(key, encoding),
 656 |                                     self.toEncoding(val, encoding)))
 657 |         close = ''
 658 |         closeTag = ''
 659 |         if self.isSelfClosing:
 660 |             close = ' /'
 661 |         else:
 662 |             closeTag = '</%s>' % encodedName
 663 | 
 664 |         indentTag, indentContents = 0, 0
 665 |         if prettyPrint:
 666 |             indentTag = indentLevel
 667 |             space = (' ' * (indentTag-1))
 668 |             indentContents = indentTag + 1
 669 |         contents = self.renderContents(encoding, prettyPrint, indentContents)
 670 |         if self.hidden:
 671 |             s = contents
 672 |         else:
 673 |             s = []
 674 |             attributeString = ''
 675 |             if attrs:
 676 |                 attributeString = ' ' + ' '.join(attrs)
 677 |             if prettyPrint:
 678 |                 s.append(space)
 679 |             s.append('<%s%s%s>' % (encodedName, attributeString, close))
 680 |             if prettyPrint:
 681 |                 s.append("\n")
 682 |             s.append(contents)
 683 |             if prettyPrint and contents and contents[-1] != "\n":
 684 |                 s.append("\n")
 685 |             if prettyPrint and closeTag:
 686 |                 s.append(space)
 687 |             s.append(closeTag)
 688 |             if prettyPrint and closeTag and self.nextSibling:
 689 |                 s.append("\n")
 690 |             s = ''.join(s)
 691 |         return s
 692 | 
 693 |     def decompose(self):
 694 |         """Recursively destroys the contents of this tree."""
 695 |         contents = [i for i in self.contents]
 696 |         for i in contents:
 697 |             if isinstance(i, Tag):
 698 |                 i.decompose()
 699 |             else:
 700 |                 i.extract()
 701 |         self.extract()
 702 | 
 703 |     def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
 704 |         return self.__str__(encoding, True)
 705 | 
 706 |     def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
 707 |                        prettyPrint=False, indentLevel=0):
 708 |         """Renders the contents of this tag as a string in the given
 709 |         encoding. If encoding is None, returns a Unicode string.."""
 710 |         s=[]
 711 |         for c in self:
 712 |             text = None
 713 |             if isinstance(c, NavigableString):
 714 |                 text = c.__str__(encoding)
 715 |             elif isinstance(c, Tag):
 716 |                 s.append(c.__str__(encoding, prettyPrint, indentLevel))
 717 |             if text and prettyPrint:
 718 |                 text = text.strip()
 719 |             if text:
 720 |                 if prettyPrint:
 721 |                     s.append(" " * (indentLevel-1))
 722 |                 s.append(text)
 723 |                 if prettyPrint:
 724 |                     s.append("\n")
 725 |         return ''.join(s)
 726 | 
 727 |     #Soup methods
 728 | 
 729 |     def find(self, name=None, attrs={}, recursive=True, text=None,
 730 |              **kwargs):
 731 |         """Return only the first child of this Tag matching the given
 732 |         criteria."""
 733 |         r = None
 734 |         l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
 735 |         if l:
 736 |             r = l[0]
 737 |         return r
 738 |     findChild = find
 739 | 
 740 |     def findAll(self, name=None, attrs={}, recursive=True, text=None,
 741 |                 limit=None, **kwargs):
 742 |         """Extracts a list of Tag objects that match the given
 743 |         criteria.  You can specify the name of the Tag and any
 744 |         attributes you want the Tag to have.
 745 | 
 746 |         The value of a key-value pair in the 'attrs' map can be a
 747 |         string, a list of strings, a regular expression object, or a
 748 |         callable that takes a string and returns whether or not the
 749 |         string matches for some custom definition of 'matches'. The
 750 |         same is true of the tag name."""
 751 |         generator = self.recursiveChildGenerator
 752 |         if not recursive:
 753 |             generator = self.childGenerator
 754 |         return self._findAll(name, attrs, text, limit, generator, **kwargs)
 755 |     findChildren = findAll
 756 | 
 757 |     # Pre-3.x compatibility methods
 758 |     first = find
 759 |     fetch = findAll
 760 | 
 761 |     def fetchText(self, text=None, recursive=True, limit=None):
 762 |         return self.findAll(text=text, recursive=recursive, limit=limit)
 763 | 
 764 |     def firstText(self, text=None, recursive=True):
 765 |         return self.find(text=text, recursive=recursive)
 766 | 
 767 |     #Private methods
 768 | 
 769 |     def _getAttrMap(self):
 770 |         """Initializes a map representation of this tag's attributes,
 771 |         if not already initialized."""
 772 |         if not getattr(self, 'attrMap'):
 773 |             self.attrMap = {}
 774 |             for (key, value) in self.attrs:
 775 |                 self.attrMap[key] = value
 776 |         return self.attrMap
 777 | 
 778 |     #Generator methods
 779 |     def childGenerator(self):
 780 |         for i in range(0, len(self.contents)):
 781 |             yield self.contents[i]
 782 |         raise StopIteration
 783 | 
 784 |     def recursiveChildGenerator(self):
 785 |         stack = [(self, 0)]
 786 |         while stack:
 787 |             tag, start = stack.pop()
 788 |             if isinstance(tag, Tag):
 789 |                 for i in range(start, len(tag.contents)):
 790 |                     a = tag.contents[i]
 791 |                     yield a
 792 |                     if isinstance(a, Tag) and tag.contents:
 793 |                         if i < len(tag.contents) - 1:
 794 |                             stack.append((tag, i+1))
 795 |                         stack.append((a, 0))
 796 |                         break
 797 |         raise StopIteration
 798 | 
 799 | # Next, a couple classes to represent queries and their results.
 800 | class SoupStrainer:
 801 |     """Encapsulates a number of ways of matching a markup element (tag or
 802 |     text)."""
 803 | 
 804 |     def __init__(self, name=None, attrs={}, text=None, **kwargs):
 805 |         self.name = name
 806 |         if isString(attrs):
 807 |             kwargs['class'] = attrs
 808 |             attrs = None
 809 |         if kwargs:
 810 |             if attrs:
 811 |                 attrs = attrs.copy()
 812 |                 attrs.update(kwargs)
 813 |             else:
 814 |                 attrs = kwargs
 815 |         self.attrs = attrs
 816 |         self.text = text
 817 | 
 818 |     def __str__(self):
 819 |         if self.text:
 820 |             return self.text
 821 |         else:
 822 |             return "%s|%s" % (self.name, self.attrs)
 823 | 
 824 |     def searchTag(self, markupName=None, markupAttrs={}):
 825 |         found = None
 826 |         markup = None
 827 |         if isinstance(markupName, Tag):
 828 |             markup = markupName
 829 |             markupAttrs = markup
 830 |         callFunctionWithTagData = callable(self.name) \
 831 |                                 and not isinstance(markupName, Tag)
 832 | 
 833 |         if (not self.name) \
 834 |                or callFunctionWithTagData \
 835 |                or (markup and self._matches(markup, self.name)) \
 836 |                or (not markup and self._matches(markupName, self.name)):
 837 |             if callFunctionWithTagData:
 838 |                 match = self.name(markupName, markupAttrs)
 839 |             else:
 840 |                 match = True
 841 |                 markupAttrMap = None
 842 |                 for attr, matchAgainst in self.attrs.items():
 843 |                     if not markupAttrMap:
 844 |                          if hasattr(markupAttrs, 'get'):
 845 |                             markupAttrMap = markupAttrs
 846 |                          else:
 847 |                             markupAttrMap = {}
 848 |                             for k,v in markupAttrs:
 849 |                                 markupAttrMap[k] = v
 850 |                     attrValue = markupAttrMap.get(attr)
 851 |                     if not self._matches(attrValue, matchAgainst):
 852 |                         match = False
 853 |                         break
 854 |             if match:
 855 |                 if markup:
 856 |                     found = markup
 857 |                 else:
 858 |                     found = markupName
 859 |         return found
 860 | 
 861 |     def search(self, markup):
 862 |         #print 'looking for %s in %s' % (self, markup)
 863 |         found = None
 864 |         # If given a list of items, scan it for a text element that
 865 |         # matches.
 866 |         if isList(markup) and not isinstance(markup, Tag):
 867 |             for element in markup:
 868 |                 if isinstance(element, NavigableString) \
 869 |                        and self.search(element):
 870 |                     found = element
 871 |                     break
 872 |         # If it's a Tag, make sure its name or attributes match.
 873 |         # Don't bother with Tags if we're searching for text.
 874 |         elif isinstance(markup, Tag):
 875 |             if not self.text:
 876 |                 found = self.searchTag(markup)
 877 |         # If it's text, make sure the text matches.
 878 |         elif isinstance(markup, NavigableString) or \
 879 |                  isString(markup):
 880 |             if self._matches(markup, self.text):
 881 |                 found = markup
 882 |         else:
 883 |             raise Exception, "I don't know how to match against a %s" \
 884 |                   % markup.__class__
 885 |         return found
 886 | 
 887 |     def _matches(self, markup, matchAgainst):
 888 |         #print "Matching %s against %s" % (markup, matchAgainst)
 889 |         result = False
 890 |         if matchAgainst == True and type(matchAgainst) == types.BooleanType:
 891 |             result = markup != None
 892 |         elif callable(matchAgainst):
 893 |             result = matchAgainst(markup)
 894 |         else:
 895 |             #Custom match methods take the tag as an argument, but all
 896 |             #other ways of matching match the tag name as a string.
 897 |             if isinstance(markup, Tag):
 898 |                 markup = markup.name
 899 |             if markup and not isString(markup):
 900 |                 markup = unicode(markup)
 901 |             #Now we know that chunk is either a string, or None.
 902 |             if hasattr(matchAgainst, 'match'):
 903 |                 # It's a regexp object.
 904 |                 result = markup and matchAgainst.search(markup)
 905 |             elif isList(matchAgainst):
 906 |                 result = markup in matchAgainst
 907 |             elif hasattr(matchAgainst, 'items'):
 908 |                 result = markup.has_key(matchAgainst)
 909 |             elif matchAgainst and isString(markup):
 910 |                 if isinstance(markup, unicode):
 911 |                     matchAgainst = unicode(matchAgainst)
 912 |                 else:
 913 |                     matchAgainst = str(matchAgainst)
 914 | 
 915 |             if not result:
 916 |                 result = matchAgainst == markup
 917 |         return result
 918 | 
 919 | class ResultSet(list):
 920 |     """A ResultSet is just a list that keeps track of the SoupStrainer
 921 |     that created it."""
 922 |     def __init__(self, source):
 923 |         list.__init__([])
 924 |         self.source = source
 925 | 
 926 | # Now, some helper functions.
 927 | 
 928 | def isList(l):
 929 |     """Convenience method that works with all 2.x versions of Python
 930 |     to determine whether or not something is listlike."""
 931 |     return hasattr(l, '__iter__') \
 932 |            or (type(l) in (types.ListType, types.TupleType))
 933 | 
 934 | def isString(s):
 935 |     """Convenience method that works with all 2.x versions of Python
 936 |     to determine whether or not something is stringlike."""
 937 |     try:
 938 |         return isinstance(s, unicode) or isinstance(s, basestring)
 939 |     except NameError:
 940 |         return isinstance(s, str)
 941 | 
 942 | def buildTagMap(default, *args):
 943 |     """Turns a list of maps, lists, or scalars into a single map.
 944 |     Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
 945 |     NESTING_RESET_TAGS maps out of lists and partial maps."""
 946 |     built = {}
 947 |     for portion in args:
 948 |         if hasattr(portion, 'items'):
 949 |             #It's a map. Merge it.
 950 |             for k,v in portion.items():
 951 |                 built[k] = v
 952 |         elif isList(portion):
 953 |             #It's a list. Map each item to the default.
 954 |             for k in portion:
 955 |                 built[k] = default
 956 |         else:
 957 |             #It's a scalar. Map it to the default.
 958 |             built[portion] = default
 959 |     return built
 960 | 
 961 | # Now, the parser classes.
 962 | 
 963 | class BeautifulStoneSoup(Tag, SGMLParser):
 964 | 
 965 |     """This class contains the basic parser and search code. It defines
 966 |     a parser that knows nothing about tag behavior except for the
 967 |     following:
 968 | 
 969 |       You can't close a tag without closing all the tags it encloses.
 970 |       That is, "<foo><bar></foo>" actually means
 971 |       "<foo><bar></bar></foo>".
 972 | 
 973 |     [Another possible explanation is "<foo><bar /></foo>", but since
 974 |     this class defines no SELF_CLOSING_TAGS, it will never use that
 975 |     explanation.]
 976 | 
 977 |     This class is useful for parsing XML or made-up markup languages,
 978 |     or when BeautifulSoup makes an assumption counter to what you were
 979 |     expecting."""
 980 | 
 981 |     SELF_CLOSING_TAGS = {}
 982 |     NESTABLE_TAGS = {}
 983 |     RESET_NESTING_TAGS = {}
 984 |     QUOTE_TAGS = {}
 985 | 
 986 |     MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
 987 |                        lambda x: x.group(1) + ' />'),
 988 |                       (re.compile('<!\s+([^<>]*)>'),
 989 |                        lambda x: '<!' + x.group(1) + '>')
 990 |                       ]
 991 | 
 992 |     ROOT_TAG_NAME = u'[document]'
 993 | 
 994 |     HTML_ENTITIES = "html"
 995 |     XML_ENTITIES = "xml"
 996 |     XHTML_ENTITIES = "xhtml"
 997 |     # TODO: This only exists for backwards-compatibility
 998 |     ALL_ENTITIES = XHTML_ENTITIES
 999 | 
1000 |     # Used when determining whether a text node is all whitespace and
1001 |     # can be replaced with a single space. A text node that contains
1002 |     # fancy Unicode spaces (usually non-breaking) should be left
1003 |     # alone.
1004 |     STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
1005 | 
1006 |     def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
1007 |                  markupMassage=True, smartQuotesTo=XML_ENTITIES,
1008 |                  convertEntities=None, selfClosingTags=None):
1009 |         """The Soup object is initialized as the 'root tag', and the
1010 |         provided markup (which can be a string or a file-like object)
1011 |         is fed into the underlying parser.
1012 | 
1013 |         sgmllib will process most bad HTML, and the BeautifulSoup
1014 |         class has some tricks for dealing with some HTML that kills
1015 |         sgmllib, but Beautiful Soup can nonetheless choke or lose data
1016 |         if your data uses self-closing tags or declarations
1017 |         incorrectly.
1018 | 
1019 |         By default, Beautiful Soup uses regexes to sanitize input,
1020 |         avoiding the vast majority of these problems. If the problems
1021 |         don't apply to you, pass in False for markupMassage, and
1022 |         you'll get better performance.
1023 | 
1024 |         The default parser massage techniques fix the two most common
1025 |         instances of invalid HTML that choke sgmllib:
1026 | 
1027 |          <br/> (No space between name of closing tag and tag close)
1028 |          <! --Comment--> (Extraneous whitespace in declaration)
1029 | 
1030 |         You can pass in a custom list of (RE object, replace method)
1031 |         tuples to get Beautiful Soup to scrub your input the way you
1032 |         want."""
1033 | 
1034 |         self.parseOnlyThese = parseOnlyThese
1035 |         self.fromEncoding = fromEncoding
1036 |         self.smartQuotesTo = smartQuotesTo
1037 |         self.convertEntities = convertEntities
1038 |         # Set the rules for how we'll deal with the entities we
1039 |         # encounter
1040 |         if self.convertEntities:
1041 |             # It doesn't make sense to convert encoded characters to
1042 |             # entities even while you're converting entities to Unicode.
1043 |             # Just convert it all to Unicode.
1044 |             self.smartQuotesTo = None
1045 |             if convertEntities == self.HTML_ENTITIES:
1046 |                 self.convertXMLEntities = False
1047 |                 self.convertHTMLEntities = True
1048 |                 self.escapeUnrecognizedEntities = True
1049 |             elif convertEntities == self.XHTML_ENTITIES:
1050 |                 self.convertXMLEntities = True
1051 |                 self.convertHTMLEntities = True
1052 |                 self.escapeUnrecognizedEntities = False
1053 |             elif convertEntities == self.XML_ENTITIES:
1054 |                 self.convertXMLEntities = True
1055 |                 self.convertHTMLEntities = False
1056 |                 self.escapeUnrecognizedEntities = False
1057 |         else:
1058 |             self.convertXMLEntities = False
1059 |             self.convertHTMLEntities = False
1060 |             self.escapeUnrecognizedEntities = False
1061 | 
1062 |         self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1063 |         SGMLParser.__init__(self)
1064 | 
1065 |         if hasattr(markup, 'read'):        # It's a file-type object.
1066 |             markup = markup.read()
1067 |         self.markup = markup
1068 |         self.markupMassage = markupMassage
1069 |         try:
1070 |             self._feed()
1071 |         except StopParsing:
1072 |             pass
1073 |         self.markup = None                 # The markup can now be GCed
1074 | 
1075 |     def convert_charref(self, name):
1076 |         """This method fixes a bug in Python's SGMLParser."""
1077 |         try:
1078 |             n = int(name)
1079 |         except ValueError:
1080 |             return
1081 |         if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1082 |             return
1083 |         return self.convert_codepoint(n)
1084 | 
1085 |     def _feed(self, inDocumentEncoding=None):
1086 |         # Convert the document to Unicode.
1087 |         markup = self.markup
1088 |         if isinstance(markup, unicode):
1089 |             if not hasattr(self, 'originalEncoding'):
1090 |                 self.originalEncoding = None
1091 |         else:
1092 |             dammit = UnicodeDammit\
1093 |                      (markup, [self.fromEncoding, inDocumentEncoding],
1094 |                       smartQuotesTo=self.smartQuotesTo)
1095 |             markup = dammit.unicode
1096 |             self.originalEncoding = dammit.originalEncoding
1097 |         if markup:
1098 |             if self.markupMassage:
1099 |                 if not isList(self.markupMassage):
1100 |                     self.markupMassage = self.MARKUP_MASSAGE
1101 |                 for fix, m in self.markupMassage:
1102 |                     markup = fix.sub(m, markup)
1103 |                 # TODO: We get rid of markupMassage so that the
1104 |                 # soup object can be deepcopied later on. Some
1105 |                 # Python installations can't copy regexes. If anyone
1106 |                 # was relying on the existence of markupMassage, this
1107 |                 # might cause problems.
1108 |                 del(self.markupMassage)
1109 |         self.reset()
1110 | 
1111 |         SGMLParser.feed(self, markup)
1112 |         # Close out any unfinished strings and close all the open tags.
1113 |         self.endData()
1114 |         while self.currentTag.name != self.ROOT_TAG_NAME:
1115 |             self.popTag()
1116 | 
1117 |     def __getattr__(self, methodName):
1118 |         """This method routes method call requests to either the SGMLParser
1119 |         superclass or the Tag superclass, depending on the method name."""
1120 |         #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1121 | 
1122 |         if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1123 |                or methodName.find('do_') == 0:
1124 |             return SGMLParser.__getattr__(self, methodName)
1125 |         elif methodName.find('__') != 0:
1126 |             return Tag.__getattr__(self, methodName)
1127 |         else:
1128 |             raise AttributeError
1129 | 
1130 |     def isSelfClosingTag(self, name):
1131 |         """Returns true iff the given string is the name of a
1132 |         self-closing tag according to this parser."""
1133 |         return self.SELF_CLOSING_TAGS.has_key(name) \
1134 |                or self.instanceSelfClosingTags.has_key(name)
1135 | 
1136 |     def reset(self):
1137 |         Tag.__init__(self, self, self.ROOT_TAG_NAME)
1138 |         self.hidden = 1
1139 |         SGMLParser.reset(self)
1140 |         self.currentData = []
1141 |         self.currentTag = None
1142 |         self.tagStack = []
1143 |         self.quoteStack = []
1144 |         self.pushTag(self)
1145 | 
1146 |     def popTag(self):
1147 |         tag = self.tagStack.pop()
1148 |         # Tags with just one string-owning child get the child as a
1149 |         # 'string' property, so that soup.tag.string is shorthand for
1150 |         # soup.tag.contents[0]
1151 |         if len(self.currentTag.contents) == 1 and \
1152 |            isinstance(self.currentTag.contents[0], NavigableString):
1153 |             self.currentTag.string = self.currentTag.contents[0]
1154 | 
1155 |         #print "Pop", tag.name
1156 |         if self.tagStack:
1157 |             self.currentTag = self.tagStack[-1]
1158 |         return self.currentTag
1159 | 
1160 |     def pushTag(self, tag):
1161 |         #print "Push", tag.name
1162 |         if self.currentTag:
1163 |             self.currentTag.contents.append(tag)
1164 |         self.tagStack.append(tag)
1165 |         self.currentTag = self.tagStack[-1]
1166 | 
1167 |     def endData(self, containerClass=NavigableString):
1168 |         if self.currentData:
1169 |             currentData = ''.join(self.currentData)
1170 |             if not currentData.translate(self.STRIP_ASCII_SPACES):
1171 |                 if '\n' in currentData:
1172 |                     currentData = '\n'
1173 |                 else:
1174 |                     currentData = ' '
1175 |             self.currentData = []
1176 |             if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1177 |                    (not self.parseOnlyThese.text or \
1178 |                     not self.parseOnlyThese.search(currentData)):
1179 |                 return
1180 |             o = containerClass(currentData)
1181 |             o.setup(self.currentTag, self.previous)
1182 |             if self.previous:
1183 |                 self.previous.next = o
1184 |             self.previous = o
1185 |             self.currentTag.contents.append(o)
1186 | 
1187 | 
1188 |     def _popToTag(self, name, inclusivePop=True):
1189 |         """Pops the tag stack up to and including the most recent
1190 |         instance of the given tag. If inclusivePop is false, pops the tag
1191 |         stack up to but *not* including the most recent instqance of
1192 |         the given tag."""
1193 |         #print "Popping to %s" % name
1194 |         if name == self.ROOT_TAG_NAME:
1195 |             return
1196 | 
1197 |         numPops = 0
1198 |         mostRecentTag = None
1199 |         for i in range(len(self.tagStack)-1, 0, -1):
1200 |             if name == self.tagStack[i].name:
1201 |                 numPops = len(self.tagStack)-i
1202 |                 break
1203 |         if not inclusivePop:
1204 |             numPops = numPops - 1
1205 | 
1206 |         for i in range(0, numPops):
1207 |             mostRecentTag = self.popTag()
1208 |         return mostRecentTag
1209 | 
1210 |     def _smartPop(self, name):
1211 | 
1212 |         """We need to pop up to the previous tag of this type, unless
1213 |         one of this tag's nesting reset triggers comes between this
1214 |         tag and the previous tag of this type, OR unless this tag is a
1215 |         generic nesting trigger and another generic nesting trigger
1216 |         comes between this tag and the previous tag of this type.
1217 | 
1218 |         Examples:
1219 |          <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1220 |          <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1221 |          <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1222 | 
1223 |          <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1224 |          <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1225 |          <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1226 |         """
1227 | 
1228 |         nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1229 |         isNestable = nestingResetTriggers != None
1230 |         isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1231 |         popTo = None
1232 |         inclusive = True
1233 |         for i in range(len(self.tagStack)-1, 0, -1):
1234 |             p = self.tagStack[i]
1235 |             if (not p or p.name == name) and not isNestable:
1236 |                 #Non-nestable tags get popped to the top or to their
1237 |                 #last occurance.
1238 |                 popTo = name
1239 |                 break
1240 |             if (nestingResetTriggers != None
1241 |                 and p.name in nestingResetTriggers) \
1242 |                 or (nestingResetTriggers == None and isResetNesting
1243 |                     and self.RESET_NESTING_TAGS.has_key(p.name)):
1244 | 
1245 |                 #If we encounter one of the nesting reset triggers
1246 |                 #peculiar to this tag, or we encounter another tag
1247 |                 #that causes nesting to reset, pop up to but not
1248 |                 #including that tag.
1249 |                 popTo = p.name
1250 |                 inclusive = False
1251 |                 break
1252 |             p = p.parent
1253 |         if popTo:
1254 |             self._popToTag(popTo, inclusive)
1255 | 
1256 |     def unknown_starttag(self, name, attrs, selfClosing=0):
1257 |         #print "Start tag %s: %s" % (name, attrs)
1258 |         if self.quoteStack:
1259 |             #This is not a real tag.
1260 |             #print "<%s> is not real!" % name
1261 |             attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1262 |             self.handle_data('<%s%s>' % (name, attrs))
1263 |             return
1264 |         self.endData()
1265 | 
1266 |         if not self.isSelfClosingTag(name) and not selfClosing:
1267 |             self._smartPop(name)
1268 | 
1269 |         if self.parseOnlyThese and len(self.tagStack) <= 1 \
1270 |                and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1271 |             return
1272 | 
1273 |         tag = Tag(self, name, attrs, self.currentTag, self.previous)
1274 |         if self.previous:
1275 |             self.previous.next = tag
1276 |         self.previous = tag
1277 |         self.pushTag(tag)
1278 |         if selfClosing or self.isSelfClosingTag(name):
1279 |             self.popTag()
1280 |         if name in self.QUOTE_TAGS:
1281 |             #print "Beginning quote (%s)" % name
1282 |             self.quoteStack.append(name)
1283 |             self.literal = 1
1284 |         return tag
1285 | 
1286 |     def unknown_endtag(self, name):
1287 |         #print "End tag %s" % name
1288 |         if self.quoteStack and self.quoteStack[-1] != name:
1289 |             #This is not a real end tag.
1290 |             #print "</%s> is not real!" % name
1291 |             self.handle_data('</%s>' % name)
1292 |             return
1293 |         self.endData()
1294 |         self._popToTag(name)
1295 |         if self.quoteStack and self.quoteStack[-1] == name:
1296 |             self.quoteStack.pop()
1297 |             self.literal = (len(self.quoteStack) > 0)
1298 | 
1299 |     def handle_data(self, data):
1300 |         self.currentData.append(data)
1301 | 
1302 |     def _toStringSubclass(self, text, subclass):
1303 |         """Adds a certain piece of text to the tree as a NavigableString
1304 |         subclass."""
1305 |         self.endData()
1306 |         self.handle_data(text)
1307 |         self.endData(subclass)
1308 | 
1309 |     def handle_pi(self, text):
1310 |         """Handle a processing instruction as a ProcessingInstruction
1311 |         object, possibly one with a %SOUP-ENCODING% slot into which an
1312 |         encoding will be plugged later."""
1313 |         if text[:3] == "xml":
1314 |             text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1315 |         self._toStringSubclass(text, ProcessingInstruction)
1316 | 
1317 |     def handle_comment(self, text):
1318 |         "Handle comments as Comment objects."
1319 |         self._toStringSubclass(text, Comment)
1320 | 
1321 |     def handle_charref(self, ref):
1322 |         "Handle character references as data."
1323 |         if self.convertEntities:
1324 |             data = unichr(int(ref))
1325 |         else:
1326 |             data = '&#%s;' % ref
1327 |         self.handle_data(data)
1328 | 
1329 |     def handle_entityref(self, ref):
1330 |         """Handle entity references as data, possibly converting known
1331 |         HTML and/or XML entity references to the corresponding Unicode
1332 |         characters."""
1333 |         data = None
1334 |         if self.convertHTMLEntities:
1335 |             try:
1336 |                 data = unichr(name2codepoint[ref])
1337 |             except KeyError:
1338 |                 pass
1339 | 
1340 |         if not data and self.convertXMLEntities:
1341 |                 data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1342 | 
1343 |         if not data and self.convertHTMLEntities and \
1344 |             not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1345 |                 # TODO: We've got a problem here. We're told this is
1346 |                 # an entity reference, but it's not an XML entity
1347 |                 # reference or an HTML entity reference. Nonetheless,
1348 |                 # the logical thing to do is to pass it through as an
1349 |                 # unrecognized entity reference.
1350 |                 #
1351 |                 # Except: when the input is "&carol;" this function
1352 |                 # will be called with input "carol". When the input is
1353 |                 # "AT&T", this function will be called with input
1354 |                 # "T". We have no way of knowing whether a semicolon
1355 |                 # was present originally, so we don't know whether
1356 |                 # this is an unknown entity or just a misplaced
1357 |                 # ampersand.
1358 |                 #
1359 |                 # The more common case is a misplaced ampersand, so I
1360 |                 # escape the ampersand and omit the trailing semicolon.
1361 |                 data = "&amp;%s" % ref
1362 |         if not data:
1363 |             # This case is different from the one above, because we
1364 |             # haven't already gone through a supposedly comprehensive
1365 |             # mapping of entities to Unicode characters. We might not
1366 |             # have gone through any mapping at all. So the chances are
1367 |             # very high that this is a real entity, and not a
1368 |             # misplaced ampersand.
1369 |             data = "&%s;" % ref
1370 |         self.handle_data(data)
1371 | 
1372 |     def handle_decl(self, data):
1373 |         "Handle DOCTYPEs and the like as Declaration objects."
1374 |         self._toStringSubclass(data, Declaration)
1375 | 
1376 |     def parse_declaration(self, i):
1377 |         """Treat a bogus SGML declaration as raw data. Treat a CDATA
1378 |         declaration as a CData object."""
1379 |         j = None
1380 |         if self.rawdata[i:i+9] == '<![CDATA[':
1381 |              k = self.rawdata.find(']]>', i)
1382 |              if k == -1:
1383 |                  k = len(self.rawdata)
1384 |              data = self.rawdata[i+9:k]
1385 |              j = k+3
1386 |              self._toStringSubclass(data, CData)
1387 |         else:
1388 |             try:
1389 |                 j = SGMLParser.parse_declaration(self, i)
1390 |             except SGMLParseError:
1391 |                 toHandle = self.rawdata[i:]
1392 |                 self.handle_data(toHandle)
1393 |                 j = i + len(toHandle)
1394 |         return j
1395 | 
1396 | class BeautifulSoup(BeautifulStoneSoup):
1397 | 
1398 |     """This parser knows the following facts about HTML:
1399 | 
1400 |     * Some tags have no closing tag and should be interpreted as being
1401 |       closed as soon as they are encountered.
1402 | 
1403 |     * The text inside some tags (ie. 'script') may contain tags which
1404 |       are not really part of the document and which should be parsed
1405 |       as text, not tags. If you want to parse the text as tags, you can
1406 |       always fetch it and parse it explicitly.
1407 | 
1408 |     * Tag nesting rules:
1409 | 
1410 |       Most tags can't be nested at all. For instance, the occurance of
1411 |       a <p> tag should implicitly close the previous <p> tag.
1412 | 
1413 |        <p>Para1<p>Para2
1414 |         should be transformed into:
1415 |        <p>Para1</p><p>Para2
1416 | 
1417 |       Some tags can be nested arbitrarily. For instance, the occurance
1418 |       of a <blockquote> tag should _not_ implicitly close the previous
1419 |       <blockquote> tag.
1420 | 
1421 |        Alice said: <blockquote>Bob said: <blockquote>Blah
1422 |         should NOT be transformed into:
1423 |        Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1424 | 
1425 |       Some tags can be nested, but the nesting is reset by the
1426 |       interposition of other tags. For instance, a <tr> tag should
1427 |       implicitly close the previous <tr> tag within the same <table>,
1428 |       but not close a <tr> tag in another table.
1429 | 
1430 |        <table><tr>Blah<tr>Blah
1431 |         should be transformed into:
1432 |        <table><tr>Blah</tr><tr>Blah
1433 |         but,
1434 |        <tr>Blah<table><tr>Blah
1435 |         should NOT be transformed into
1436 |        <tr>Blah<table></tr><tr>Blah
1437 | 
1438 |     Differing assumptions about tag nesting rules are a major source
1439 |     of problems with the BeautifulSoup class. If BeautifulSoup is not
1440 |     treating as nestable a tag your page author treats as nestable,
1441 |     try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1442 |     BeautifulStoneSoup before writing your own subclass."""
1443 | 
1444 |     def __init__(self, *args, **kwargs):
1445 |         if not kwargs.has_key('smartQuotesTo'):
1446 |             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1447 |         BeautifulStoneSoup.__init__(self, *args, **kwargs)
1448 | 
1449 |     SELF_CLOSING_TAGS = buildTagMap(None,
1450 |                                     ['br' , 'hr', 'input', 'img', 'meta',
1451 |                                     'spacer', 'link', 'frame', 'base'])
1452 | 
1453 |     QUOTE_TAGS = {'script' : None, 'textarea' : None}
1454 | 
1455 |     #According to the HTML standard, each of these inline tags can
1456 |     #contain another tag of the same type. Furthermore, it's common
1457 |     #to actually use these tags this way.
1458 |     NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1459 |                             'center']
1460 | 
1461 |     #According to the HTML standard, these block tags can contain
1462 |     #another tag of the same type. Furthermore, it's common
1463 |     #to actually use these tags this way.
1464 |     NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1465 | 
1466 |     #Lists can contain other lists, but there are restrictions.
1467 |     NESTABLE_LIST_TAGS = { 'ol' : [],
1468 |                            'ul' : [],
1469 |                            'li' : ['ul', 'ol'],
1470 |                            'dl' : [],
1471 |                            'dd' : ['dl'],
1472 |                            'dt' : ['dl'] }
1473 | 
1474 |     #Tables can contain other tables, but there are restrictions.
1475 |     NESTABLE_TABLE_TAGS = {'table' : [],
1476 |                            'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1477 |                            'td' : ['tr'],
1478 |                            'th' : ['tr'],
1479 |                            'thead' : ['table'],
1480 |                            'tbody' : ['table'],
1481 |                            'tfoot' : ['table'],
1482 |                            }
1483 | 
1484 |     NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1485 | 
1486 |     #If one of these tags is encountered, all tags up to the next tag of
1487 |     #this type are popped.
1488 |     RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1489 |                                      NON_NESTABLE_BLOCK_TAGS,
1490 |                                      NESTABLE_LIST_TAGS,
1491 |                                      NESTABLE_TABLE_TAGS)
1492 | 
1493 |     NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1494 |                                 NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1495 | 
1496 |     # Used to detect the charset in a META tag; see start_meta
1497 |     CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1498 | 
1499 |     def start_meta(self, attrs):
1500 |         """Beautiful Soup can detect a charset included in a META tag,
1501 |         try to convert the document to that charset, and re-parse the
1502 |         document from the beginning."""
1503 |         httpEquiv = None
1504 |         contentType = None
1505 |         contentTypeIndex = None
1506 |         tagNeedsEncodingSubstitution = False
1507 | 
1508 |         for i in range(0, len(attrs)):
1509 |             key, value = attrs[i]
1510 |             key = key.lower()
1511 |             if key == 'http-equiv':
1512 |                 httpEquiv = value
1513 |             elif key == 'content':
1514 |                 contentType = value
1515 |                 contentTypeIndex = i
1516 | 
1517 |         if httpEquiv and contentType: # It's an interesting meta tag.
1518 |             match = self.CHARSET_RE.search(contentType)
1519 |             if match:
1520 |                 if getattr(self, 'declaredHTMLEncoding') or \
1521 |                        (self.originalEncoding == self.fromEncoding):
1522 |                     # This is our second pass through the document, or
1523 |                     # else an encoding was specified explicitly and it
1524 |                     # worked. Rewrite the meta tag.
1525 |                     newAttr = self.CHARSET_RE.sub\
1526 |                               (lambda(match):match.group(1) +
1527 |                                "%SOUP-ENCODING%", contentType)
1528 |                     attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1529 |                                                newAttr)
1530 |                     tagNeedsEncodingSubstitution = True
1531 |                 else:
1532 |                     # This is our first pass through the document.
1533 |                     # Go through it again with the new information.
1534 |                     newCharset = match.group(3)
1535 |                     if newCharset and newCharset != self.originalEncoding:
1536 |                         self.declaredHTMLEncoding = newCharset
1537 |                         self._feed(self.declaredHTMLEncoding)
1538 |                         raise StopParsing
1539 |         tag = self.unknown_starttag("meta", attrs)
1540 |         if tag and tagNeedsEncodingSubstitution:
1541 |             tag.containsSubstitutions = True
1542 | 
1543 | class StopParsing(Exception):
1544 |     pass
1545 | 
1546 | class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1547 | 
1548 |     """The BeautifulSoup class is oriented towards skipping over
1549 |     common HTML errors like unclosed tags. However, sometimes it makes
1550 |     errors of its own. For instance, consider this fragment:
1551 | 
1552 |      <b>Foo<b>Bar</b></b>
1553 | 
1554 |     This is perfectly valid (if bizarre) HTML. However, the
1555 |     BeautifulSoup class will implicitly close the first b tag when it
1556 |     encounters the second 'b'. It will think the author wrote
1557 |     "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1558 |     there's no real-world reason to bold something that's already
1559 |     bold. When it encounters '</b></b>' it will close two more 'b'
1560 |     tags, for a grand total of three tags closed instead of two. This
1561 |     can throw off the rest of your document structure. The same is
1562 |     true of a number of other tags, listed below.
1563 | 
1564 |     It's much more common for someone to forget to close a 'b' tag
1565 |     than to actually use nested 'b' tags, and the BeautifulSoup class
1566 |     handles the common case. This class handles the not-co-common
1567 |     case: where you can't believe someone wrote what they did, but
1568 |     it's valid HTML and BeautifulSoup screwed up by assuming it
1569 |     wouldn't be."""
1570 | 
1571 |     I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1572 |      ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1573 |       'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1574 |       'big']
1575 | 
1576 |     I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1577 | 
1578 |     NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1579 |                                 I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1580 |                                 I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1581 | 
1582 | class MinimalSoup(BeautifulSoup):
1583 |     """The MinimalSoup class is for parsing HTML that contains
1584 |     pathologically bad markup. It makes no assumptions about tag
1585 |     nesting, but it does know which tags are self-closing, that
1586 |     <script> tags contain Javascript and should not be parsed, that
1587 |     META tags may contain encoding information, and so on.
1588 | 
1589 |     This also makes it better for subclassing than BeautifulStoneSoup
1590 |     or BeautifulSoup."""
1591 | 
1592 |     RESET_NESTING_TAGS = buildTagMap('noscript')
1593 |     NESTABLE_TAGS = {}
1594 | 
1595 | class BeautifulSOAP(BeautifulStoneSoup):
1596 |     """This class will push a tag with only a single string child into
1597 |     the tag's parent as an attribute. The attribute's name is the tag
1598 |     name, and the value is the string child. An example should give
1599 |     the flavor of the change:
1600 | 
1601 |     <foo><bar>baz</bar></foo>
1602 |      =>
1603 |     <foo bar="baz"><bar>baz</bar></foo>
1604 | 
1605 |     You can then access fooTag['bar'] instead of fooTag.barTag.string.
1606 | 
1607 |     This is, of course, useful for scraping structures that tend to
1608 |     use subelements instead of attributes, such as SOAP messages. Note
1609 |     that it modifies its input, so don't print the modified version
1610 |     out.
1611 | 
1612 |     I'm not sure how many people really want to use this class; let me
1613 |     know if you do. Mainly I like the name."""
1614 | 
1615 |     def popTag(self):
1616 |         if len(self.tagStack) > 1:
1617 |             tag = self.tagStack[-1]
1618 |             parent = self.tagStack[-2]
1619 |             parent._getAttrMap()
1620 |             if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1621 |                 isinstance(tag.contents[0], NavigableString) and
1622 |                 not parent.attrMap.has_key(tag.name)):
1623 |                 parent[tag.name] = tag.contents[0]
1624 |         BeautifulStoneSoup.popTag(self)
1625 | 
1626 | #Enterprise class names! It has come to our attention that some people
1627 | #think the names of the Beautiful Soup parser classes are too silly
1628 | #and "unprofessional" for use in enterprise screen-scraping. We feel
1629 | #your pain! For such-minded folk, the Beautiful Soup Consortium And
1630 | #All-Night Kosher Bakery recommends renaming this file to
1631 | #"RobustParser.py" (or, in cases of extreme enterprisiness,
1632 | #"RobustParserBeanInterface.class") and using the following
1633 | #enterprise-friendly class aliases:
1634 | class RobustXMLParser(BeautifulStoneSoup):
1635 |     pass
1636 | class RobustHTMLParser(BeautifulSoup):
1637 |     pass
1638 | class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1639 |     pass
1640 | class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1641 |     pass
1642 | class SimplifyingSOAPParser(BeautifulSOAP):
1643 |     pass
1644 | 
1645 | ######################################################
1646 | #
1647 | # Bonus library: Unicode, Dammit
1648 | #
1649 | # This class forces XML data into a standard format (usually to UTF-8
1650 | # or Unicode).  It is heavily based on code from Mark Pilgrim's
1651 | # Universal Feed Parser. It does not rewrite the XML or HTML to
1652 | # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1653 | # (XML) and BeautifulSoup.start_meta (HTML).
1654 | 
1655 | # Autodetects character encodings.
1656 | # Download from http://chardet.feedparser.org/
1657 | try:
1658 |     import chardet
1659 | #    import chardet.constants
1660 | #    chardet.constants._debug = 1
1661 | except ImportError:
1662 |     chardet = None
1663 | 
1664 | # cjkcodecs and iconv_codec make Python know about more character encodings.
1665 | # Both are available from http://cjkpython.i18n.org/
1666 | # They're built in if you use Python 2.4.
1667 | try:
1668 |     import cjkcodecs.aliases
1669 | except ImportError:
1670 |     pass
1671 | try:
1672 |     import iconv_codec
1673 | except ImportError:
1674 |     pass
1675 | 
1676 | class UnicodeDammit:
1677 |     """A class for detecting the encoding of a *ML document and
1678 |     converting it to a Unicode string. If the source encoding is
1679 |     windows-1252, can replace MS smart quotes with their HTML or XML
1680 |     equivalents."""
1681 | 
1682 |     # This dictionary maps commonly seen values for "charset" in HTML
1683 |     # meta tags to the corresponding Python codec names. It only covers
1684 |     # values that aren't in Python's aliases and can't be determined
1685 |     # by the heuristics in find_codec.
1686 |     CHARSET_ALIASES = { "macintosh" : "mac-roman",
1687 |                         "x-sjis" : "shift-jis" }
1688 | 
1689 |     def __init__(self, markup, overrideEncodings=[],
1690 |                  smartQuotesTo='xml'):
1691 |         self.markup, documentEncoding, sniffedEncoding = \
1692 |                      self._detectEncoding(markup)
1693 |         self.smartQuotesTo = smartQuotesTo
1694 |         self.triedEncodings = []
1695 |         if markup == '' or isinstance(markup, unicode):
1696 |             self.originalEncoding = None
1697 |             self.unicode = unicode(markup)
1698 |             return
1699 | 
1700 |         u = None
1701 |         for proposedEncoding in overrideEncodings:
1702 |             u = self._convertFrom(proposedEncoding)
1703 |             if u: break
1704 |         if not u:
1705 |             for proposedEncoding in (documentEncoding, sniffedEncoding):
1706 |                 u = self._convertFrom(proposedEncoding)
1707 |                 if u: break
1708 | 
1709 |         # If no luck and we have auto-detection library, try that:
1710 |         if not u and chardet and not isinstance(self.markup, unicode):
1711 |             u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1712 | 
1713 |         # As a last resort, try utf-8 and windows-1252:
1714 |         if not u:
1715 |             for proposed_encoding in ("utf-8", "windows-1252"):
1716 |                 u = self._convertFrom(proposed_encoding)
1717 |                 if u: break
1718 |         self.unicode = u
1719 |         if not u: self.originalEncoding = None
1720 | 
1721 |     def _subMSChar(self, orig):
1722 |         """Changes a MS smart quote character to an XML or HTML
1723 |         entity."""
1724 |         sub = self.MS_CHARS.get(orig)
1725 |         if type(sub) == types.TupleType:
1726 |             if self.smartQuotesTo == 'xml':
1727 |                 sub = '&#x%s;' % sub[1]
1728 |             else:
1729 |                 sub = '&%s;' % sub[0]
1730 |         return sub
1731 | 
1732 |     def _convertFrom(self, proposed):
1733 |         proposed = self.find_codec(proposed)
1734 |         if not proposed or proposed in self.triedEncodings:
1735 |             return None
1736 |         self.triedEncodings.append(proposed)
1737 |         markup = self.markup
1738 | 
1739 |         # Convert smart quotes to HTML if coming from an encoding
1740 |         # that might have them.
1741 |         if self.smartQuotesTo and proposed.lower() in("windows-1252",
1742 |                                                       "iso-8859-1",
1743 |                                                       "iso-8859-2"):
1744 |             markup = re.compile("([\x80-\x9f])").sub \
1745 |                      (lambda(x): self._subMSChar(x.group(1)),
1746 |                       markup)
1747 | 
1748 |         try:
1749 |             # print "Trying to convert document to %s" % proposed
1750 |             u = self._toUnicode(markup, proposed)
1751 |             self.markup = u
1752 |             self.originalEncoding = proposed
1753 |         except Exception, e:
1754 |             # print "That didn't work!"
1755 |             # print e
1756 |             return None
1757 |         #print "Correct encoding: %s" % proposed
1758 |         return self.markup
1759 | 
1760 |     def _toUnicode(self, data, encoding):
1761 |         '''Given a string and its encoding, decodes the string into Unicode.
1762 |         %encoding is a string recognized by encodings.aliases'''
1763 | 
1764 |         # strip Byte Order Mark (if present)
1765 |         if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1766 |                and (data[2:4] != '\x00\x00'):
1767 |             encoding = 'utf-16be'
1768 |             data = data[2:]
1769 |         elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1770 |                  and (data[2:4] != '\x00\x00'):
1771 |             encoding = 'utf-16le'
1772 |             data = data[2:]
1773 |         elif data[:3] == '\xef\xbb\xbf':
1774 |             encoding = 'utf-8'
1775 |             data = data[3:]
1776 |         elif data[:4] == '\x00\x00\xfe\xff':
1777 |             encoding = 'utf-32be'
1778 |             data = data[4:]
1779 |         elif data[:4] == '\xff\xfe\x00\x00':
1780 |             encoding = 'utf-32le'
1781 |             data = data[4:]
1782 |         newdata = unicode(data, encoding)
1783 |         return newdata
1784 | 
1785 |     def _detectEncoding(self, xml_data):
1786 |         """Given a document, tries to detect its XML encoding."""
1787 |         xml_encoding = sniffed_xml_encoding = None
1788 |         try:
1789 |             if xml_data[:4] == '\x4c\x6f\xa7\x94':
1790 |                 # EBCDIC
1791 |                 xml_data = self._ebcdic_to_ascii(xml_data)
1792 |             elif xml_data[:4] == '\x00\x3c\x00\x3f':
1793 |                 # UTF-16BE
1794 |                 sniffed_xml_encoding = 'utf-16be'
1795 |                 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1796 |             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1797 |                      and (xml_data[2:4] != '\x00\x00'):
1798 |                 # UTF-16BE with BOM
1799 |                 sniffed_xml_encoding = 'utf-16be'
1800 |                 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1801 |             elif xml_data[:4] == '\x3c\x00\x3f\x00':
1802 |                 # UTF-16LE
1803 |                 sniffed_xml_encoding = 'utf-16le'
1804 |                 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1805 |             elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1806 |                      (xml_data[2:4] != '\x00\x00'):
1807 |                 # UTF-16LE with BOM
1808 |                 sniffed_xml_encoding = 'utf-16le'
1809 |                 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1810 |             elif xml_data[:4] == '\x00\x00\x00\x3c':
1811 |                 # UTF-32BE
1812 |                 sniffed_xml_encoding = 'utf-32be'
1813 |                 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1814 |             elif xml_data[:4] == '\x3c\x00\x00\x00':
1815 |                 # UTF-32LE
1816 |                 sniffed_xml_encoding = 'utf-32le'
1817 |                 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1818 |             elif xml_data[:4] == '\x00\x00\xfe\xff':
1819 |                 # UTF-32BE with BOM
1820 |                 sniffed_xml_encoding = 'utf-32be'
1821 |                 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1822 |             elif xml_data[:4] == '\xff\xfe\x00\x00':
1823 |                 # UTF-32LE with BOM
1824 |                 sniffed_xml_encoding = 'utf-32le'
1825 |                 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1826 |             elif xml_data[:3] == '\xef\xbb\xbf':
1827 |                 # UTF-8 with BOM
1828 |                 sniffed_xml_encoding = 'utf-8'
1829 |                 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1830 |             else:
1831 |                 sniffed_xml_encoding = 'ascii'
1832 |                 pass
1833 |             xml_encoding_match = re.compile \
1834 |                                  ('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1835 |                                  .match(xml_data)
1836 |         except:
1837 |             xml_encoding_match = None
1838 |         if xml_encoding_match:
1839 |             xml_encoding = xml_encoding_match.groups()[0].lower()
1840 |             if sniffed_xml_encoding and \
1841 |                (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1842 |                                  'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1843 |                                  'utf-16', 'utf-32', 'utf_16', 'utf_32',
1844 |                                  'utf16', 'u16')):
1845 |                 xml_encoding = sniffed_xml_encoding
1846 |         return xml_data, xml_encoding, sniffed_xml_encoding
1847 | 
1848 | 
1849 |     def find_codec(self, charset):
1850 |         return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1851 |                or (charset and self._codec(charset.replace("-", ""))) \
1852 |                or (charset and self._codec(charset.replace("-", "_"))) \
1853 |                or charset
1854 | 
1855 |     def _codec(self, charset):
1856 |         if not charset: return charset
1857 |         codec = None
1858 |         try:
1859 |             codecs.lookup(charset)
1860 |             codec = charset
1861 |         except (LookupError, ValueError):
1862 |             pass
1863 |         return codec
1864 | 
1865 |     EBCDIC_TO_ASCII_MAP = None
1866 |     def _ebcdic_to_ascii(self, s):
1867 |         c = self.__class__
1868 |         if not c.EBCDIC_TO_ASCII_MAP:
1869 |             emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1870 |                     16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1871 |                     128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1872 |                     144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1873 |                     32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1874 |                     38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1875 |                     45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1876 |                     186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1877 |                     195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1878 |                     201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1879 |                     206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1880 |                     211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1881 |                     225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1882 |                     73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1883 |                     82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1884 |                     90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1885 |                     250,251,252,253,254,255)
1886 |             import string
1887 |             c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1888 |             ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1889 |         return s.translate(c.EBCDIC_TO_ASCII_MAP)
1890 | 
1891 |     MS_CHARS = { '\x80' : ('euro', '20AC'),
1892 |                  '\x81' : ' ',
1893 |                  '\x82' : ('sbquo', '201A'),
1894 |                  '\x83' : ('fnof', '192'),
1895 |                  '\x84' : ('bdquo', '201E'),
1896 |                  '\x85' : ('hellip', '2026'),
1897 |                  '\x86' : ('dagger', '2020'),
1898 |                  '\x87' : ('Dagger', '2021'),
1899 |                  '\x88' : ('circ', '2C6'),
1900 |                  '\x89' : ('permil', '2030'),
1901 |                  '\x8A' : ('Scaron', '160'),
1902 |                  '\x8B' : ('lsaquo', '2039'),
1903 |                  '\x8C' : ('OElig', '152'),
1904 |                  '\x8D' : '?',
1905 |                  '\x8E' : ('#x17D', '17D'),
1906 |                  '\x8F' : '?',
1907 |                  '\x90' : '?',
1908 |                  '\x91' : ('lsquo', '2018'),
1909 |                  '\x92' : ('rsquo', '2019'),
1910 |                  '\x93' : ('ldquo', '201C'),
1911 |                  '\x94' : ('rdquo', '201D'),
1912 |                  '\x95' : ('bull', '2022'),
1913 |                  '\x96' : ('ndash', '2013'),
1914 |                  '\x97' : ('mdash', '2014'),
1915 |                  '\x98' : ('tilde', '2DC'),
1916 |                  '\x99' : ('trade', '2122'),
1917 |                  '\x9a' : ('scaron', '161'),
1918 |                  '\x9b' : ('rsaquo', '203A'),
1919 |                  '\x9c' : ('oelig', '153'),
1920 |                  '\x9d' : '?',
1921 |                  '\x9e' : ('#x17E', '17E'),
1922 |                  '\x9f' : ('Yuml', ''),}
1923 | 
1924 | #######################################################################
1925 | 
1926 | 
1927 | #By default, act as an HTML pretty-printer.
1928 | if __name__ == '__main__':
1929 |     import sys
1930 |     soup = BeautifulSoup(sys.stdin.read())
1931 |     print soup.prettify()
1932 | 


--------------------------------------------------------------------------------
/xgoogle/BeautifulSoup.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kennyledet/Google-EmailScraper/f07d48fe98f1d7a07b2ec762db68667233a4175d/xgoogle/BeautifulSoup.pyc


--------------------------------------------------------------------------------
/xgoogle/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Peteris Krumins (peter@catonmat.net)
 4 | # http://www.catonmat.net  --  good coders code, great reuse
 5 | #
 6 | # A Google Python library:
 7 | # http://www.catonmat.net/blog/python-library-for-google-search/
 8 | #
 9 | # Distributed under MIT license:
10 | #
11 | # Copyright (c) 2009 Peteris Krumins
12 | #
13 | # Permission is hereby granted, free of charge, to any person
14 | # Obtaining a copy of this software and associated documentation
15 | # Files (the "Software"), to deal in the Software without
16 | # Restriction, including without limitation the rights to use,
17 | # Copy, modify, merge, publish, distribute, sublicense, and/or sell
18 | # Copies of the Software, and to permit persons to whom the
19 | # Software is furnished to do so, subject to the following
20 | # Conditions:
21 | #
22 | # The above copyright notice and this permission notice shall be
23 | # Included in all copies or substantial portions of the Software.
24 | #
25 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
27 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
29 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
30 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
32 | # OTHER DEALINGS IN THE SOFTWARE.
33 | #
34 | 
35 | 


--------------------------------------------------------------------------------
/xgoogle/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kennyledet/Google-EmailScraper/f07d48fe98f1d7a07b2ec762db68667233a4175d/xgoogle/__init__.pyc


--------------------------------------------------------------------------------
/xgoogle/browser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | #
  3 | # Peteris Krumins (peter@catonmat.net)
  4 | # http://www.catonmat.net  --  good coders code, great reuse
  5 | #
  6 | # http://www.catonmat.net/blog/python-library-for-google-search/
  7 | #
  8 | # Code is licensed under MIT license.
  9 | #
 10 | 
 11 | import random
 12 | import socket
 13 | import urllib
 14 | import urllib2
 15 | import httplib
 16 | 
 17 | BROWSERS = (
 18 |     # Top most popular browsers in my access.log on 2009.02.12
 19 |     # tail -50000 access.log |
 20 |     #  awk -F\" '{B[$6]++} END { for (b in B) { print B[b] ": " b } }' |
 21 |     #  sort -rn |
 22 |     #  head -20
 23 |     'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
 24 |     'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.0.6) Gecko/2009011912 Firefox/3.0.6',
 25 |     'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)',
 26 |     'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
 27 |     'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6',
 28 |     'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6) Gecko/2009011913 Firefox/3.0.6 (.NET CLR 3.5.30729)',
 29 |     'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.48 Safari/525.19',
 30 |     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30; .NET CLR 3.0.04506.648)',
 31 |     'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.0.6) Gecko/2009020911 Ubuntu/8.10 (intrepid) Firefox/3.0.6',
 32 |     'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.5) Gecko/2008121621 Ubuntu/8.04 (hardy) Firefox/3.0.5',
 33 |     'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-us) AppleWebKit/525.27.1 (KHTML, like Gecko) Version/3.2.1 Safari/525.27.1',
 34 |     'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
 35 |     'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)',
 36 |     'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
 37 | )
 38 | 
 39 | TIMEOUT = 5  # socket timeout
 40 | 
 41 | class BrowserError(Exception):
 42 |     def __init__(self, url, error):
 43 |         self.url = url
 44 |         self.error = error
 45 | 
 46 | class PoolHTTPConnection(httplib.HTTPConnection):
 47 |     def connect(self):
 48 |         """Connect to the host and port specified in __init__."""
 49 |         msg = "getaddrinfo returns an empty list"
 50 |         for res in socket.getaddrinfo(self.host, self.port, 0,
 51 |                                       socket.SOCK_STREAM):
 52 |             af, socktype, proto, canonname, sa = res
 53 |             try:
 54 |                 self.sock = socket.socket(af, socktype, proto)
 55 |                 if self.debuglevel > 0:
 56 |                     print "connect: (%s, %s)" % (self.host, self.port)
 57 |                 self.sock.settimeout(TIMEOUT)
 58 |                 self.sock.connect(sa)
 59 |             except socket.error, msg:
 60 |                 if self.debuglevel > 0:
 61 |                     print 'connect fail:', (self.host, self.port)
 62 |                 if self.sock:
 63 |                     self.sock.close()
 64 |                 self.sock = None
 65 |                 continue
 66 |             break
 67 |         if not self.sock:
 68 |             raise socket.error, msg
 69 | 
 70 | class PoolHTTPHandler(urllib2.HTTPHandler):
 71 |     def http_open(self, req):
 72 |         return self.do_open(PoolHTTPConnection, req)
 73 | 
 74 | class Browser(object):
 75 |     def __init__(self, user_agent=BROWSERS[0], debug=False, use_pool=False):
 76 |         self.headers = {
 77 |             'User-Agent': user_agent,
 78 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 79 |             'Accept-Language': 'en-us,en;q=0.5'
 80 |         }
 81 |         self.debug = debug
 82 | 
 83 |     def get_page(self, url, data=None):
 84 |         handlers = [PoolHTTPHandler]
 85 |         opener = urllib2.build_opener(*handlers)
 86 |         if data: data = urllib.urlencode(data)
 87 |         request = urllib2.Request(url, data, self.headers)
 88 |         try:
 89 |             response = opener.open(request)
 90 |             return response.read()
 91 |         except (urllib2.HTTPError, urllib2.URLError), e:
 92 |             raise BrowserError(url, str(e))
 93 |         except (socket.error, socket.sslerror), msg:
 94 |             raise BrowserError(url, msg)
 95 |         except socket.timeout, e:
 96 |             raise BrowserError(url, "timeout")
 97 |         except KeyboardInterrupt:
 98 |             raise
 99 |         except:
100 |             raise BrowserError(url, "unknown error")
101 | 
102 |     def set_random_user_agent(self):
103 |         self.headers['User-Agent'] = random.choice(BROWSERS)
104 |         return self.headers['User-Agent']
105 | 
106 | 


--------------------------------------------------------------------------------
/xgoogle/browser.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kennyledet/Google-EmailScraper/f07d48fe98f1d7a07b2ec762db68667233a4175d/xgoogle/browser.pyc


--------------------------------------------------------------------------------
/xgoogle/googlesets.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | #
 3 | # Peteris Krumins (peter@catonmat.net)
 4 | # http://www.catonmat.net  --  good coders code, great reuse
 5 | #
 6 | # http://www.catonmat.net/blog/python-library-for-google-sets/
 7 | #
 8 | # Code is licensed under MIT license.
 9 | #
10 | 
11 | import re
12 | import urllib
13 | import random
14 | from htmlentitydefs import name2codepoint
15 | from BeautifulSoup import BeautifulSoup
16 | 
17 | from browser import Browser, BrowserError
18 | 
19 | class GSError(Exception):
20 |     """ Google Sets Error """
21 |     pass
22 | 
23 | class GSParseError(Exception):
24 |     """
25 |     Parse error in Google Sets results.
26 |     self.msg attribute contains explanation why parsing failed
27 |     self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse
28 |     Thrown only in debug mode
29 |     """
30 |      
31 |     def __init__(self, msg, tag):
32 |         self.msg = msg
33 |         self.tag = tag
34 | 
35 |     def __str__(self):
36 |         return self.msg
37 | 
38 |     def html(self):
39 |         return self.tag.prettify()
40 | 
41 | LARGE_SET = 1
42 | SMALL_SET = 2
43 | 
44 | class GoogleSets(object):
45 |     URL_LARGE = "http://labs.google.com/sets?hl=en&q1=%s&q2=%s&q3=%s&q4=%s&q5=%s&btn=Large+Set"
46 |     URL_SMALL = "http://labs.google.com/sets?hl=en&q1=%s&q2=%s&q3=%s&q4=%s&q5=%s&btn=Small+Set+(15+items+or+fewer)"
47 | 
48 |     def __init__(self, items, random_agent=False, debug=False):
49 |         self.items = items
50 |         self.debug = debug
51 |         self.browser = Browser(debug=debug)
52 | 
53 |         if random_agent:
54 |             self.browser.set_random_user_agent()
55 | 
56 |     def get_results(self, set_type=SMALL_SET):
57 |         page = self._get_results_page(set_type)
58 |         results = self._extract_results(page)
59 |         return results
60 | 
61 |     def _maybe_raise(self, cls, *arg):
62 |         if self.debug:
63 |             raise cls(*arg)
64 | 
65 |     def _get_results_page(self, set_type):
66 |         if set_type == LARGE_SET:
67 |             url = GoogleSets.URL_LARGE
68 |         else:
69 |             url = GoogleSets.URL_SMALL
70 | 
71 |         safe_items = [urllib.quote_plus(i) for i in self.items]
72 |         blank_items = 5 - len(safe_items)
73 |         if blank_items > 0:
74 |             safe_items += ['']*blank_items
75 | 
76 |         safe_url = url % tuple(safe_items)
77 | 
78 |         try:
79 |             page = self.browser.get_page(safe_url)
80 |         except BrowserError, e:
81 |             raise GSError, "Failed getting %s: %s" % (e.url, e.error)
82 | 
83 |         return BeautifulSoup(page)
84 | 
85 |     def _extract_results(self, soup):
86 |         a_links = soup.findAll('a', href=re.compile('/search'))
87 |         ret_res = [a.string for a in a_links]
88 |         return ret_res
89 | 
90 | 


--------------------------------------------------------------------------------
/xgoogle/search.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # encoding: utf-8
  3 | #
  4 | # Peteris Krumins (peter@catonmat.net)
  5 | # http://www.catonmat.net  --  good coders code, great reuse
  6 | #
  7 | # http://www.catonmat.net/blog/python-library-for-google-search/
  8 | #
  9 | # Code is licensed under MIT license.
 10 | #
 11 | 
 12 | import re
 13 | import urllib
 14 | from htmlentitydefs import name2codepoint
 15 | from BeautifulSoup import BeautifulSoup
 16 | 
 17 | from browser import Browser, BrowserError
 18 | 
 19 | class SearchError(Exception):
 20 |     """
 21 |     Base class for Google Search exceptions.
 22 |     """
 23 |     pass
 24 | 
 25 | class ParseError(SearchError):
 26 |     """
 27 |     Parse error in Google results.
 28 |     self.msg attribute contains explanation why parsing failed
 29 |     self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse
 30 |     Thrown only in debug mode
 31 |     """
 32 |      
 33 |     def __init__(self, msg, tag):
 34 |         self.msg = msg
 35 |         self.tag = tag
 36 | 
 37 |     def __str__(self):
 38 |         return self.msg
 39 | 
 40 |     def html(self):
 41 |         return self.tag.prettify()
 42 | 
 43 | class SearchResult:
 44 |     def __init__(self, title, url, desc):
 45 |         self.title = title
 46 |         self.url = url
 47 |         self.desc = desc
 48 | 
 49 |     def __str__(self):
 50 |         return 'Google Search Result: "%s"' % self.title
 51 | 
 52 | class GoogleSearch(object):
 53 |     SEARCH_URL_0 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&btnG=Google+Search"
 54 |     NEXT_PAGE_0 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&start=%(start)d"
 55 |     SEARCH_URL_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&btnG=Google+Search"
 56 |     NEXT_PAGE_1 = "http://www.google.%(tld)s/search?hl=%(lang)s&q=%(query)s&num=%(num)d&start=%(start)d"
 57 | 
 58 |     def __init__(self, query, random_agent=False, debug=False, lang="en", tld="com", re_search_strings=None):
 59 |         self.query = query
 60 |         self.debug = debug
 61 |         self.browser = Browser(debug=debug)
 62 |         self.results_info = None
 63 |         self.eor = False # end of results
 64 |         self._page = 0
 65 |         self._first_indexed_in_previous = None
 66 |         self._filetype = None
 67 |         self._last_search_url = None
 68 |         self._results_per_page = 10
 69 |         self._last_from = 0
 70 |         self._lang = lang
 71 |         self._tld = tld
 72 |         
 73 |         if re_search_strings:
 74 |             self._re_search_strings = re_search_strings
 75 |         elif lang == "de":
 76 |             self._re_search_strings = ("Ergebnisse", "von", u"ungefähr")
 77 |         elif lang == "es":
 78 |             self._re_search_strings = ("Resultados", "de", "aproximadamente")
 79 |         # add more localised versions here
 80 |         else:
 81 |             self._re_search_strings = ("Results", "of", "about")
 82 | 
 83 |         if random_agent:
 84 |             self.browser.set_random_user_agent()
 85 | 
 86 |     @property
 87 |     def num_results(self):
 88 |         if not self.results_info:
 89 |             page = self._get_results_page()
 90 |             self.results_info = self._extract_info(page)
 91 |             if self.results_info['total'] == 0:
 92 |                 self.eor = True
 93 |         return self.results_info['total']
 94 | 
 95 |     @property
 96 |     def last_search_url(self):
 97 |         return self._last_search_url
 98 | 
 99 |     def _get_page(self):
100 |         return self._page
101 | 
102 |     def _set_page(self, page):
103 |         self._page = page
104 | 
105 |     page = property(_get_page, _set_page)
106 | 
107 |     def _get_first_indexed_in_previous(self):
108 |         return self._first_indexed_in_previous
109 | 
110 |     def _set_first_indexed_in_previous(self, interval):
111 |         if interval == "day":
112 |             self._first_indexed_in_previous = 'd'
113 |         elif interval == "week":
114 |             self._first_indexed_in_previous = 'w'
115 |         elif interval == "month":
116 |             self._first_indexed_in_previous = 'm'
117 |         elif interval == "year":
118 |             self._first_indexed_in_previous = 'y'
119 |         else:
120 |             # a floating point value is a number of months
121 |             try:
122 |                 num = float(interval)
123 |             except ValueError:
124 |                 raise SearchError, "Wrong parameter to first_indexed_in_previous: %s" % (str(interval))
125 |             self._first_indexed_in_previous = 'm' + str(interval)
126 |     
127 |     first_indexed_in_previous = property(_get_first_indexed_in_previous, _set_first_indexed_in_previous, doc="possible values: day, week, month, year, or a float value of months")
128 |     
129 |     def _get_filetype(self):
130 |         return self._filetype
131 | 
132 |     def _set_filetype(self, filetype):
133 |         self._filetype = filetype
134 |     
135 |     filetype = property(_get_filetype, _set_filetype, doc="file extension to search for")
136 |     
137 |     def _get_results_per_page(self):
138 |         return self._results_per_page
139 | 
140 |     def _set_results_par_page(self, rpp):
141 |         self._results_per_page = rpp
142 | 
143 |     results_per_page = property(_get_results_per_page, _set_results_par_page)
144 | 
145 |     def get_results(self):
146 |         """ Gets a page of results """
147 |         if self.eor:
148 |             return []
149 |         MAX_VALUE = 1000000
150 |         page = self._get_results_page()
151 |         #search_info = self._extract_info(page)
152 |         results = self._extract_results(page)
153 |         search_info = {'from': self.results_per_page*self._page,
154 |                        'to': self.results_per_page*self._page + len(results),
155 |                        'total': MAX_VALUE}
156 |         if not self.results_info:
157 |             self.results_info = search_info
158 |             if self.num_results == 0:
159 |                 self.eor = True
160 |                 return []
161 |         if not results:
162 |             self.eor = True
163 |             return []
164 |         if self._page > 0 and search_info['from'] == self._last_from:
165 |             self.eor = True
166 |             return []
167 |         if search_info['to'] == search_info['total']:
168 |             self.eor = True
169 |         self._page += 1
170 |         self._last_from = search_info['from']
171 |         return results
172 | 
173 |     def _maybe_raise(self, cls, *arg):
174 |         if self.debug:
175 |             raise cls(*arg)
176 | 
177 |     def _get_results_page(self):
178 |         if self._page == 0:
179 |             if self._results_per_page == 10:
180 |                 url = GoogleSearch.SEARCH_URL_0
181 |             else:
182 |                 url = GoogleSearch.SEARCH_URL_1
183 |         else:
184 |             if self._results_per_page == 10:
185 |                 url = GoogleSearch.NEXT_PAGE_0
186 |             else:
187 |                 url = GoogleSearch.NEXT_PAGE_1
188 | 
189 |         safe_url = [url % { 'query': urllib.quote_plus(self.query),
190 |                            'start': self._page * self._results_per_page,
191 |                            'num': self._results_per_page,
192 |                            'tld' : self._tld,
193 |                            'lang' : self._lang }]
194 |         
195 |         # possibly extend url with optional properties
196 |         if self._first_indexed_in_previous:
197 |             safe_url.extend(["&as_qdr=", self._first_indexed_in_previous])
198 |         if self._filetype:
199 |             safe_url.extend(["&as_filetype=", self._filetype])
200 |         
201 |         safe_url = "".join(safe_url)
202 |         self._last_search_url = safe_url
203 |         
204 |         try:
205 |             page = self.browser.get_page(safe_url)
206 |         except BrowserError, e:
207 |             raise SearchError, "Failed getting %s: %s" % (e.url, e.error)
208 | 
209 |         return BeautifulSoup(page)
210 | 
211 |     def _extract_info(self, soup):
212 |         empty_info = {'from': 0, 'to': 0, 'total': 0}
213 |         div_ssb = soup.find('div', id='ssb')
214 |         if not div_ssb:
215 |             self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
216 |             return empty_info
217 |         p = div_ssb.find('p')
218 |         if not p:
219 |             self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
220 |             return empty_info
221 |         txt = ''.join(p.findAll(text=True))
222 |         txt = txt.replace(',', '')
223 |         matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
224 |         if not matches:
225 |             return empty_info
226 |         return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
227 | 
228 |     def _extract_results(self, soup):
229 |         results = soup.findAll('li','g')
230 |         ret_res = []
231 |         for result in results:
232 |             eres = self._extract_result(result)
233 |             if eres:
234 |                 ret_res.append(eres)
235 |         return ret_res
236 | 
237 |     def _extract_result(self, result):
238 |         title, url = self._extract_title_url(result)
239 |         desc = self._extract_description(result)
240 |         if not title or not url or not desc:
241 |             return None
242 |         return SearchResult(title, url, desc)
243 | 
244 |     def _extract_title_url(self, result):
245 |         #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
246 |         title_a = result.find('a')
247 |         if not title_a:
248 |             self._maybe_raise(ParseError, "Title tag in Google search result was not found", result)
249 |             return None, None
250 |         title = ''.join(title_a.findAll(text=True))
251 |         title = self._html_unescape(title)
252 |         url = title_a['href']
253 |         match = re.match(r'/url\?q=(http[^&]+)&', url)
254 |         if match:
255 |             url = urllib.unquote(match.group(1))
256 |         return title, url
257 | 
258 |     def _extract_description(self, result):
259 |         desc_div = result.find('span', 'st')
260 |         if not desc_div:
261 |             self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
262 |             return None
263 | 
264 |         desc_strs = []
265 |         def looper(tag):
266 |             if not tag: return
267 |             for t in tag:
268 |                 try:
269 |                     if t.name == 'br': break
270 |                 except AttributeError:
271 |                     pass
272 | 
273 |                 try:
274 |                     desc_strs.append(t.string)
275 |                 except AttributeError:
276 |                     desc_strs.append(t)
277 | 
278 |         looper(desc_div)
279 |         looper(desc_div.find('wbr')) # BeautifulSoup does not self-close <wbr>
280 | 
281 |         desc = ''.join(s for s in desc_strs if s)
282 |         return self._html_unescape(desc)
283 | 
284 |     def _html_unescape(self, str):
285 |         def entity_replacer(m):
286 |             entity = m.group(1)
287 |             if entity in name2codepoint:
288 |                 return unichr(name2codepoint[entity])
289 |             else:
290 |                 return m.group(0)
291 | 
292 |         def ascii_replacer(m):
293 |             cp = int(m.group(1))
294 |             if cp <= 255:
295 |                 return unichr(cp)
296 |             else:
297 |                 return m.group(0)
298 | 
299 |         s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
300 |         return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
301 | 
302 | class BlogSearch(GoogleSearch):
303 | 
304 |     def _extract_info(self, soup):
305 |         empty_info = {'from': 0, 'to': 0, 'total': 0}
306 |         td_rsb = soup.find('td', 'rsb')
307 |         if not td_rsb:
308 |             self._maybe_raise(ParseError, "Td with number of results was not found on Blogs search page", soup)
309 |             return empty_info
310 |         font = td_rsb.find('font')
311 |         if not font:
312 |             self._maybe_raise(ParseError, """<p> tag within <tr class='rsb'> was not found on Blogs search page""", soup)
313 |             return empty_info
314 |         txt = ''.join(font.findAll(text=True))
315 |         txt = txt.replace(',', '')
316 |         if self.hl == 'es':
317 |             matches = re.search(r'Resultados (\d+) - (\d+) de (?:aproximadamente )?(\d+)', txt, re.U)
318 |         elif self.hl == 'en':
319 |             matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt, re.U)
320 |         if not matches:
321 |             return empty_info
322 |         return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
323 | 
324 |     def _extract_results(self, soup):
325 |         results = soup.findAll('p', {'class': 'g'})
326 |         ret_res = []
327 |         for result in results:
328 |             eres = self._extract_result(result)
329 |             if eres:
330 |                 ret_res.append(eres)
331 |         return ret_res
332 | 
333 |     def _extract_result(self, result):
334 |         title, url = self._extract_title_url(result)
335 |         desc = self._extract_description(result)
336 |         if not title or not url or not desc:
337 |             return None
338 |         return SearchResult(title, url, desc)
339 | 
340 |     def _extract_title_url(self, result):
341 |         #title_a = result.find('a', {'class': re.compile(r'\bl\b')})
342 |         title_a = result.findNext('a')
343 |         if not title_a:
344 |             self._maybe_raise(ParseError, "Title tag in Blog search result was not found", result)
345 |             return None, None
346 |         title = ''.join(title_a.findAll(text=True))
347 |         title = self._html_unescape(title)
348 |         url = title_a['href']
349 |         match = re.match(r'/url\?q=(http[^&]+)&', url)
350 |         if match:
351 |             url = urllib.unquote(match.group(1))
352 |         return title, url
353 | 
354 |     def _extract_description(self, result):
355 |         desc_td = result.findNext('td')
356 |         if not desc_td:
357 |             self._maybe_raise(ParseError, "Description tag in Google search result was not found", result)
358 |             return None
359 | 
360 |         desc_strs = []
361 |         def looper(tag):
362 |             if not tag: return
363 |             for t in tag:
364 |                 try:
365 |                     if t.name == 'br': break
366 |                 except AttributeError:
367 |                     pass
368 | 
369 |                 try:
370 |                     desc_strs.append(t.string)
371 |                 except AttributeError:
372 |                     desc_strs.append(t)
373 | 
374 |         looper(desc_td)
375 |         looper(desc_td.find('wbr')) # BeautifulSoup does not self-close <wbr>
376 | 
377 |         desc = ''.join(s for s in desc_strs if s)
378 |         return self._html_unescape(desc)
379 |         
380 | 


--------------------------------------------------------------------------------
/xgoogle/search.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kennyledet/Google-EmailScraper/f07d48fe98f1d7a07b2ec762db68667233a4175d/xgoogle/search.pyc


--------------------------------------------------------------------------------
/xgoogle/sponsoredlinks.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | #
  3 | # Peteris Krumins (peter@catonmat.net)
  4 | # http://www.catonmat.net  --  good coders code, great reuse
  5 | #
  6 | # http://www.catonmat.net/blog/python-library-for-google-sponsored-links-search/
  7 | #
  8 | # Code is licensed under MIT license.
  9 | #
 10 | 
 11 | import re
 12 | import urllib
 13 | import random
 14 | from htmlentitydefs import name2codepoint
 15 | from BeautifulSoup import BeautifulSoup
 16 | 
 17 | from browser import Browser, BrowserError
 18 | 
 19 | #
 20 | # TODO: join GoogleSearch and SponsoredLinks classes under a single base class
 21 | #
 22 | 
 23 | class SLError(Exception):
 24 |     """ Sponsored Links Error """
 25 |     pass
 26 | 
 27 | class SLParseError(Exception):
 28 |     """
 29 |     Parse error in Google results.
 30 |     self.msg attribute contains explanation why parsing failed
 31 |     self.tag attribute contains BeautifulSoup object with the most relevant tag that failed to parse
 32 |     Thrown only in debug mode
 33 |     """
 34 |      
 35 |     def __init__(self, msg, tag):
 36 |         self.msg = msg
 37 |         self.tag = tag
 38 | 
 39 |     def __str__(self):
 40 |         return self.msg
 41 | 
 42 |     def html(self):
 43 |         return self.tag.prettify()
 44 | 
 45 | GET_ALL_SLEEP_FUNCTION = object()
 46 | 
 47 | class SponsoredLink(object):
 48 |     """ a single sponsored link """
 49 |     def __init__(self, title, url, display_url, desc):
 50 |         self.title = title
 51 |         self.url = url
 52 |         self.display_url = display_url
 53 |         self.desc = desc
 54 | 
 55 | class SponsoredLinks(object):
 56 |     SEARCH_URL_0 = "http://www.google.com/sponsoredlinks?q=%(query)s&btnG=Search+Sponsored+Links&hl=en"
 57 |     NEXT_PAGE_0 = "http://www.google.com/sponsoredlinks?q=%(query)s&sa=N&start=%(start)d&hl=en"
 58 |     SEARCH_URL_1 = "http://www.google.com/sponsoredlinks?q=%(query)s&num=%(num)d&btnG=Search+Sponsored+Links&hl=en"
 59 |     NEXT_PAGE_1 = "http://www.google.com/sponsoredlinks?q=%(query)s&num=%(num)d&sa=N&start=%(start)d&hl=en"
 60 | 
 61 |     def __init__(self, query, random_agent=False, debug=False):
 62 |         self.query = query
 63 |         self.debug = debug
 64 |         self.browser = Browser(debug=debug)
 65 |         self._page = 0
 66 |         self.eor = False
 67 |         self.results_info = None
 68 |         self._results_per_page = 10
 69 | 
 70 |         if random_agent:
 71 |             self.browser.set_random_user_agent()
 72 | 
 73 |     @property
 74 |     def num_results(self):
 75 |         if not self.results_info:
 76 |             page = self._get_results_page()
 77 |             self.results_info = self._extract_info(page)
 78 |             if self.results_info['total'] == 0:
 79 |                 self.eor = True
 80 |         return self.results_info['total']
 81 | 
 82 |     def _get_results_per_page(self):
 83 |         return self._results_per_page
 84 | 
 85 |     def _set_results_par_page(self, rpp):
 86 |         self._results_per_page = rpp
 87 | 
 88 |     results_per_page = property(_get_results_per_page, _set_results_par_page)
 89 | 
 90 |     def get_results(self):
 91 |         if self.eor:
 92 |             return []
 93 |         page = self._get_results_page()
 94 |         info = self._extract_info(page)
 95 |         if self.results_info is None:
 96 |             self.results_info = info
 97 |         if info['to'] == info['total']:
 98 |             self.eor = True
 99 |         results = self._extract_results(page)
100 |         if not results:
101 |             self.eor = True
102 |             return []
103 |         self._page += 1
104 |         return results
105 | 
106 |     def _get_all_results_sleep_fn(self):
107 |         return random.random()*5 + 1 # sleep from 1 - 6 seconds
108 | 
109 |     def get_all_results(self, sleep_function=None):
110 |         if sleep_function is GET_ALL_SLEEP_FUNCTION:
111 |             sleep_function = self._get_all_results_sleep_fn
112 |         if sleep_function is None:
113 |             sleep_function = lambda: None 
114 |         ret_results = []
115 |         while True:
116 |             res = self.get_results()
117 |             if not res:
118 |                 return ret_results
119 |             ret_results.extend(res)
120 |         return ret_results
121 | 
122 |     def _maybe_raise(self, cls, *arg):
123 |         if self.debug:
124 |             raise cls(*arg)
125 | 
126 |     def _extract_info(self, soup):
127 |         empty_info = { 'from': 0, 'to': 0, 'total': 0 }
128 |         stats_span = soup.find('span', id='stats')
129 |         if not stats_span:
130 |             return empty_info
131 |         txt = ''.join(stats_span.findAll(text=True))
132 |         txt = txt.replace(',', '').replace("&nbsp;", ' ')
133 |         matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt)
134 |         if not matches:
135 |             return empty_info
136 |         return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}
137 | 
138 |     def _get_results_page(self):
139 |         if self._page == 0:
140 |             if self._results_per_page == 10:
141 |                 url = SponsoredLinks.SEARCH_URL_0
142 |             else:
143 |                 url = SponsoredLinks.SEARCH_URL_1
144 |         else:
145 |             if self._results_per_page == 10:
146 |                 url = SponsoredLinks.NEXT_PAGE_0
147 |             else:
148 |                 url = SponsoredLinks.NEXT_PAGE_1
149 | 
150 |         safe_url = url % { 'query': urllib.quote_plus(self.query),
151 |                            'start': self._page * self._results_per_page,
152 |                            'num': self._results_per_page }
153 | 
154 |         try:
155 |             page = self.browser.get_page(safe_url)
156 |         except BrowserError, e:
157 |             raise SLError, "Failed getting %s: %s" % (e.url, e.error)
158 | 
159 |         return BeautifulSoup(page)
160 | 
161 |     def _extract_results(self, soup):
162 |         results = soup.findAll('div', {'class': 'g'})
163 |         ret_res = []
164 |         for result in results:
165 |             eres = self._extract_result(result)
166 |             if eres:
167 |                 ret_res.append(eres)
168 |         return ret_res
169 | 
170 |     def _extract_result(self, result):
171 |         title, url = self._extract_title_url(result)
172 |         display_url = self._extract_display_url(result) # Warning: removes 'cite' from the result
173 |         desc = self._extract_description(result)
174 |         if not title or not url or not display_url or not desc:
175 |             return None
176 |         return SponsoredLink(title, url, display_url, desc)
177 | 
178 |     def _extract_title_url(self, result):
179 |         title_a = result.find('a')
180 |         if not title_a:
181 |             self._maybe_raise(SLParseError, "Title tag in sponsored link was not found", result)
182 |             return None, None
183 |         title = ''.join(title_a.findAll(text=True))
184 |         title = self._html_unescape(title)
185 |         url = title_a['href']
186 |         match = re.search(r'q=(http[^&]+)&', url)
187 |         if not match:
188 |             self._maybe_raise(SLParseError, "URL inside a sponsored link was not found", result)
189 |             return None, None
190 |         url = urllib.unquote(match.group(1))
191 |         return title, url
192 | 
193 |     def _extract_display_url(self, result):
194 |         cite = result.find('cite')
195 |         if not cite:
196 |             self._maybe_raise(SLParseError, "<cite> not found inside result", result)
197 |             return None
198 | 
199 |         return ''.join(cite.findAll(text=True))
200 | 
201 |     def _extract_description(self, result):
202 |         cite = result.find('cite')
203 |         if not cite:
204 |             return None
205 |         cite.extract()
206 | 
207 |         desc_div = result.find('div', {'class': 'line23'})
208 |         if not desc_div:
209 |             self._maybe_raise(ParseError, "Description tag not found in sponsored link", result)
210 |             return None
211 | 
212 |         desc_strs = desc_div.findAll(text=True)[0:-1]
213 |         desc = ''.join(desc_strs)
214 |         desc = desc.replace("\n", " ")
215 |         desc = desc.replace("  ", " ")
216 |         return self._html_unescape(desc)
217 | 
218 |     def _html_unescape(self, str):
219 |         def entity_replacer(m):
220 |             entity = m.group(1)
221 |             if entity in name2codepoint:
222 |                 return unichr(name2codepoint[entity])
223 |             else:
224 |                 return m.group(0)
225 | 
226 |         def ascii_replacer(m):
227 |             cp = int(m.group(1))
228 |             if cp <= 255:
229 |                 return unichr(cp)
230 |             else:
231 |                 return m.group(0)
232 | 
233 |         s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
234 |         return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
235 | 
236 | 


--------------------------------------------------------------------------------
/xgoogle/translate.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | #
  3 | # Peteris Krumins (peter@catonmat.net)
  4 | # http://www.catonmat.net  --  good coders code, great reuse
  5 | #
  6 | # http://www.catonmat.net/blog/python-library-for-google-translate/
  7 | #
  8 | # Code is licensed under MIT license.
  9 | #
 10 | 
 11 | from browser import Browser, BrowserError
 12 | from urllib import quote_plus
 13 | 
 14 | try:    import json
 15 | except: import simplejson as json
 16 | 
 17 | 
 18 | class TranslationError(Exception):
 19 |     pass
 20 | 
 21 | class Translator(object):
 22 |     translate_url = "http://ajax.googleapis.com/ajax/services/language/translate?v=1.0&q=%(message)s&langpair=%(from)s%%7C%(to)s"
 23 | 
 24 |     def __init__(self):
 25 |         self.browser = Browser()
 26 | 
 27 |     def translate(self, message, lang_to='en', lang_from=''):
 28 |         """
 29 |         Given a 'message' translate it from 'lang_from' to 'lang_to'.
 30 |         If 'lang_from' is empty, auto-detects the language.
 31 |         Returns the translated message.
 32 |         """
 33 | 
 34 |         if lang_to not in _languages:
 35 |             raise TranslationError, "Language %s is not supported as lang_to." % lang_to
 36 |         if lang_from not in _languages and lang_from != '':
 37 |             raise TranslationError, "Language %s is not supported as lang_from." % lang_from
 38 | 
 39 |         message = quote_plus(message)
 40 |         real_url = Translator.translate_url % { 'message': message,
 41 |                                                 'from':    lang_from,
 42 |                                                 'to':      lang_to }
 43 | 
 44 |         try:
 45 |             translation = self.browser.get_page(real_url)
 46 |             data = json.loads(translation)
 47 | 
 48 |             if data['responseStatus'] != 200:
 49 |                 raise TranslationError, "Failed translating: %s" % data['responseDetails']
 50 | 
 51 |             return data['responseData']['translatedText']
 52 |         except BrowserError, e:
 53 |             raise TranslationError, "Failed translating (getting %s failed): %s" % (e.url, e.error)
 54 |         except ValueError, e:
 55 |             raise TranslationError, "Failed translating (json failed): %s" % e.message
 56 |         except KeyError, e:
 57 |             raise TranslationError, "Failed translating, response didn't contain the translation"
 58 | 
 59 |         return None
 60 | 
 61 | class DetectionError(Exception):
 62 |     pass
 63 | 
 64 | class Language(object):
 65 |     def __init__(self, lang, confidence, is_reliable):
 66 |         self.lang_code = lang
 67 |         self.lang = _languages[lang]
 68 |         self.confidence = confidence
 69 |         self.is_reliable = is_reliable
 70 | 
 71 |     def __repr__(self):
 72 |         return '<Language: %s (%s)>' % (self.lang_code, self.lang)
 73 | 
 74 | class LanguageDetector(object):
 75 |     detect_url = "http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&q=%(message)s"
 76 | 
 77 |     def __init__(self):
 78 |         self.browser = Browser()
 79 | 
 80 |     def detect(self, message):
 81 |         """
 82 |         Given a 'message' detects its language.
 83 |         Returns Language object.
 84 |         """
 85 | 
 86 |         message = quote_plus(message)
 87 |         real_url = LanguageDetector.detect_url % { 'message': message }
 88 | 
 89 |         try:
 90 |             detection = self.browser.get_page(real_url)
 91 |             data = json.loads(detection)
 92 | 
 93 |             if data['responseStatus'] != 200:
 94 |                 raise DetectionError, "Failed detecting language: %s" % data['responseDetails']
 95 | 
 96 |             rd = data['responseData']
 97 |             return Language(rd['language'], rd['confidence'], rd['isReliable'])
 98 | 
 99 |         except BrowserError, e:
100 |             raise DetectionError, "Failed detecting language (getting %s failed): %s" % (e.url, e.error)
101 |         except ValueError, e:
102 |             raise DetectionErrro, "Failed detecting language (json failed): %s" % e.message
103 |         except KeyError, e:
104 |             raise DetectionError, "Failed detecting language, response didn't contain the necessary data"
105 | 
106 |         return None
107 | 
108 | 
109 | _languages = {
110 |   'af': 'Afrikaans',
111 |   'sq': 'Albanian',
112 |   'am': 'Amharic',
113 |   'ar': 'Arabic',
114 |   'hy': 'Armenian',
115 |   'az': 'Azerbaijani',
116 |   'eu': 'Basque',
117 |   'be': 'Belarusian',
118 |   'bn': 'Bengali',
119 |   'bh': 'Bihari',
120 |   'bg': 'Bulgarian',
121 |   'my': 'Burmese',
122 |   'ca': 'Catalan',
123 |   'chr': 'Cherokee',
124 |   'zh': 'Chinese',
125 |   'zh-CN': 'Chinese_simplified',
126 |   'zh-TW': 'Chinese_traditional',
127 |   'hr': 'Croatian',
128 |   'cs': 'Czech',
129 |   'da': 'Danish',
130 |   'dv': 'Dhivehi',
131 |   'nl': 'Dutch',
132 |   'en': 'English',
133 |   'eo': 'Esperanto',
134 |   'et': 'Estonian',
135 |   'tl': 'Filipino',
136 |   'fi': 'Finnish',
137 |   'fr': 'French',
138 |   'gl': 'Galician',
139 |   'ka': 'Georgian',
140 |   'de': 'German',
141 |   'el': 'Greek',
142 |   'gn': 'Guarani',
143 |   'gu': 'Gujarati',
144 |   'iw': 'Hebrew',
145 |   'hi': 'Hindi',
146 |   'hu': 'Hungarian',
147 |   'is': 'Icelandic',
148 |   'id': 'Indonesian',
149 |   'iu': 'Inuktitut',
150 |   'ga': 'Irish',
151 |   'it': 'Italian',
152 |   'ja': 'Japanese',
153 |   'kn': 'Kannada',
154 |   'kk': 'Kazakh',
155 |   'km': 'Khmer',
156 |   'ko': 'Korean',
157 |   'ku': 'Kurdish',
158 |   'ky': 'Kyrgyz',
159 |   'lo': 'Laothian',
160 |   'lv': 'Latvian',
161 |   'lt': 'Lithuanian',
162 |   'mk': 'Macedonian',
163 |   'ms': 'Malay',
164 |   'ml': 'Malayalam',
165 |   'mt': 'Maltese',
166 |   'mr': 'Marathi',
167 |   'mn': 'Mongolian',
168 |   'ne': 'Nepali',
169 |   'no': 'Norwegian',
170 |   'or': 'Oriya',
171 |   'ps': 'Pashto',
172 |   'fa': 'Persian',
173 |   'pl': 'Polish',
174 |   'pt-PT': 'Portuguese',
175 |   'pa': 'Punjabi',
176 |   'ro': 'Romanian',
177 |   'ru': 'Russian',
178 |   'sa': 'Sanskrit',
179 |   'sr': 'Serbian',
180 |   'sd': 'Sindhi',
181 |   'si': 'Sinhalese',
182 |   'sk': 'Slovak',
183 |   'sl': 'Slovenian',
184 |   'es': 'Spanish',
185 |   'sw': 'Swahili',
186 |   'sv': 'Swedish',
187 |   'tg': 'Tajik',
188 |   'ta': 'Tamil',
189 |   'tl': 'Tagalog',
190 |   'te': 'Telugu',
191 |   'th': 'Thai',
192 |   'bo': 'Tibetan',
193 |   'tr': 'Turkish',
194 |   'uk': 'Ukrainian',
195 |   'ur': 'Urdu',
196 |   'uz': 'Uzbek',
197 |   'ug': 'Uighur',
198 |   'vi': 'Vietnamese',
199 |   'cy': 'Welsh',
200 |   'yi': 'Yiddish'
201 | };
202 | 
203 | 


--------------------------------------------------------------------------------