├── .gitignore
├── contributors.txt
├── examples
│   ├── example1.py
│   ├── example2.py
│   └── example3.py
├── programs
│   ├── english.py
│   ├── google_fight.py
│   └── google_fight2.py
├── projects-using-xgoogle.txt
├── readme.txt
├── setup.py
└── xgoogle
    ├── BeautifulSoup.py
    ├── __init__.py
    ├── browser.py
    ├── googlesets.py
    ├── search.py
    ├── sponsoredlinks.py
    └── translate.py
/.gitignore:
--------------------------------------------------------------------------------
1 | xgoogle.egg-info
2 | *~
3 | *.swp
4 |
--------------------------------------------------------------------------------
/contributors.txt:
--------------------------------------------------------------------------------
 1 | This file lists the people who have helped the xgoogle project:
2 |
3 | * Holger Berndt
4 | Thanks for adding:
5 | * 'lang' and 'tld' arguments to Google Search
6 | * 'filetype' search
7 | * 'last_search_url' property
8 | * 'date indexed' search
9 |
10 | * Juanjo Conti
11 | Thanks for adding Google Blog Search class
12 |
13 | * Steve Steiner
14 | Thanks for adding setup.py
15 |
16 | * azappella
17 | Thanks for fixing search.py after redesign
18 |
19 |
20 | PS. If I missed you, please email me at peter@catonmat.net, and I'll add you here!
21 |
22 |
--------------------------------------------------------------------------------
/examples/example1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # This program does a Google search for "quick and dirty" and returns
4 | # 50 results.
5 | #
6 |
7 | from xgoogle.search import GoogleSearch, SearchError
8 | try:
9 | gs = GoogleSearch("quick and dirty")
10 | gs.results_per_page = 50
11 | results = gs.get_results()
12 | for res in results:
13 | print res.title.encode('utf8')
14 | print res.desc.encode('utf8')
15 | print res.url.encode('utf8')
16 | print
17 | except SearchError, e:
18 | print "Search failed: %s" % e
19 |
20 |
--------------------------------------------------------------------------------
/examples/example2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
 3 | # This program finds where target_domain ranks in the search results for target_keyword.
4 | #
5 | #
6 |
7 | import re
8 | from urlparse import urlparse
9 | from xgoogle.search import GoogleSearch, SearchError
10 |
11 | target_domain = "catonmat.net"
12 | target_keyword = "python videos"
13 |
14 | def mk_nice_domain(domain):
15 | """
16 | convert domain into a nicer one (eg. www3.google.com into google.com)
17 | """
18 | domain = re.sub("^www(\d+)?\.", "", domain)
19 | # add more here
20 | return domain
21 |
22 | gs = GoogleSearch(target_keyword)
23 | gs.results_per_page = 100
24 | results = gs.get_results()
25 | for idx, res in enumerate(results):
26 | parsed = urlparse(res.url)
27 | domain = mk_nice_domain(parsed.netloc)
28 | if domain == target_domain:
29 | print "Ranking position %d for keyword %s on domain %s" % (idx+1, target_keyword, target_domain)
30 |
31 |
--------------------------------------------------------------------------------
/examples/example3.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
 3 | # This program uses GeoIP to find websites that match `dst_keyword` and
 4 | # are located in the `dst_country` country, and more specifically in one
 5 | # of the `dst_states` states of that country.
6 | #
7 |
8 | import GeoIP
9 | from urlparse import urlparse
10 | from xgoogle.search import GoogleSearch, SearchError
11 |
12 | class Geo(object):
13 | GEO_PATH = "/usr/local/geo_ip/GeoLiteCity.dat"
14 |
15 | def __init__(self):
16 | self.geo = GeoIP.open(Geo.GEO_PATH, GeoIP.GEOIP_STANDARD)
17 |
18 | def detect_by_host(self, host):
19 | try:
20 | gir = self.geo.record_by_name(host)
21 | return {'country': gir['country_code'].lower(),
22 | 'region': gir['region'].lower()}
23 | except Exception, e:
24 | return {'country': 'none', 'region': 'none'}
25 |
26 | dst_country = 'us'
27 | dst_states = ['ca', 'ny']
28 | dst_keyword = "wicked code"
29 | num_results = 10
30 | final_results = []
31 | geo = Geo()
32 |
33 | gs = GoogleSearch(dst_keyword)
34 | gs.results_per_page = 100
35 |
36 | seen_websites = []
 37 | while len(final_results) < num_results:
 38 |     results = gs.get_results()
 39 |     if not results:
 40 |         break  # Google returned no more results; avoid looping forever
 41 |     domains = [urlparse(r.url).netloc for r in results]
 42 |     for d in domains:
 43 |         geo_loc = geo.detect_by_host(d)
 44 |         if (geo_loc['country'] == dst_country and
 45 |             geo_loc['region'] in dst_states and
 46 |             d not in seen_websites):
 47 |             final_results.append((d, geo_loc['region']))
 48 |             seen_websites.append(d)
 49 |             if len(final_results) == num_results:
 50 |                 break
 51 |
 52 | print "Found %d websites:" % len(final_results)
 53 | for w in final_results:
 54 |     print "%s (state: %s)" % w
 55 |
 56 |
--------------------------------------------------------------------------------
/programs/english.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Peteris Krumins (peter@catonmat.net)
4 | # http://www.catonmat.net -- good coders code, great reuse
5 | #
6 |
7 | # Given a query such as "this is a * night", this program uses xgoogle to
8 | # search for this on google, find 100 results, and sort the results by number
9 | # of appearances.
10 | #
 11 | # For example, it may output:
12 | # 15 "this is a cold night"
13 | # 8 "this is a romantic night"
14 | # 5 "this is a nice night"
15 | # ... etc.
16 | #
17 | # This way you can quickly find adjectives associated with night.
18 | #
19 |
20 | import itertools
21 | import operator
22 | import sys
23 | import re
24 | from xgoogle.search import GoogleSearch, SearchError, ParseError
25 |
 26 | def count_results(results):
 27 |     return dict((item, len(list(group))) for item, group in itertools.groupby(sorted(results)))
 28 |
 29 | def print_results(results):
 30 |     sr = sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)
 31 |     for item, count in sr:
 32 |         print "%d\t%s" % (count, item)
33 |
34 | if __name__ == "__main__":
35 | args = sys.argv[1:]
36 | if not args:
 37 |         print 'Usage: %s "phrase with * in it"' % sys.argv[0]
38 | sys.exit(1)
 39 |     # join all arguments into a single query phrase
 40 |     args = ' '.join(args)
41 | query = '"%s"' % args
42 | gs = GoogleSearch(query)
43 | gs.results_per_page = 100
44 | results = gs.get_results()
45 | args_re = args.replace('*', '.*?')
46 | filter_re = re.compile(args_re, re.I)
 47 |     # scan each result's title and description for occurrences of the phrase
48 | nomnomnom = []
49 | for r in results:
50 | descs = filter_re.findall(r.desc)
51 | titles = filter_re.findall(r.title)
52 | nomnomnom += [x.lower().encode('utf-8') for x in descs + titles]
53 | results = count_results(nomnomnom)
54 | print_results(results)
55 |
56 |
--------------------------------------------------------------------------------
/programs/google_fight.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Peteris Krumins (peter@catonmat.net)
4 | # http://www.catonmat.net -- good coders code, great reuse
5 | #
6 | # A Google Fight program. v1.0
7 | #
8 | # Released under GNU GPL
9 | #
10 | # http://www.catonmat.net/blog/python-library-for-google-search
11 | #
12 |
13 | import sys
14 | from xgoogle.search import GoogleSearch, SearchError
15 |
16 | args = sys.argv[1:]
17 | if len(args) < 2:
18 | print 'Usage: google_fight.py "keyword 1" "keyword 2"'
19 | sys.exit(1)
20 |
21 | try:
22 | n0 = GoogleSearch('"%s"' % args[0]).num_results
23 | n1 = GoogleSearch('"%s"' % args[1]).num_results
24 | except SearchError, e:
25 | print "Google search failed: %s" % e
26 | sys.exit(1)
27 |
28 | if n0 > n1:
29 | print "%s wins with %d results! (%s had %d)" % (args[0], n0, args[1], n1)
30 | elif n1 > n0:
31 | print "%s wins with %d results! (%s had %d)" % (args[1], n1, args[0], n0)
32 | else:
33 | print "It's a tie! Both keywords have %d results!" % n1
34 |
35 |
--------------------------------------------------------------------------------
/programs/google_fight2.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Peteris Krumins (peter@catonmat.net)
4 | # http://www.catonmat.net -- good coders code, great reuse
5 | #
6 | # A Google Fight program. v1.0
7 | #
8 | # Released under GNU GPL
9 | #
10 | # http://www.catonmat.net/blog/python-library-for-google-search
11 | #
12 |
13 | import sys
14 | from operator import itemgetter
15 | from xgoogle.search import GoogleSearch, SearchError
16 |
17 | args = sys.argv[1:]
18 | if not args:
19 | print "Usage: google_fight.py keyword one, keyword two, ..."
20 | sys.exit(1)
21 |
22 | keywords = [k.strip() for k in ' '.join(args).split(',')]
23 | try:
24 | results = [(k, GoogleSearch('"%s"' % k).num_results) for k in keywords]
25 | except SearchError, e:
26 | print "Google search failed: %s" % e
27 | sys.exit(1)
28 |
29 | results.sort(key=itemgetter(1), reverse=True)
30 | for res in results:
31 | print "%s: %d" % res
32 |
33 |
--------------------------------------------------------------------------------
/projects-using-xgoogle.txt:
--------------------------------------------------------------------------------
 1 | Here is a list of projects by other people that use my library:
2 |
3 | * fimap -- http://code.google.com/p/fimap/
4 |
 5 |   fimap is a little python tool which can find, prepare, audit, exploit and
 6 |   even google automatically for local and remote file inclusion bugs in webapps.
7 |
8 | * translate.org.za -- http://www.translate.org.za/
9 |
10 | Translate.org.za is focused on the localisation, or translation, of Open
11 | Source software into South Africa's 11 official languages and localisations
12 | of GNOME, KDE, OpenOffice.org, Firefox and Thunderbird.
13 |
14 | * Top40 -- http://github.com/staticd/Top40
15 |
16 | This program grabs the top 40 artists played on Alt Nation (a Sirius XM
17 | radio channel) over the past week and runs a Google search on each artist in
18 | an attempt to find links to free music from the artist being searched.
19 | Thanks to xgoogle, I have an interface with Google to make this program
20 | successful.
21 |
22 |
--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
1 | This is a Google library called 'xgoogle'. Current version is 1.3.
2 |
3 | It's written by Peteris Krumins (peter@catonmat.net).
4 | His blog is at http://www.catonmat.net -- good coders code, great reuse.
5 |
6 | The code is licensed under MIT license.
7 |
8 | --------------------------------------------------------------------------
9 |
10 | At the moment it contains:
11 | * Google Search module xgoogle/search.py.
12 | http://www.catonmat.net/blog/python-library-for-google-search/
13 |
14 | * Google Sponsored Links Search module xgoogle/sponsoredlinks.py
15 | http://www.catonmat.net/blog/python-library-for-google-sponsored-links-search/
16 |
17 | * Google Sets module xgoogle/googlesets.py
18 | http://www.catonmat.net/blog/python-library-for-google-sets/
19 |
20 | * Google Translate module xgoogle/translate.py
21 | http://www.catonmat.net/blog/python-library-for-google-translate/
22 |
23 | --------------------------------------------------------------------------
24 |
 25 | Here is an example usage of the Google Search module:
26 |
27 | >>> from xgoogle.search import GoogleSearch
28 | >>> gs = GoogleSearch("catonmat")
29 | >>> gs.results_per_page = 25
30 | >>> results = gs.get_results()
31 | >>> for res in results:
32 | ... print res.title.encode('utf8')
33 | ...
34 |
35 | output:
36 |
37 | good coders code, great reuse
38 | MIT's Introduction to Algorithms, Lectures 1 and 2: Analysis of ...
39 | catonmat - Google Code
40 | ...
41 |
42 | The GoogleSearch object has several public methods and properties:
43 |
44 | method get_results() - gets a page of results, returning a list of SearchResult objects.
45 | property num_results - returns number of search results found.
46 | property results_per_page - sets/gets the number of results to get per page.
47 | property page - sets/gets the search page.
48 |
49 | A SearchResult object has three attributes -- "title", "desc", and "url".
 50 | They are Unicode strings, so encode them properly before outputting them.
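
For instance, here is a minimal sketch that pages through all the results
(this assumes each get_results() call fetches the next page and that an
empty list means there are no more results):

>>> from xgoogle.search import GoogleSearch
>>> gs = GoogleSearch("catonmat")
>>> while True:
...     results = gs.get_results()
...     if not results:
...         break
...     for res in results:
...         print res.url.encode('utf8')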
51 |
52 | --------------------------------------------------------------------------
53 |
 54 | Here is an example usage of the Google Sponsored Links Search module:
55 |
56 | >>> from xgoogle.sponsoredlinks import SponsoredLinks, SLError
57 | >>> sl = SponsoredLinks("video software")
58 | >>> sl.results_per_page = 100
59 | >>> results = sl.get_results()
60 | >>> for result in results:
61 | ... print result.title.encode('utf8')
62 | ...
63 |
64 | output:
65 |
66 | Photoshop Video Software
67 | Video Poker Software
68 | DVD/Video Rental Software
69 | ...
70 |
71 | The SponsoredLinks object has several public methods and properties:
72 |
 73 | method get_results() - gets a page of results, returning a list of SponsoredLink objects.
74 | property num_results - returns number of search results found.
75 | property results_per_page - sets/gets the number of results to get per page.
76 |
77 | A SponsoredLink object has four attributes -- "title", "desc", "url", and "display_url".
 78 | They are Unicode strings; don't forget to encode them properly before outputting them.
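
For instance, a short sketch that prints each sponsored link's display URL
next to its real URL (using only the attributes documented above):

>>> from xgoogle.sponsoredlinks import SponsoredLinks
>>> sl = SponsoredLinks("video software")
>>> for r in sl.get_results():
...     print "%s -> %s" % (r.display_url.encode('utf8'), r.url.encode('utf8'))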
79 |
80 | --------------------------------------------------------------------------
81 |
 82 | Here is an example usage of the Google Sets module:
83 |
84 | >>> from xgoogle.googlesets import GoogleSets
85 | >>> gs = GoogleSets(['red', 'yellow'])
86 | >>> results = gs.get_results()
87 | >>> print len(results)
88 | >>> for r in results:
89 | ... print r.encode('utf8')
90 | ...
91 |
92 | output:
93 |
94 | red
95 | yellow
96 | blue
97 | white
98 | ...
99 |
100 | The GoogleSets object has only one public method, get_results(set_type). The default
101 | value for set_type is SMALL_SET, which makes it return 15 related items or fewer.
102 | Use LARGE_SET to get more than 15 items. The get_results() method returns a list of
103 | related items represented as unicode strings.
104 | Don't forget to do the proper encoding when outputting these strings!
105 |
106 | Here is an example showing differences between SMALL_SET and LARGE_SET:
107 |
108 | >>> from xgoogle.googlesets import GoogleSets, LARGE_SET, SMALL_SET
109 | >>> gs = GoogleSets(['python', 'perl'])
110 | >>> results_small = gs.get_results() # SMALL_SET by default
111 | >>> len(results_small)
112 | 11
113 | >>> results_small
114 | [u'python', u'perl', u'php', u'ruby', u'java', u'javascript', u'c++', u'c',
115 | u'cgi', u'tcl', u'c#']
116 | >>>
117 | >>> results_large = gs.get_results(LARGE_SET)
118 | >>> len(results_large)
119 | 46
120 | >>> results_large
121 | [u'perl', u'python', u'java', u'c++', u'php', u'c', u'c#', u'javascript',
122 | u'howto', u'wiki', u'raid', u'dd', u'linux', u'ruby', u'language', u'xml',
123 | u'sgml', u'svn', u'kernel', ...]
124 |
125 |
126 | --------------------------------------------------------------------------
127 |
128 | Here is an example usage of the Google Translate module:
129 |
130 | >>> from xgoogle.translate import Translator
131 | >>>
132 | >>> translate = Translator().translate
133 | >>> print translate("Mani sauc Pēteris", lang_to="ru").encode('utf-8')
134 | Меня зовут Петр
135 | >>> print translate("Mani sauc Pēteris", lang_to="en")
136 | My name is Peter
137 | >>> print translate("Меня зовут Петр")
138 | My name is Peter
139 |
140 | The "translate" function takes three arguments - "message", "lang_from" and "lang_to".
141 | If "lang_from" is not given, Google's translation service auto-detects it.
142 | If "lang_to" is not given, it defaults to "en" (English).
143 |
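For instance, you can force the source language instead of relying on
auto-detection (a sketch; the exact output text depends on Google's
service, so it is not shown here):

>>> result = translate("bonjour", lang_from="fr", lang_to="en")
>>> print result.encode('utf-8')
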
144 | In case of an error the "translate" function throws "TranslationError" exception.
145 | Make sure to wrap your code in try/except block to catch it:
146 |
147 | >>> from xgoogle.translate import Translator, TranslationError
148 | >>>
149 | >>> try:
150 | >>> translate = Translator().translate
151 | >>> print translate("")
152 | >>> except TranslationError, e:
153 | >>> print e
154 |
155 | Failed translating: invalid text
156 |
157 |
158 | The Google Translate module also provides "LanguageDetector" class that can be used
159 | to detect the language of the text.
160 |
161 | Here is an example usage of LanguageDetector:
162 |
163 | >>> from xgoogle.translate import LanguageDetector, DetectionError
164 | >>>
165 | >>> detect = LanguageDetector().detect
166 | >>> english = detect("This is a wonderful library.")
167 | >>> english.lang_code
168 | 'en'
169 | >>> english.lang
170 | 'English'
171 | >>> english.confidence
172 | 0.28078437000000001
173 | >>> english.is_reliable
174 | True
175 |
176 | The "DetectionError" may get raised if the detection failed.
177 |
178 |
179 | --------------------------------------------------------------------------
180 |
181 |
182 | Version history:
183 |
184 | v1.0: * initial release, xgoogle library contains just the Google Search.
185 | v1.1: * added Google Sponsored Links Search.
186 | * fixed a bug in browser.py that might have thrown an unexpected exception.
187 | v1.2: * added Google Sets module
188 | v1.3: * added Google Translate module
189 | * fixed a bug in browser.py when KeyboardInterrupt did not get propagated.
190 |
191 | --------------------------------------------------------------------------
192 |
193 | That's it. Have fun! :)
194 |
195 |
196 | Sincerely,
197 | Peteris Krumins
198 | http://www.catonmat.net
199 |
200 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import sys
3 |
4 | __version__ = '1.3'
5 |
6 | import os
7 | def _read(fname):
8 | return open(os.path.join(os.path.dirname(__file__), fname)).read()
9 |
10 | setup(
11 | name='xgoogle',
12 | version=__version__,
 13 |     description="Python library for Google services (google search, google sets, google translate, sponsored links)",
14 | long_description=_read('readme.txt'),
15 | classifiers=[],
16 | keywords='google search',
17 | author='Peteris Krumins',
18 | author_email='peter@catonmat.net',
19 | url='http://github.com/pkrumins/xgoogle',
20 | license='MIT',
21 | packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
22 | entry_points={
23 | # -*- Entry points: -*-
24 | },
25 | include_package_data=True,
26 | zip_safe=False,
27 | install_requires=[
28 | # -*- Extra requirements: -*-
29 | ],
30 | )
31 |
--------------------------------------------------------------------------------
/xgoogle/BeautifulSoup.py:
--------------------------------------------------------------------------------
1 | """Beautiful Soup
2 | Elixir and Tonic
3 | "The Screen-Scraper's Friend"
4 | http://www.crummy.com/software/BeautifulSoup/
5 |
6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7 | tree representation. It provides methods and Pythonic idioms that make
8 | it easy to navigate, search, and modify the tree.
9 |
10 | A well-formed XML/HTML document yields a well-formed data
11 | structure. An ill-formed XML/HTML document yields a correspondingly
12 | ill-formed data structure. If your document is only locally
13 | well-formed, you can use this library to find and process the
14 | well-formed part of it.
15 |
16 | Beautiful Soup works with Python 2.2 and up. It has no external
17 | dependencies, but you'll have more success at converting data to UTF-8
18 | if you also install these three packages:
19 |
20 | * chardet, for auto-detecting character encodings
21 | http://chardet.feedparser.org/
22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported
23 | by stock Python.
24 | http://cjkpython.i18n.org/
25 |
26 | Beautiful Soup defines classes for two main parsing strategies:
27 |
28 | * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29 | language that kind of looks like XML.
30 |
31 | * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32 | or invalid. This class has web browser-like heuristics for
33 | obtaining a sensible parse tree in the face of common HTML errors.
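
For example, a minimal usage sketch (importing the copy bundled in this
repository as xgoogle.BeautifulSoup):

    from xgoogle.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup("<html><p>Some<b>bad<i>HTML")
    print soup.prettify()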
34 |
35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36 | the encoding of an HTML or XML document, and converting it to
37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
38 |
39 | For more than you ever wanted to know about Beautiful Soup, see the
40 | documentation:
41 | http://www.crummy.com/software/BeautifulSoup/documentation.html
42 |
43 | Here, have some legalese:
44 |
45 | Copyright (c) 2004-2007, Leonard Richardson
46 |
47 | All rights reserved.
48 |
49 | Redistribution and use in source and binary forms, with or without
50 | modification, are permitted provided that the following conditions are
51 | met:
52 |
53 | * Redistributions of source code must retain the above copyright
54 | notice, this list of conditions and the following disclaimer.
55 |
56 | * Redistributions in binary form must reproduce the above
57 | copyright notice, this list of conditions and the following
58 | disclaimer in the documentation and/or other materials provided
59 | with the distribution.
60 |
61 | * Neither the name of the the Beautiful Soup Consortium and All
62 | Night Kosher Bakery nor the names of its contributors may be
63 | used to endorse or promote products derived from this software
64 | without specific prior written permission.
65 |
66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
77 |
78 | """
79 | from __future__ import generators
80 |
81 | __author__ = "Leonard Richardson (leonardr@segfault.org)"
82 | __version__ = "3.0.6"
83 | __copyright__ = "Copyright (c) 2004-2008 Leonard Richardson"
84 | __license__ = "New-style BSD"
85 |
86 | from sgmllib import SGMLParser, SGMLParseError
87 | import codecs
88 | import types
89 | import re
90 | import sgmllib
91 | try:
92 | from htmlentitydefs import name2codepoint
93 | except ImportError:
94 | name2codepoint = {}
95 |
96 | #This hack makes Beautiful Soup able to parse XML with namespaces
97 | sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
98 |
99 | DEFAULT_OUTPUT_ENCODING = "utf-8"
100 |
101 | # First, the classes that represent markup elements.
102 |
103 | class PageElement:
104 | """Contains the navigational information for some part of the page
105 | (either a tag or a piece of text)"""
106 |
107 | def setup(self, parent=None, previous=None):
108 | """Sets up the initial relations between this element and
109 | other elements."""
110 | self.parent = parent
111 | self.previous = previous
112 | self.next = None
113 | self.previousSibling = None
114 | self.nextSibling = None
115 | if self.parent and self.parent.contents:
116 | self.previousSibling = self.parent.contents[-1]
117 | self.previousSibling.nextSibling = self
118 |
119 | def replaceWith(self, replaceWith):
120 | oldParent = self.parent
121 | myIndex = self.parent.contents.index(self)
122 | if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
123 | # We're replacing this element with one of its siblings.
124 | index = self.parent.contents.index(replaceWith)
125 | if index and index < myIndex:
126 | # Furthermore, it comes before this element. That
127 | # means that when we extract it, the index of this
128 | # element will change.
129 | myIndex = myIndex - 1
130 | self.extract()
131 | oldParent.insert(myIndex, replaceWith)
132 |
133 | def extract(self):
134 | """Destructively rips this element out of the tree."""
135 | if self.parent:
136 | try:
137 | self.parent.contents.remove(self)
138 | except ValueError:
139 | pass
140 |
141 | #Find the two elements that would be next to each other if
142 | #this element (and any children) hadn't been parsed. Connect
143 | #the two.
144 | lastChild = self._lastRecursiveChild()
145 | nextElement = lastChild.next
146 |
147 | if self.previous:
148 | self.previous.next = nextElement
149 | if nextElement:
150 | nextElement.previous = self.previous
151 | self.previous = None
152 | lastChild.next = None
153 |
154 | self.parent = None
155 | if self.previousSibling:
156 | self.previousSibling.nextSibling = self.nextSibling
157 | if self.nextSibling:
158 | self.nextSibling.previousSibling = self.previousSibling
159 | self.previousSibling = self.nextSibling = None
160 | return self
161 |
162 | def _lastRecursiveChild(self):
163 | "Finds the last element beneath this object to be parsed."
164 | lastChild = self
165 | while hasattr(lastChild, 'contents') and lastChild.contents:
166 | lastChild = lastChild.contents[-1]
167 | return lastChild
168 |
169 | def insert(self, position, newChild):
170 | if (isinstance(newChild, basestring)
171 | or isinstance(newChild, unicode)) \
172 | and not isinstance(newChild, NavigableString):
173 | newChild = NavigableString(newChild)
174 |
175 | position = min(position, len(self.contents))
176 | if hasattr(newChild, 'parent') and newChild.parent != None:
177 | # We're 'inserting' an element that's already one
178 | # of this object's children.
179 | if newChild.parent == self:
180 | index = self.find(newChild)
181 | if index and index < position:
182 | # Furthermore we're moving it further down the
183 | # list of this object's children. That means that
184 | # when we extract this element, our target index
185 | # will jump down one.
186 | position = position - 1
187 | newChild.extract()
188 |
189 | newChild.parent = self
190 | previousChild = None
191 | if position == 0:
192 | newChild.previousSibling = None
193 | newChild.previous = self
194 | else:
195 | previousChild = self.contents[position-1]
196 | newChild.previousSibling = previousChild
197 | newChild.previousSibling.nextSibling = newChild
198 | newChild.previous = previousChild._lastRecursiveChild()
199 | if newChild.previous:
200 | newChild.previous.next = newChild
201 |
202 | newChildsLastElement = newChild._lastRecursiveChild()
203 |
204 | if position >= len(self.contents):
205 | newChild.nextSibling = None
206 |
207 | parent = self
208 | parentsNextSibling = None
209 | while not parentsNextSibling:
210 | parentsNextSibling = parent.nextSibling
211 | parent = parent.parent
212 | if not parent: # This is the last element in the document.
213 | break
214 | if parentsNextSibling:
215 | newChildsLastElement.next = parentsNextSibling
216 | else:
217 | newChildsLastElement.next = None
218 | else:
219 | nextChild = self.contents[position]
220 | newChild.nextSibling = nextChild
221 | if newChild.nextSibling:
222 | newChild.nextSibling.previousSibling = newChild
223 | newChildsLastElement.next = nextChild
224 |
225 | if newChildsLastElement.next:
226 | newChildsLastElement.next.previous = newChildsLastElement
227 | self.contents.insert(position, newChild)
228 |
229 | def append(self, tag):
230 | """Appends the given tag to the contents of this tag."""
231 | self.insert(len(self.contents), tag)
232 |
233 | def findNext(self, name=None, attrs={}, text=None, **kwargs):
234 | """Returns the first item that matches the given criteria and
235 | appears after this Tag in the document."""
236 | return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
237 |
238 | def findAllNext(self, name=None, attrs={}, text=None, limit=None,
239 | **kwargs):
240 | """Returns all items that match the given criteria and appear
241 | after this Tag in the document."""
242 | return self._findAll(name, attrs, text, limit, self.nextGenerator,
243 | **kwargs)
244 |
245 | def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
246 | """Returns the closest sibling to this Tag that matches the
247 | given criteria and appears after this Tag in the document."""
248 | return self._findOne(self.findNextSiblings, name, attrs, text,
249 | **kwargs)
250 |
251 | def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
252 | **kwargs):
253 | """Returns the siblings of this Tag that match the given
254 | criteria and appear after this Tag in the document."""
255 | return self._findAll(name, attrs, text, limit,
256 | self.nextSiblingGenerator, **kwargs)
257 | fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
258 |
259 | def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
260 | """Returns the first item that matches the given criteria and
261 | appears before this Tag in the document."""
262 | return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
263 |
264 | def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
265 | **kwargs):
266 | """Returns all items that match the given criteria and appear
267 | before this Tag in the document."""
268 | return self._findAll(name, attrs, text, limit, self.previousGenerator,
269 | **kwargs)
270 | fetchPrevious = findAllPrevious # Compatibility with pre-3.x
271 |
272 | def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
273 | """Returns the closest sibling to this Tag that matches the
274 | given criteria and appears before this Tag in the document."""
275 | return self._findOne(self.findPreviousSiblings, name, attrs, text,
276 | **kwargs)
277 |
278 | def findPreviousSiblings(self, name=None, attrs={}, text=None,
279 | limit=None, **kwargs):
280 | """Returns the siblings of this Tag that match the given
281 | criteria and appear before this Tag in the document."""
282 | return self._findAll(name, attrs, text, limit,
283 | self.previousSiblingGenerator, **kwargs)
284 | fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
285 |
286 | def findParent(self, name=None, attrs={}, **kwargs):
287 | """Returns the closest parent of this Tag that matches the given
288 | criteria."""
289 | # NOTE: We can't use _findOne because findParents takes a different
290 | # set of arguments.
291 | r = None
292 | l = self.findParents(name, attrs, 1)
293 | if l:
294 | r = l[0]
295 | return r
296 |
297 | def findParents(self, name=None, attrs={}, limit=None, **kwargs):
298 | """Returns the parents of this Tag that match the given
299 | criteria."""
300 |
301 | return self._findAll(name, attrs, None, limit, self.parentGenerator,
302 | **kwargs)
303 | fetchParents = findParents # Compatibility with pre-3.x
304 |
305 | #These methods do the real heavy lifting.
306 |
307 | def _findOne(self, method, name, attrs, text, **kwargs):
308 | r = None
309 | l = method(name, attrs, text, 1, **kwargs)
310 | if l:
311 | r = l[0]
312 | return r
313 |
314 | def _findAll(self, name, attrs, text, limit, generator, **kwargs):
315 | "Iterates over a generator looking for things that match."
316 |
317 | if isinstance(name, SoupStrainer):
318 | strainer = name
319 | else:
320 | # Build a SoupStrainer
321 | strainer = SoupStrainer(name, attrs, text, **kwargs)
322 | results = ResultSet(strainer)
323 | g = generator()
324 | while True:
325 | try:
326 | i = g.next()
327 | except StopIteration:
328 | break
329 | if i:
330 | found = strainer.search(i)
331 | if found:
332 | results.append(found)
333 | if limit and len(results) >= limit:
334 | break
335 | return results
336 |
337 | #These Generators can be used to navigate starting from both
338 | #NavigableStrings and Tags.
339 | def nextGenerator(self):
340 | i = self
341 | while i:
342 | i = i.next
343 | yield i
344 |
345 | def nextSiblingGenerator(self):
346 | i = self
347 | while i:
348 | i = i.nextSibling
349 | yield i
350 |
351 | def previousGenerator(self):
352 | i = self
353 | while i:
354 | i = i.previous
355 | yield i
356 |
357 | def previousSiblingGenerator(self):
358 | i = self
359 | while i:
360 | i = i.previousSibling
361 | yield i
362 |
363 | def parentGenerator(self):
364 | i = self
365 | while i:
366 | i = i.parent
367 | yield i
368 |
369 | # Utility methods
370 | def substituteEncoding(self, str, encoding=None):
371 | encoding = encoding or "utf-8"
372 | return str.replace("%SOUP-ENCODING%", encoding)
373 |
374 | def toEncoding(self, s, encoding=None):
375 | """Encodes an object to a string in some encoding, or to Unicode.
376 |         """
377 | if isinstance(s, unicode):
378 | if encoding:
379 | s = s.encode(encoding)
380 | elif isinstance(s, str):
381 | if encoding:
382 | s = s.encode(encoding)
383 | else:
384 | s = unicode(s)
385 | else:
386 | if encoding:
387 | s = self.toEncoding(str(s), encoding)
388 | else:
389 | s = unicode(s)
390 | return s
391 |
392 | class NavigableString(unicode, PageElement):
393 |
394 | def __getnewargs__(self):
395 | return (NavigableString.__str__(self),)
396 |
397 | def __getattr__(self, attr):
398 | """text.string gives you text. This is for backwards
399 | compatibility for Navigable*String, but for CData* it lets you
400 | get the string without the CData wrapper."""
401 | if attr == 'string':
402 | return self
403 | else:
404 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
405 |
406 | def __unicode__(self):
407 | return str(self).decode(DEFAULT_OUTPUT_ENCODING)
408 |
409 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
410 | if encoding:
411 | return self.encode(encoding)
412 | else:
413 | return self
414 |
415 | class CData(NavigableString):
416 |
417 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
418 | return "" % NavigableString.__str__(self, encoding)
419 |
420 | class ProcessingInstruction(NavigableString):
421 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
422 | output = self
423 | if "%SOUP-ENCODING%" in output:
424 | output = self.substituteEncoding(output, encoding)
425 |         return "<?%s?>" % self.toEncoding(output, encoding)
426 |
427 | class Comment(NavigableString):
428 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
429 | return "" % NavigableString.__str__(self, encoding)
430 |
431 | class Declaration(NavigableString):
432 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
433 | return "" % NavigableString.__str__(self, encoding)
434 |
435 | class Tag(PageElement):
436 |
437 | """Represents a found HTML tag with its attributes and contents."""
438 |
439 | def _invert(h):
440 | "Cheap function to invert a hash."
441 | i = {}
442 | for k,v in h.items():
443 | i[v] = k
444 | return i
445 |
446 | XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
447 | "quot" : '"',
448 | "amp" : "&",
449 | "lt" : "<",
450 | "gt" : ">" }
451 |
452 | XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
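    # For example, the inverted map sends "'" -> "apos", '"' -> "quot",
    # "&" -> "amp", "<" -> "lt" and ">" -> "gt".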
453 |
454 | def _convertEntities(self, match):
455 | """Used in a call to re.sub to replace HTML, XML, and numeric
456 | entities with the appropriate Unicode characters. If HTML
457 | entities are being converted, any unrecognized entities are
458 | escaped."""
459 | x = match.group(1)
460 | if self.convertHTMLEntities and x in name2codepoint:
461 | return unichr(name2codepoint[x])
462 | elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
463 | if self.convertXMLEntities:
464 | return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
465 | else:
466 | return u'&%s;' % x
467 | elif len(x) > 0 and x[0] == '#':
468 | # Handle numeric entities
469 | if len(x) > 1 and x[1] == 'x':
470 | return unichr(int(x[2:], 16))
471 | else:
472 | return unichr(int(x[1:]))
473 |
474 | elif self.escapeUnrecognizedEntities:
475 | return u'&%s;' % x
476 | else:
477 | return u'&%s;' % x
478 |
479 | def __init__(self, parser, name, attrs=None, parent=None,
480 | previous=None):
481 | "Basic constructor."
482 |
483 | # We don't actually store the parser object: that lets extracted
484 | # chunks be garbage-collected
485 | self.parserClass = parser.__class__
486 | self.isSelfClosing = parser.isSelfClosingTag(name)
487 | self.name = name
488 | if attrs == None:
489 | attrs = []
490 | self.attrs = attrs
491 | self.contents = []
492 | self.setup(parent, previous)
493 | self.hidden = False
494 | self.containsSubstitutions = False
495 | self.convertHTMLEntities = parser.convertHTMLEntities
496 | self.convertXMLEntities = parser.convertXMLEntities
497 | self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
498 |
499 | # Convert any HTML, XML, or numeric entities in the attribute values.
500 | convert = lambda(k, val): (k,
501 | re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
502 | self._convertEntities,
503 | val))
504 | self.attrs = map(convert, self.attrs)
505 |
506 | def get(self, key, default=None):
507 | """Returns the value of the 'key' attribute for the tag, or
508 | the value given for 'default' if it doesn't have that
509 | attribute."""
510 | return self._getAttrMap().get(key, default)
511 |
512 | def has_key(self, key):
513 | return self._getAttrMap().has_key(key)
514 |
515 | def __getitem__(self, key):
516 | """tag[key] returns the value of the 'key' attribute for the tag,
517 | and throws an exception if it's not there."""
518 | return self._getAttrMap()[key]
519 |
520 | def __iter__(self):
521 | "Iterating over a tag iterates over its contents."
522 | return iter(self.contents)
523 |
524 | def __len__(self):
525 | "The length of a tag is the length of its list of contents."
526 | return len(self.contents)
527 |
528 | def __contains__(self, x):
529 | return x in self.contents
530 |
531 | def __nonzero__(self):
532 | "A tag is non-None even if it has no contents."
533 | return True
534 |
535 | def __setitem__(self, key, value):
536 | """Setting tag[key] sets the value of the 'key' attribute for the
537 | tag."""
538 | self._getAttrMap()
539 | self.attrMap[key] = value
540 | found = False
541 | for i in range(0, len(self.attrs)):
542 | if self.attrs[i][0] == key:
543 | self.attrs[i] = (key, value)
544 | found = True
545 | if not found:
546 | self.attrs.append((key, value))
547 | self._getAttrMap()[key] = value
548 |
549 | def __delitem__(self, key):
550 | "Deleting tag[key] deletes all 'key' attributes for the tag."
551 | for item in self.attrs:
552 | if item[0] == key:
553 | self.attrs.remove(item)
554 | #We don't break because bad HTML can define the same
555 | #attribute multiple times.
556 | self._getAttrMap()
557 | if self.attrMap.has_key(key):
558 | del self.attrMap[key]
559 |
560 | def __call__(self, *args, **kwargs):
561 | """Calling a tag like a function is the same as calling its
562 | findAll() method. Eg. tag('a') returns a list of all the A tags
563 | found within this tag."""
564 | return apply(self.findAll, args, kwargs)
565 |
566 | def __getattr__(self, tag):
567 | #print "Getattr %s.%s" % (self.__class__, tag)
568 | if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
569 | return self.find(tag[:-3])
570 | elif tag.find('__') != 0:
571 | return self.find(tag)
572 | raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
573 |
574 | def __eq__(self, other):
575 | """Returns true iff this tag has the same name, the same attributes,
576 | and the same contents (recursively) as the given tag.
577 |
578 | NOTE: right now this will return false if two tags have the
579 | same attributes in a different order. Should this be fixed?"""
580 | if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
581 | return False
582 | for i in range(0, len(self.contents)):
583 | if self.contents[i] != other.contents[i]:
584 | return False
585 | return True
586 |
587 | def __ne__(self, other):
588 | """Returns true iff this tag is not identical to the other tag,
589 | as defined in __eq__."""
590 | return not self == other
591 |
592 | def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
593 | """Renders this tag as a string."""
594 | return self.__str__(encoding)
595 |
596 | def __unicode__(self):
597 | return self.__str__(None)
598 |
599 | BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
600 | + "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
601 | + ")")
602 |
603 | def _sub_entity(self, x):
604 | """Used with a regular expression to substitute the
605 | appropriate XML entity for an XML special character."""
606 | return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
607 |
608 | def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
609 | prettyPrint=False, indentLevel=0):
610 | """Returns a string or Unicode representation of this tag and
611 | its contents. To get Unicode, pass None for encoding.
612 |
613 | NOTE: since Python's HTML parser consumes whitespace, this
614 | method is not certain to reproduce the whitespace present in
615 | the original string."""
616 |
617 | encodedName = self.toEncoding(self.name, encoding)
618 |
619 | attrs = []
620 | if self.attrs:
621 | for key, val in self.attrs:
622 | fmt = '%s="%s"'
623 | if isString(val):
624 | if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
625 | val = self.substituteEncoding(val, encoding)
626 |
627 | # The attribute value either:
628 | #
629 | # * Contains no embedded double quotes or single quotes.
630 | # No problem: we enclose it in double quotes.
631 | # * Contains embedded single quotes. No problem:
632 | # double quotes work here too.
633 | # * Contains embedded double quotes. No problem:
634 | # we enclose it in single quotes.
635 | # * Embeds both single _and_ double quotes. This
636 | # can't happen naturally, but it can happen if
637 | # you modify an attribute value after parsing
638 | # the document. Now we have a bit of a
639 | # problem. We solve it by enclosing the
640 | # attribute in single quotes, and escaping any
641 | # embedded single quotes to XML entities.
642 | if '"' in val:
643 | fmt = "%s='%s'"
644 | if "'" in val:
645 | # TODO: replace with apos when
646 | # appropriate.
647 | val = val.replace("'", "&squot;")
648 |
649 | # Now we're okay w/r/t quotes. But the attribute
650 | # value might also contain angle brackets, or
651 | # ampersands that aren't part of entities. We need
652 | # to escape those to XML entities too.
653 | val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
654 |
655 | attrs.append(fmt % (self.toEncoding(key, encoding),
656 | self.toEncoding(val, encoding)))
657 | close = ''
658 | closeTag = ''
659 | if self.isSelfClosing:
660 | close = ' /'
661 | else:
662 |             closeTag = '</%s>' % encodedName
663 |
664 | indentTag, indentContents = 0, 0
665 | if prettyPrint:
666 | indentTag = indentLevel
667 | space = (' ' * (indentTag-1))
668 | indentContents = indentTag + 1
669 | contents = self.renderContents(encoding, prettyPrint, indentContents)
670 | if self.hidden:
671 | s = contents
672 | else:
673 | s = []
674 | attributeString = ''
675 | if attrs:
676 | attributeString = ' ' + ' '.join(attrs)
677 | if prettyPrint:
678 | s.append(space)
679 | s.append('<%s%s%s>' % (encodedName, attributeString, close))
680 | if prettyPrint:
681 | s.append("\n")
682 | s.append(contents)
683 | if prettyPrint and contents and contents[-1] != "\n":
684 | s.append("\n")
685 | if prettyPrint and closeTag:
686 | s.append(space)
687 | s.append(closeTag)
688 | if prettyPrint and closeTag and self.nextSibling:
689 | s.append("\n")
690 | s = ''.join(s)
691 | return s
692 |
693 | def decompose(self):
694 | """Recursively destroys the contents of this tree."""
695 | contents = [i for i in self.contents]
696 | for i in contents:
697 | if isinstance(i, Tag):
698 | i.decompose()
699 | else:
700 | i.extract()
701 | self.extract()
702 |
703 | def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
704 | return self.__str__(encoding, True)
705 |
706 | def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
707 | prettyPrint=False, indentLevel=0):
708 | """Renders the contents of this tag as a string in the given
709 |         encoding. If encoding is None, returns a Unicode string."""
710 | s=[]
711 | for c in self:
712 | text = None
713 | if isinstance(c, NavigableString):
714 | text = c.__str__(encoding)
715 | elif isinstance(c, Tag):
716 | s.append(c.__str__(encoding, prettyPrint, indentLevel))
717 | if text and prettyPrint:
718 | text = text.strip()
719 | if text:
720 | if prettyPrint:
721 | s.append(" " * (indentLevel-1))
722 | s.append(text)
723 | if prettyPrint:
724 | s.append("\n")
725 | return ''.join(s)
726 |
727 | #Soup methods
728 |
729 | def find(self, name=None, attrs={}, recursive=True, text=None,
730 | **kwargs):
731 | """Return only the first child of this Tag matching the given
732 | criteria."""
733 | r = None
734 | l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
735 | if l:
736 | r = l[0]
737 | return r
738 | findChild = find
739 |
740 | def findAll(self, name=None, attrs={}, recursive=True, text=None,
741 | limit=None, **kwargs):
742 | """Extracts a list of Tag objects that match the given
743 | criteria. You can specify the name of the Tag and any
744 | attributes you want the Tag to have.
745 |
746 | The value of a key-value pair in the 'attrs' map can be a
747 | string, a list of strings, a regular expression object, or a
748 | callable that takes a string and returns whether or not the
749 | string matches for some custom definition of 'matches'. The
750 | same is true of the tag name."""
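        # For example (illustrative values):
        #   soup.findAll('a', href=re.compile('^http'))
        # returns every <a> tag whose href attribute starts with "http", while
        #   soup.findAll(text='foo')
        # returns matching text nodes instead of tags.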
751 | generator = self.recursiveChildGenerator
752 | if not recursive:
753 | generator = self.childGenerator
754 | return self._findAll(name, attrs, text, limit, generator, **kwargs)
755 | findChildren = findAll
756 |
757 | # Pre-3.x compatibility methods
758 | first = find
759 | fetch = findAll
760 |
761 | def fetchText(self, text=None, recursive=True, limit=None):
762 | return self.findAll(text=text, recursive=recursive, limit=limit)
763 |
764 | def firstText(self, text=None, recursive=True):
765 | return self.find(text=text, recursive=recursive)
766 |
767 | #Private methods
768 |
769 | def _getAttrMap(self):
770 | """Initializes a map representation of this tag's attributes,
771 | if not already initialized."""
772 | if not getattr(self, 'attrMap'):
773 | self.attrMap = {}
774 | for (key, value) in self.attrs:
775 | self.attrMap[key] = value
776 | return self.attrMap
777 |
778 | #Generator methods
779 | def childGenerator(self):
780 | for i in range(0, len(self.contents)):
781 | yield self.contents[i]
782 | raise StopIteration
783 |
784 | def recursiveChildGenerator(self):
785 | stack = [(self, 0)]
786 | while stack:
787 | tag, start = stack.pop()
788 | if isinstance(tag, Tag):
789 | for i in range(start, len(tag.contents)):
790 | a = tag.contents[i]
791 | yield a
792 | if isinstance(a, Tag) and tag.contents:
793 | if i < len(tag.contents) - 1:
794 | stack.append((tag, i+1))
795 | stack.append((a, 0))
796 | break
797 | raise StopIteration
798 |
799 | # Next, a couple classes to represent queries and their results.
800 | class SoupStrainer:
801 | """Encapsulates a number of ways of matching a markup element (tag or
802 | text)."""
803 |
804 | def __init__(self, name=None, attrs={}, text=None, **kwargs):
805 | self.name = name
806 | if isString(attrs):
807 | kwargs['class'] = attrs
808 | attrs = None
809 | if kwargs:
810 | if attrs:
811 | attrs = attrs.copy()
812 | attrs.update(kwargs)
813 | else:
814 | attrs = kwargs
815 | self.attrs = attrs
816 | self.text = text
817 |
818 | def __str__(self):
819 | if self.text:
820 | return self.text
821 | else:
822 | return "%s|%s" % (self.name, self.attrs)
823 |
824 | def searchTag(self, markupName=None, markupAttrs={}):
825 | found = None
826 | markup = None
827 | if isinstance(markupName, Tag):
828 | markup = markupName
829 | markupAttrs = markup
830 | callFunctionWithTagData = callable(self.name) \
831 | and not isinstance(markupName, Tag)
832 |
833 | if (not self.name) \
834 | or callFunctionWithTagData \
835 | or (markup and self._matches(markup, self.name)) \
836 | or (not markup and self._matches(markupName, self.name)):
837 | if callFunctionWithTagData:
838 | match = self.name(markupName, markupAttrs)
839 | else:
840 | match = True
841 | markupAttrMap = None
842 | for attr, matchAgainst in self.attrs.items():
843 | if not markupAttrMap:
844 | if hasattr(markupAttrs, 'get'):
845 | markupAttrMap = markupAttrs
846 | else:
847 | markupAttrMap = {}
848 | for k,v in markupAttrs:
849 | markupAttrMap[k] = v
850 | attrValue = markupAttrMap.get(attr)
851 | if not self._matches(attrValue, matchAgainst):
852 | match = False
853 | break
854 | if match:
855 | if markup:
856 | found = markup
857 | else:
858 | found = markupName
859 | return found
860 |
861 | def search(self, markup):
862 | #print 'looking for %s in %s' % (self, markup)
863 | found = None
864 | # If given a list of items, scan it for a text element that
865 | # matches.
866 | if isList(markup) and not isinstance(markup, Tag):
867 | for element in markup:
868 | if isinstance(element, NavigableString) \
869 | and self.search(element):
870 | found = element
871 | break
872 | # If it's a Tag, make sure its name or attributes match.
873 | # Don't bother with Tags if we're searching for text.
874 | elif isinstance(markup, Tag):
875 | if not self.text:
876 | found = self.searchTag(markup)
877 | # If it's text, make sure the text matches.
878 | elif isinstance(markup, NavigableString) or \
879 | isString(markup):
880 | if self._matches(markup, self.text):
881 | found = markup
882 | else:
883 | raise Exception, "I don't know how to match against a %s" \
884 | % markup.__class__
885 | return found
886 |
887 | def _matches(self, markup, matchAgainst):
888 | #print "Matching %s against %s" % (markup, matchAgainst)
889 | result = False
890 | if matchAgainst == True and type(matchAgainst) == types.BooleanType:
891 | result = markup != None
892 | elif callable(matchAgainst):
893 | result = matchAgainst(markup)
894 | else:
895 | #Custom match methods take the tag as an argument, but all
896 | #other ways of matching match the tag name as a string.
897 | if isinstance(markup, Tag):
898 | markup = markup.name
899 | if markup and not isString(markup):
900 | markup = unicode(markup)
901 | #Now we know that chunk is either a string, or None.
902 | if hasattr(matchAgainst, 'match'):
903 | # It's a regexp object.
904 | result = markup and matchAgainst.search(markup)
905 | elif isList(matchAgainst):
906 | result = markup in matchAgainst
907 | elif hasattr(matchAgainst, 'items'):
908 | result = markup.has_key(matchAgainst)
909 | elif matchAgainst and isString(markup):
910 | if isinstance(markup, unicode):
911 | matchAgainst = unicode(matchAgainst)
912 | else:
913 | matchAgainst = str(matchAgainst)
914 |
915 | if not result:
916 | result = matchAgainst == markup
917 | return result
918 |
919 | class ResultSet(list):
920 | """A ResultSet is just a list that keeps track of the SoupStrainer
921 | that created it."""
922 | def __init__(self, source):
923 | list.__init__([])
924 | self.source = source
925 |
926 | # Now, some helper functions.
927 |
928 | def isList(l):
929 | """Convenience method that works with all 2.x versions of Python
930 | to determine whether or not something is listlike."""
931 | return hasattr(l, '__iter__') \
932 | or (type(l) in (types.ListType, types.TupleType))
933 |
934 | def isString(s):
935 | """Convenience method that works with all 2.x versions of Python
936 | to determine whether or not something is stringlike."""
937 | try:
938 | return isinstance(s, unicode) or isinstance(s, basestring)
939 | except NameError:
940 | return isinstance(s, str)
941 |
942 | def buildTagMap(default, *args):
943 | """Turns a list of maps, lists, or scalars into a single map.
944 | Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
945 | NESTING_RESET_TAGS maps out of lists and partial maps."""
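    # For example, buildTagMap(None, ['br', 'hr'], 'input') returns
    # {'br': None, 'hr': None, 'input': None}.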
946 | built = {}
947 | for portion in args:
948 | if hasattr(portion, 'items'):
949 | #It's a map. Merge it.
950 | for k,v in portion.items():
951 | built[k] = v
952 | elif isList(portion):
953 | #It's a list. Map each item to the default.
954 | for k in portion:
955 | built[k] = default
956 | else:
957 | #It's a scalar. Map it to the default.
958 | built[portion] = default
959 | return built
960 |
961 | # Now, the parser classes.
962 |
963 | class BeautifulStoneSoup(Tag, SGMLParser):
964 |
965 | """This class contains the basic parser and search code. It defines
966 | a parser that knows nothing about tag behavior except for the
967 | following:
968 |
969 | You can't close a tag without closing all the tags it encloses.
 970 |     That is, "<foo><bar></foo>" actually means "<foo><bar></bar></foo>".
1027 |         <br/> (No space between name of closing tag and tag close)
1028 |         <! --Comment--> (Extraneous whitespace in declaration)
1029 |
1030 | You can pass in a custom list of (RE object, replace method)
1031 | tuples to get Beautiful Soup to scrub your input the way you
1032 | want."""
1033 |
1034 | self.parseOnlyThese = parseOnlyThese
1035 | self.fromEncoding = fromEncoding
1036 | self.smartQuotesTo = smartQuotesTo
1037 | self.convertEntities = convertEntities
1038 | # Set the rules for how we'll deal with the entities we
1039 | # encounter
1040 | if self.convertEntities:
1041 | # It doesn't make sense to convert encoded characters to
1042 | # entities even while you're converting entities to Unicode.
1043 | # Just convert it all to Unicode.
1044 | self.smartQuotesTo = None
1045 | if convertEntities == self.HTML_ENTITIES:
1046 | self.convertXMLEntities = False
1047 | self.convertHTMLEntities = True
1048 | self.escapeUnrecognizedEntities = True
1049 | elif convertEntities == self.XHTML_ENTITIES:
1050 | self.convertXMLEntities = True
1051 | self.convertHTMLEntities = True
1052 | self.escapeUnrecognizedEntities = False
1053 | elif convertEntities == self.XML_ENTITIES:
1054 | self.convertXMLEntities = True
1055 | self.convertHTMLEntities = False
1056 | self.escapeUnrecognizedEntities = False
1057 | else:
1058 | self.convertXMLEntities = False
1059 | self.convertHTMLEntities = False
1060 | self.escapeUnrecognizedEntities = False
1061 |
1062 | self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1063 | SGMLParser.__init__(self)
1064 |
1065 | if hasattr(markup, 'read'): # It's a file-type object.
1066 | markup = markup.read()
1067 | self.markup = markup
1068 | self.markupMassage = markupMassage
1069 | try:
1070 | self._feed()
1071 | except StopParsing:
1072 | pass
1073 | self.markup = None # The markup can now be GCed
1074 |
1075 | def convert_charref(self, name):
1076 | """This method fixes a bug in Python's SGMLParser."""
1077 | try:
1078 | n = int(name)
1079 | except ValueError:
1080 | return
1081 | if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1082 | return
1083 | return self.convert_codepoint(n)
1084 |
1085 | def _feed(self, inDocumentEncoding=None):
1086 | # Convert the document to Unicode.
1087 | markup = self.markup
1088 | if isinstance(markup, unicode):
1089 | if not hasattr(self, 'originalEncoding'):
1090 | self.originalEncoding = None
1091 | else:
1092 | dammit = UnicodeDammit\
1093 | (markup, [self.fromEncoding, inDocumentEncoding],
1094 | smartQuotesTo=self.smartQuotesTo)
1095 | markup = dammit.unicode
1096 | self.originalEncoding = dammit.originalEncoding
1097 | if markup:
1098 | if self.markupMassage:
1099 | if not isList(self.markupMassage):
1100 | self.markupMassage = self.MARKUP_MASSAGE
1101 | for fix, m in self.markupMassage:
1102 | markup = fix.sub(m, markup)
1103 | # TODO: We get rid of markupMassage so that the
1104 | # soup object can be deepcopied later on. Some
1105 | # Python installations can't copy regexes. If anyone
1106 | # was relying on the existence of markupMassage, this
1107 | # might cause problems.
1108 | del(self.markupMassage)
1109 | self.reset()
1110 |
1111 | SGMLParser.feed(self, markup)
1112 | # Close out any unfinished strings and close all the open tags.
1113 | self.endData()
1114 | while self.currentTag.name != self.ROOT_TAG_NAME:
1115 | self.popTag()
1116 |
1117 | def __getattr__(self, methodName):
1118 | """This method routes method call requests to either the SGMLParser
1119 | superclass or the Tag superclass, depending on the method name."""
1120 | #print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1121 |
1122 | if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1123 | or methodName.find('do_') == 0:
1124 | return SGMLParser.__getattr__(self, methodName)
1125 | elif methodName.find('__') != 0:
1126 | return Tag.__getattr__(self, methodName)
1127 | else:
1128 | raise AttributeError
1129 |
1130 | def isSelfClosingTag(self, name):
1131 | """Returns true iff the given string is the name of a
1132 | self-closing tag according to this parser."""
1133 | return self.SELF_CLOSING_TAGS.has_key(name) \
1134 | or self.instanceSelfClosingTags.has_key(name)
1135 |
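# Editor's sketch (not in the original source): SELF_CLOSING_TAGS is the
# class-level table, while instanceSelfClosingTags is built from the
# constructor's selfClosingTags argument, letting a caller declare
# self-closing tags for one document only:
#
#   soup = BeautifulStoneSoup('<thing><tag/>text</thing>',
#                             selfClosingTags=['tag'])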
1136 | def reset(self):
1137 | Tag.__init__(self, self, self.ROOT_TAG_NAME)
1138 | self.hidden = 1
1139 | SGMLParser.reset(self)
1140 | self.currentData = []
1141 | self.currentTag = None
1142 | self.tagStack = []
1143 | self.quoteStack = []
1144 | self.pushTag(self)
1145 |
1146 | def popTag(self):
1147 | tag = self.tagStack.pop()
1148 | # Tags with just one string-owning child get the child as a
1149 | # 'string' property, so that soup.tag.string is shorthand for
1150 | # soup.tag.contents[0]
1151 | if len(self.currentTag.contents) == 1 and \
1152 | isinstance(self.currentTag.contents[0], NavigableString):
1153 | self.currentTag.string = self.currentTag.contents[0]
1154 |
1155 | #print "Pop", tag.name
1156 | if self.tagStack:
1157 | self.currentTag = self.tagStack[-1]
1158 | return self.currentTag
1159 |
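# Editor's note (not in the original source): the shorthand set above
# means that for markup like <b>Hello</b>, tag.string and tag.contents[0]
# are the same NavigableString u'Hello'; popTag only creates the shortcut
# when the tag holds exactly one string child.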
1160 | def pushTag(self, tag):
1161 | #print "Push", tag.name
1162 | if self.currentTag:
1163 | self.currentTag.contents.append(tag)
1164 | self.tagStack.append(tag)
1165 | self.currentTag = self.tagStack[-1]
1166 |
1167 | def endData(self, containerClass=NavigableString):
1168 | if self.currentData:
1169 | currentData = ''.join(self.currentData)
1170 | if not currentData.translate(self.STRIP_ASCII_SPACES):
1171 | if '\n' in currentData:
1172 | currentData = '\n'
1173 | else:
1174 | currentData = ' '
1175 | self.currentData = []
1176 | if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1177 | (not self.parseOnlyThese.text or \
1178 | not self.parseOnlyThese.search(currentData)):
1179 | return
1180 | o = containerClass(currentData)
1181 | o.setup(self.currentTag, self.previous)
1182 | if self.previous:
1183 | self.previous.next = o
1184 | self.previous = o
1185 | self.currentTag.contents.append(o)
1186 |
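# Editor's sketch (not in the original source): the translate() test above
# detects strings made up entirely of ASCII whitespace, so indentation
# between tags is collapsed instead of cluttering the tree:
#
#   '  \n  '  -> '\n'    (whitespace run containing a newline)
#   '   '     -> ' '     (whitespace run without a newline)
#   ' a '     -> ' a '   (contains non-whitespace: kept verbatim)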
1187 |
1188 | def _popToTag(self, name, inclusivePop=True):
1189 | """Pops the tag stack up to and including the most recent
1190 | instance of the given tag. If inclusivePop is false, pops the tag
1191 | stack up to but *not* including the most recent instance of
1192 | the given tag."""
1193 | #print "Popping to %s" % name
1194 | if name == self.ROOT_TAG_NAME:
1195 | return
1196 |
1197 | numPops = 0
1198 | mostRecentTag = None
1199 | for i in range(len(self.tagStack)-1, 0, -1):
1200 | if name == self.tagStack[i].name:
1201 | numPops = len(self.tagStack)-i
1202 | break
1203 | if not inclusivePop:
1204 | numPops = numPops - 1
1205 |
1206 | for i in range(0, numPops):
1207 | mostRecentTag = self.popTag()
1208 | return mostRecentTag
1209 |
1210 | def _smartPop(self, name):
1211 |
1212 | """We need to pop up to the previous tag of this type, unless
1213 | one of this tag's nesting reset triggers comes between this
1214 | tag and the previous tag of this type, OR unless this tag is a
1215 | generic nesting trigger and another generic nesting trigger
1216 | comes between this tag and the previous tag of this type.
1217 |
1218 | Examples:
1219 | <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1220 | <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1221 | <p>Foo<table><tr>Bar *<p>* should pop to 'table', not 'p'.
1222 |
1223 | <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1224 | <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1225 | <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1226 | """
1227 |
1228 | nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1229 | isNestable = nestingResetTriggers != None
1230 | isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1231 | popTo = None
1232 | inclusive = True
1233 | for i in range(len(self.tagStack)-1, 0, -1):
1234 | p = self.tagStack[i]
1235 | if (not p or p.name == name) and not isNestable:
1236 | #Non-nestable tags get popped to the top or to their
1237 | #last occurrence.
1238 | popTo = name
1239 | break
1240 | if (nestingResetTriggers != None
1241 | and p.name in nestingResetTriggers) \
1242 | or (nestingResetTriggers == None and isResetNesting
1243 | and self.RESET_NESTING_TAGS.has_key(p.name)):
1244 |
1245 | #If we encounter one of the nesting reset triggers
1246 | #peculiar to this tag, or we encounter another tag
1247 | #that causes nesting to reset, pop up to but not
1248 | #including that tag.
1249 | popTo = p.name
1250 | inclusive = False
1251 | break
1252 | p = p.parent
1253 | if popTo:
1254 | self._popToTag(popTo, inclusive)
1255 |
1256 | def unknown_starttag(self, name, attrs, selfClosing=0):
1257 | #print "Start tag %s: %s" % (name, attrs)
1258 | if self.quoteStack:
1259 | #This is not a real tag.
1260 | #print "<%s> is not real!" % name
1261 | attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1262 | self.handle_data('<%s%s>' % (name, attrs))
1263 | return
1264 | self.endData()
1265 |
1266 | if not self.isSelfClosingTag(name) and not selfClosing:
1267 | self._smartPop(name)
1268 |
1269 | if self.parseOnlyThese and len(self.tagStack) <= 1 \
1270 | and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1271 | return
1272 |
1273 | tag = Tag(self, name, attrs, self.currentTag, self.previous)
1274 | if self.previous:
1275 | self.previous.next = tag
1276 | self.previous = tag
1277 | self.pushTag(tag)
1278 | if selfClosing or self.isSelfClosingTag(name):
1279 | self.popTag()
1280 | if name in self.QUOTE_TAGS:
1281 | #print "Beginning quote (%s)" % name
1282 | self.quoteStack.append(name)
1283 | self.literal = 1
1284 | return tag
1285 |
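# Editor's note (illustration, not in the original source): QUOTE_TAGS is
# empty for BeautifulStoneSoup; the HTML subclass below lists tags such as
# <script> and <textarea> whose contents are literal text. While the
# quoteStack is non-empty, unknown_starttag above re-emits would-be tags
# as character data, so in
#
#   <script>if (a < b) { document.write("<b>hi</b>"); }</script>
#
# the inner <b> never becomes a Tag object.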
1286 | def unknown_endtag(self, name):
1287 | #print "End tag %s" % name
1288 | if self.quoteStack and self.quoteStack[-1] != name:
1289 | #This is not a real end tag.
1290 | #print "</%s> is not real!" % name
1291 | self.handle_data('</%s>' % name)
1292 | return
1293 | self.endData()
1294 | self._popToTag(name)
1295 | if self.quoteStack and self.quoteStack[-1] == name:
1296 | self.quoteStack.pop()
1297 | self.literal = (len(self.quoteStack) > 0)
1298 |
1299 | def handle_data(self, data):
1300 | self.currentData.append(data)
1301 |
1302 | def _toStringSubclass(self, text, subclass):
1303 | """Adds a certain piece of text to the tree as a NavigableString
1304 | subclass."""
1305 | self.endData()
1306 | self.handle_data(text)
1307 | self.endData(subclass)
1308 |
1309 | def handle_pi(self, text):
1310 | """Handle a processing instruction as a ProcessingInstruction
1311 | object, possibly one with a %SOUP-ENCODING% slot into which an
1312 | encoding will be plugged later."""
1313 | if text[:3] == "xml":
1314 | text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1315 | self._toStringSubclass(text, ProcessingInstruction)
1316 |
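# Editor's sketch (not in the original source): an XML declaration such as
#   <?xml version='1.0' encoding='utf-8'?>
# is stored with the %SOUP-ENCODING% placeholder so that, when the tree is
# later rendered out in some encoding, the declaration can advertise the
# output encoding rather than the original one.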
1317 | def handle_comment(self, text):
1318 | "Handle comments as Comment objects."
1319 | self._toStringSubclass(text, Comment)
1320 |
1321 | def handle_charref(self, ref):
1322 | "Handle character references as data."
1323 | if self.convertEntities:
1324 | data = unichr(int(ref))
1325 | else:
1326 | data = '&#%s;' % ref
1327 | self.handle_data(data)
1328 |
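# Editor's sketch (not in the original source):
#
#   &#233;  with a convertEntities mode  -> u'\xe9' (e with acute accent)
#   &#233;  without convertEntities      -> the literal text '&#233;'
#
# Only decimal references arrive here; the Python 2 sgmllib tokenizer this
# code targets does not match hexadecimal forms like &#xE9;, which is why
# a bare int(ref) is safe above.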
1329 | def handle_entityref(self, ref):
1330 | """Handle entity references as data, possibly converting known
1331 | HTML and/or XML entity references to the corresponding Unicode
1332 | characters."""
1333 | data = None
1334 | if self.convertHTMLEntities:
1335 | try:
1336 | data = unichr(name2codepoint[ref])
1337 | except KeyError:
1338 | pass
1339 |
1340 | if not data and self.convertXMLEntities:
1341 | data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1342 |
1343 | if not data and self.convertHTMLEntities and \
1344 | not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1345 | # TODO: We've got a problem here. We're told this is
1346 | # an entity reference, but it's not an XML entity
1347 | # reference or an HTML entity reference. Nonetheless,
1348 | # the logical thing to do is to pass it through as an
1349 | # unrecognized entity reference.
1350 | #
1351 | # Except: when the input is "&carol;" this function
1352 | # will be called with input "carol". When the input is
1353 | # "AT&T", this function will be called with input
1354 | # "T". We have no way of knowing whether a semicolon
1355 | # was present originally, so we don't know whether
1356 | # this is an unknown entity or just a misplaced
1357 | # ampersand.
1358 | #
1359 | # The more common case is a misplaced ampersand, so I
1360 | # escape the ampersand and omit the trailing semicolon.
1360 | data = "&amp;%s" % ref
1362 | if not data:
1363 | # This case is different from the one above, because we
1364 | # haven't already gone through a supposedly comprehensive
1365 | # mapping of entities to Unicode characters. We might not
1366 | # have gone through any mapping at all. So the chances are
1367 | # very high that this is a real entity, and not a
1368 | # misplaced ampersand.
1369 | data = "&%s;" % ref
1370 | self.handle_data(data)
1371 |
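# Editor's sketch (not in the original source), assuming an
# HTML-entity-converting soup:
#
#   '&quot;'  -> u'"'        (known HTML entity: decoded)
#   'AT&T'    -> 'AT&amp;T'  ('T' is no entity, so the bare ampersand is
#                             escaped by the branch above)
#
# With no convertEntities mode at all, every reference falls through to
# the final branch and is passed along verbatim as '&name;'.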
1372 | def handle_decl(self, data):
1373 | "Handle DOCTYPEs and the like as Declaration objects."
1374 | self._toStringSubclass(data, Declaration)
1375 |
1376 | def parse_declaration(self, i):
1377 | """Treat a bogus SGML declaration as raw data. Treat a CDATA
1378 | declaration as a CData object."""
1379 | j = None
1380 | if self.rawdata[i:i+9] == '<![CDATA[':
1381 | k = self.rawdata.find(']]>', i)
1382 | if k == -1:
1383 | k = len(self.rawdata)
1384 | data = self.rawdata[i+9:k]
1385 | j = k+3
1386 | self._toStringSubclass(data, CData)
1387 | else:
1388 | try:
1389 | j = SGMLParser.parse_declaration(self, i)
1390 | except SGMLParseError:
1391 | toHandle = self.rawdata[i:]
1392 | self.handle_data(toHandle)
1393 | j = i + len(toHandle)
1394 | return j
1395 |
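# Editor's sketch (not in the original source): given
#
#   <![CDATA[x < y && y > z]]>
#
# the CDATA branch above stores u'x < y && y > z' as a CData node (a
# NavigableString subclass), while a declaration SGMLParser chokes on is
# downgraded to plain character data instead of raising SGMLParseError.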
1396 | class BeautifulSoup(BeautifulStoneSoup):
1397 |
1398 | """This parser knows the following facts about HTML:
1399 |
1400 | * Some tags have no closing tag and should be interpreted as being
1401 | closed as soon as they are encountered.
1402 |
1403 | * The text inside some tags (i.e. 'script') may contain tags which
1404 | are not really part of the document and which should be parsed
1405 | as text, not tags. If you want to parse the text as tags, you can
1406 | always fetch it and parse it explicitly.
1407 |
1408 | * Tag nesting rules:
1409 |
1410 | Most tags can't be nested at all. For instance, the occurrence of
1411 | a <p> tag should implicitly close the previous <p> tag.
1412 |
1413 | <p>Para1<p>Para2
1414 | should be transformed into:
1415 | <p>Para1</p><p>Para2
1416 |
1417 | Some tags can be nested arbitrarily. For instance, the occurrence
1418 | of a <blockquote> tag should _not_ implicitly close the previous
1419 | <blockquote> tag.
1420 |
1421 | Alice said: <blockquote>Bob said: <blockquote>Blah
1422 | should NOT be transformed into:
1423 | Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1424 |
1425 | Some tags can be nested, but the nesting is reset by the
1426 | interposition of other tags. For instance, a <tr> tag should