├── README.md
├── googlesearch
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── googlespider.py
└── scrapy.cfg

/README.md:
--------------------------------------------------------------------------------
## Googlesearch
Scrape Google advanced search results with Scrapy, bootstrapped from the given queries. This spider can be used to
collect HTML pages to prepare a corpus.

## Usage
`scrapy crawl googlesearch -a queries='xxx' -a region='xxx'`

Replace each 'xxx' with the keywords you want to search for on Google and the region (e.g. `ie` for Ireland) you wish
to limit the results to.
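Results can be written out with Scrapy's standard feed exports, e.g.
`scrapy crawl googlesearch -a queries='hotel' -a region='ie' -o results.jl`. The `-o` flag and JSON-lines output are
stock Scrapy; `results.jl` is just an example filename.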
--------------------------------------------------------------------------------
/googlesearch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpeng/googlesearch/9defbcffdea314181aad6952ac4c25506a2a1aea/googlesearch/__init__.py
--------------------------------------------------------------------------------
/googlesearch/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class GoogleSearchItem(Item):
    name = Field()      # anchor text of the search result
    region = Field()    # country code derived from the result URL
    url = Field()       # result URL extracted from Google's redirect link
    html = Field()      # page body (only set when download_html is enabled)
    query = Field()     # the query that produced this result
    crawled = Field()   # UTC timestamp of the crawl
--------------------------------------------------------------------------------
/googlesearch/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#

class ScrapyGoogleSpiderPipeline:
    # A pass-through pipeline: items are forwarded unchanged.
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/googlesearch/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for the googlesearch project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'googlesearch'

SPIDER_MODULES = ['googlesearch.spiders']
NEWSPIDER_MODULE = 'googlesearch.spiders'

# ITEM_PIPELINES must be a dict mapping pipeline paths to order values.
ITEM_PIPELINES = {'googlesearch.pipelines.ScrapyGoogleSpiderPipeline': 300}

# Stop the crawl after 500 items have been scraped.
CLOSESPIDER_ITEMCOUNT = 500
--------------------------------------------------------------------------------
/googlesearch/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/googlesearch/spiders/googlespider.py:
--------------------------------------------------------------------------------
from urllib.parse import urljoin, urlparse, parse_qsl
import datetime

from scrapy import Request, Spider
from scrapy.utils.response import get_base_url
from scrapy.utils.misc import arg_to_iter

from googlesearch.items import GoogleSearchItem

# Map a region argument to the value Google expects in its `cr` parameter.
COUNTRIES = {
    'ie': 'countryIE',
    'nl': 'countryNL',
}


class GoogleSearchSpider(Spider):
    """A spider to parse Google advanced search results, bootstrapped from
    the given queries."""
    name = 'googlesearch'
    queries = ('contact us', 'hotel')
    region = 'ie'
    download_delay = 5
    base_url_fmt = ('http://www.google.{region}/search?hl=en&as_q=&as_epq={query}'
                    '&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr={country}&as_qdr=all'
                    '&as_sitesearch=&as_occt=any&safe=images&tbs=&as_filetype=&as_rights=')
    download_html = False   # when true, fetch each result page and store its HTML
    limit_country = False   # when true, restrict results with Google's `cr` parameter

    def start_requests(self):
        for query in arg_to_iter(self.queries):
            url = self.make_google_search_request(COUNTRIES[self.region], query)
            yield Request(url=url, meta={'query': query})

    def make_google_search_request(self, country, query):
        if not self.limit_country:
            country = ''
        return self.base_url_fmt.format(country=country, region=self.region,
                                        query='+'.join(query.split()).strip('+'))

    def parse(self, response):
        for sel in response.xpath('//div[@id="ires"]//li[@class="g"]//h3[@class="r"]'):
            name = ''.join(sel.xpath('.//text()').getall())
            url = _parse_url(sel.xpath('.//a/@href').get(''))
            if url:
                if self.download_html:
                    yield Request(url=url, callback=self.parse_item,
                                  meta={'name': name, 'query': response.meta['query']})
                else:
                    yield GoogleSearchItem(url=url, name=name, region=_get_region(url),
                                           query=response.meta['query'],
                                           crawled=datetime.datetime.utcnow().isoformat())

        next_page = response.xpath('//table[@id="nav"]//td[contains(@class, "b") and position() = last()]/a')
        if next_page:
            url = self._build_absolute_url(response, next_page.xpath('.//@href').get())
            yield Request(url=url, callback=self.parse, meta={'query': response.meta['query']})

    def parse_item(self, response):
        name = response.meta['name']
        query = response.meta['query']
        url = response.url
        html = response.body[:1024 * 256]  # cap the stored HTML at 256 KiB
        timestamp = datetime.datetime.utcnow().isoformat()
        yield GoogleSearchItem({'name': name,
                                'url': url,
                                'html': html,
                                'region': self.region,
                                'query': query,
                                'crawled': timestamp})

    def _build_absolute_url(self, response, url):
        return urljoin(get_base_url(response), url)


def _parse_url(href):
    """
    Parse the target website out of a Google redirect href.

    For example:

    >>> _parse_url('/url?q=http://www.getmore.com.hk/page18.php&sa=U&ei=Xmd_UdqBEtGy4AO254GIDg&ved=0CDQQFjAGODw&usg=AFQjCNH08dgfL10dJVCyQjfu_1DEyhiMHQ')
    'http://www.getmore.com.hk/page18.php'
    """
    queries = dict(parse_qsl(urlparse(href).query))
    return queries.get('q', '')
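Besides the `scrapy crawl` command from the README, the spider can be driven from a plain Python script. Below is a
minimal sketch using Scrapy's stock `CrawlerProcess`; the script name and example arguments are assumptions, and it
must be run from the project root so `get_project_settings()` can find scrapy.cfg:

```python
# run_spider.py -- a hypothetical script for driving the spider without the CLI.
# Assumes it runs from the project root, where scrapy.cfg and the
# `googlesearch` package live.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from googlesearch.spiders.googlespider import GoogleSearchSpider

process = CrawlerProcess(get_project_settings())
# Keyword arguments here mirror `-a` options on the command line.
process.crawl(GoogleSearchSpider, queries='contact us', region='ie')
process.start()  # blocks until the crawl finishes
```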
--------------------------------------------------------------------------------
/googlesearch/spiders/googlespider.py (continued):
--------------------------------------------------------------------------------
def _get_region(url):
    """
    Get the country code from the URL's top-level domain.

    >>> _get_region('http://scrapinghub.ie')
    'ie'
    >>> _get_region('http://www.astoncarpets.ie/contact.htm')
    'ie'
    """
    netloc = urlparse(url)[1]
    return netloc.rpartition('.')[-1]
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = googlesearch.settings

[deploy]
#url = http://localhost:6800/
project = googlesearch
--------------------------------------------------------------------------------
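The shipped pipeline passes items through unchanged. When `download_html` is enabled and the goal is the corpus the
README mentions, a pipeline along the following lines could persist each page to disk. This is a sketch, not part of
the project: the class name, `CORPUS_DIR`, and the hash-based file naming are all assumptions.

```python
# A hypothetical corpus-writing pipeline (not part of this project).
# To try it, add it to pipelines.py and register it in ITEM_PIPELINES,
# e.g. {'googlesearch.pipelines.CorpusWriterPipeline': 400}.
import hashlib
import os


class CorpusWriterPipeline:
    CORPUS_DIR = 'corpus'  # assumed output directory, adjust as needed

    def open_spider(self, spider):
        os.makedirs(self.CORPUS_DIR, exist_ok=True)

    def process_item(self, item, spider):
        html = item.get('html')
        if html:
            # Name files by a hash of the URL to avoid unsafe characters.
            digest = hashlib.sha1(item['url'].encode('utf-8')).hexdigest()
            path = os.path.join(self.CORPUS_DIR, digest + '.html')
            with open(path, 'wb') as f:
                f.write(html)  # response.body slice, already bytes
        return item
```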