├── README.md
├── googlesearch
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── googlespider.py
└── scrapy.cfg

/README.md:
--------------------------------------------------------------------------------
## Googlesearch
Scrape Google advanced search results with Scrapy, bootstrapped from the given queries. This spider can be used to
collect HTML pages to prepare a corpus.

## Usage
`scrapy crawl googlesearch -a queries='xxx' -a region='xxx'`

Replace each 'xxx' with the keywords you want to search for on Google and the region (e.g. `ie` for Ireland) you wish
to limit the results to.
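Results can be written out with Scrapy's standard feed exports, e.g.
`scrapy crawl googlesearch -a queries='hotel' -a region='ie' -o results.jl`. The `-o` flag and JSON-lines output are
stock Scrapy; `results.jl` is just an example filename.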
--------------------------------------------------------------------------------
/googlesearch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tpeng/googlesearch/9defbcffdea314181aad6952ac4c25506a2a1aea/googlesearch/__init__.py
--------------------------------------------------------------------------------
/googlesearch/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class GoogleSearchItem(Item):
    name = Field()      # anchor text of the search result
    region = Field()    # country code derived from the result URL
    url = Field()       # result URL extracted from Google's redirect link
    html = Field()      # page body (only set when download_html is enabled)
    query = Field()     # the query that produced this result
    crawled = Field()   # UTC timestamp of the crawl
--------------------------------------------------------------------------------
/googlesearch/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#

class ScrapyGoogleSpiderPipeline:
    # A pass-through pipeline: items are forwarded unchanged.
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/googlesearch/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for the googlesearch project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'googlesearch'

SPIDER_MODULES = ['googlesearch.spiders']
NEWSPIDER_MODULE = 'googlesearch.spiders'

# ITEM_PIPELINES must be a dict mapping pipeline paths to order values.
ITEM_PIPELINES = {'googlesearch.pipelines.ScrapyGoogleSpiderPipeline': 300}

# Stop the crawl after 500 items have been scraped.
CLOSESPIDER_ITEMCOUNT = 500
--------------------------------------------------------------------------------
/googlesearch/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/googlesearch/spiders/googlespider.py:
--------------------------------------------------------------------------------
from urllib.parse import urljoin, urlparse, parse_qsl
import datetime

from scrapy import Request, Spider
from scrapy.utils.response import get_base_url
from scrapy.utils.misc import arg_to_iter

from googlesearch.items import GoogleSearchItem

# Map a region argument to the value Google expects in its `cr` parameter.
COUNTRIES = {
    'ie': 'countryIE',
    'nl': 'countryNL',
}


class GoogleSearchSpider(Spider):
    """A spider to parse Google advanced search results, bootstrapped from
    the given queries."""
    name = 'googlesearch'
    queries = ('contact us', 'hotel')
    region = 'ie'
    download_delay = 5
    base_url_fmt = ('http://www.google.{region}/search?hl=en&as_q=&as_epq={query}'
                    '&as_oq=&as_eq=&as_nlo=&as_nhi=&lr=&cr={country}&as_qdr=all'
                    '&as_sitesearch=&as_occt=any&safe=images&tbs=&as_filetype=&as_rights=')
    download_html = False   # when true, fetch each result page and store its HTML
    limit_country = False   # when true, restrict results with Google's `cr` parameter

    def start_requests(self):
        for query in arg_to_iter(self.queries):
            url = self.make_google_search_request(COUNTRIES[self.region], query)
            yield Request(url=url, meta={'query': query})

    def make_google_search_request(self, country, query):
        if not self.limit_country:
            country = ''
        return self.base_url_fmt.format(country=country, region=self.region,
                                        query='+'.join(query.split()).strip('+'))

    def parse(self, response):
        for sel in response.xpath('//div[@id="ires"]//li[@class="g"]//h3[@class="r"]'):
            name = ''.join(sel.xpath('.//text()').getall())
            url = _parse_url(sel.xpath('.//a/@href').get(''))
            if url:
                if self.download_html:
                    yield Request(url=url, callback=self.parse_item,
                                  meta={'name': name, 'query': response.meta['query']})
                else:
                    yield GoogleSearchItem(url=url, name=name, region=_get_region(url),
                                           query=response.meta['query'],
                                           crawled=datetime.datetime.utcnow().isoformat())

        next_page = response.xpath('//table[@id="nav"]//td[contains(@class, "b") and position() = last()]/a')
        if next_page:
            url = self._build_absolute_url(response, next_page.xpath('.//@href').get())
            yield Request(url=url, callback=self.parse, meta={'query': response.meta['query']})

    def parse_item(self, response):
        name = response.meta['name']
        query = response.meta['query']
        url = response.url
        html = response.body[:1024 * 256]  # cap the stored HTML at 256 KiB
        timestamp = datetime.datetime.utcnow().isoformat()
        yield GoogleSearchItem({'name': name,
                                'url': url,
                                'html': html,
                                'region': self.region,
                                'query': query,
                                'crawled': timestamp})

    def _build_absolute_url(self, response, url):
        return urljoin(get_base_url(response), url)


def _parse_url(href):
    """
    Parse the target website out of a Google redirect href.

    For example:

    >>> _parse_url('/url?q=http://www.getmore.com.hk/page18.php&sa=U&ei=Xmd_UdqBEtGy4AO254GIDg&ved=0CDQQFjAGODw&usg=AFQjCNH08dgfL10dJVCyQjfu_1DEyhiMHQ')
    'http://www.getmore.com.hk/page18.php'
    """
    queries = dict(parse_qsl(urlparse(href).query))
    return queries.get('q', '')
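Besides the `scrapy crawl` command from the README, the spider can be driven from a plain Python script. Below is a
minimal sketch using Scrapy's stock `CrawlerProcess`; the script name and example arguments are assumptions, and it
must be run from the project root so `get_project_settings()` can find scrapy.cfg:

```python
# run_spider.py -- a hypothetical script for driving the spider without the CLI.
# Assumes it runs from the project root, where scrapy.cfg and the
# `googlesearch` package live.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from googlesearch.spiders.googlespider import GoogleSearchSpider

process = CrawlerProcess(get_project_settings())
# Keyword arguments here mirror `-a` options on the command line.
process.crawl(GoogleSearchSpider, queries='contact us', region='ie')
process.start()  # blocks until the crawl finishes
```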
--------------------------------------------------------------------------------
/googlesearch/spiders/googlespider.py (continued):
--------------------------------------------------------------------------------
def _get_region(url):
    """
    Get the country code from the URL's top-level domain.

    >>> _get_region('http://scrapinghub.ie')
    'ie'
    >>> _get_region('http://www.astoncarpets.ie/contact.htm')
    'ie'
    """
    netloc = urlparse(url)[1]
    return netloc.rpartition('.')[-1]
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = googlesearch.settings

[deploy]
#url = http://localhost:6800/
project = googlesearch
--------------------------------------------------------------------------------
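The shipped pipeline passes items through unchanged. When `download_html` is enabled and the goal is the corpus the
README mentions, a pipeline along the following lines could persist each page to disk. This is a sketch, not part of
the project: the class name, `CORPUS_DIR`, and the hash-based file naming are all assumptions.

```python
# A hypothetical corpus-writing pipeline (not part of this project).
# To try it, add it to pipelines.py and register it in ITEM_PIPELINES,
# e.g. {'googlesearch.pipelines.CorpusWriterPipeline': 400}.
import hashlib
import os


class CorpusWriterPipeline:
    CORPUS_DIR = 'corpus'  # assumed output directory, adjust as needed

    def open_spider(self, spider):
        os.makedirs(self.CORPUS_DIR, exist_ok=True)

    def process_item(self, item, spider):
        html = item.get('html')
        if html:
            # Name files by a hash of the URL to avoid unsafe characters.
            digest = hashlib.sha1(item['url'].encode('utf-8')).hexdigest()
            path = os.path.join(self.CORPUS_DIR, digest + '.html')
            with open(path, 'wb') as f:
                f.write(html)  # response.body slice, already bytes
        return item
```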