├── .gitignore ├── README.md ├── quotes_crawler ├── README.md ├── quotes_crawler │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── toscrape-csrf-login-v1.py │ │ ├── toscrape-csrf-login-v2.py │ │ ├── toscrape-css.py │ │ ├── toscrape-infinite-scrolling.py │ │ ├── toscrape-js.py │ │ ├── toscrape-microdata.py │ │ ├── toscrape-selenium.py │ │ └── toscrape-xpath.py ├── requirements.txt └── scrapy.cfg ├── sc_custom_image ├── README.md ├── requirements.txt ├── sc_custom_image │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── demo.py ├── scrapinghub.yml └── scrapy.cfg ├── sc_scripts_demo ├── bin │ └── check_jobs.py ├── requirements.txt ├── sc_scripts_demo │ ├── __init__.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── bad_spider.py │ │ └── good_spider.py ├── scrapinghub.yml ├── scrapy.cfg └── setup.py ├── scrapy_price_monitor ├── .gitignore ├── README.md ├── bin │ └── monitor.py ├── price_monitor │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── resources │ │ └── urls.json │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── amazon.py │ │ ├── base_spider.py │ │ ├── bestbuy.py │ │ └── ebay.py │ ├── templates │ │ └── email.html │ └── utils.py ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg └── setup.py ├── splash_based_project ├── scrapy.cfg └── splash_based_project │ ├── __init__.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── quotes-js-1.py │ └── quotes-js-2.py ├── splash_crawlera_example ├── README.md ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg ├── setup.py └── splash_crawlera_example │ ├── __init__.py │ ├── scripts │ └── crawlera.lua │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── quotes-js.py ├── splash_scrapy_spm_headless_proxy_example ├── README.md ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg ├── setup.py └── splash_scrapy_spm_headless_proxy_example │ ├── __init__.py │ ├── scripts │ └── smart_proxy_manager.lua │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── quotes-js.py └── splash_smart_proxy_manager_example ├── README.md ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg ├── setup.py └── splash_smart_proxy_manager_example ├── __init__.py ├── scripts └── smart_proxy_manager.lua ├── settings.py └── spiders ├── __init__.py └── quotes-js.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build 3 | *.egg 4 | *.egg-info 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Zyte (Formerly Scrapinghub) Sample Projects 2 | 3 | This repo contains a few sample projects demonstrating capabilities 4 | of [Zyte (Formerly Scrapinghub)](https://zyte.com) technologies. 5 | 6 | There is not much to see here yet, but stay tuned, we're just getting started! 
7 | -------------------------------------------------------------------------------- /quotes_crawler/README.md: -------------------------------------------------------------------------------- 1 | # Spiders for Quotes.Toscrape.com 2 | 3 | This project contains spiders to scrape many variations of [quotes.toscrape.com](https://quotes.toscrape.com), such as: 4 | 5 | * `toscrape-css`: scrapes [quotes.toscrape.com](https://quotes.toscrape.com) using CSS selectors; 6 | * `toscrape-xpath`: scrapes [quotes.toscrape.com](https://quotes.toscrape.com) using XPath; 7 | * `toscrape-microdata`: reads the semantic markup data from [quotes.toscrape.com](https://quotes.toscrape.com) using [extruct](https://github.com/scrapinghub/extruct); 8 | * `toscrape-js`: scrapes the JavaScript-powered version of `Quotes to Scrape` ([quotes.toscrape.com/js](https://quotes.toscrape.com/js)) using [js2xml](https://github.com/scrapinghub/js2xml) to parse the data from inside the JavaScript code; 9 | * `toscrape-selenium`: scrapes the JavaScript-powered version of `Quotes to Scrape` ([quotes.toscrape.com/js](https://quotes.toscrape.com/js)) using Selenium + PhantomJS to render the page; 10 | * `toscrape-infinite-scrolling`: scrapes the infinite scrolling version ([quotes.toscrape.com/scroll](https://quotes.toscrape.com/scroll)) via AJAX API calls; 11 | * `toscrape-csrf-login-v1`: authenticates into [quotes.toscrape.com/login](https://quotes.toscrape.com/login) by manually loading the CSRF token into the request; 12 | * `toscrape-csrf-login-v2`: authenticates into [quotes.toscrape.com/login](https://quotes.toscrape.com/login) using `FormRequest.from_response()` to automatically load the CSRF token; 13 | 14 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/quotes_crawler/quotes_crawler/__init__.py -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QuotesCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class QuotesCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for quotes_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'quotes_crawler' 13 | 14 | SPIDER_MODULES = ['quotes_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'quotes_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'quotes_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'quotes_crawler.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'quotes_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'quotes_crawler.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v1.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ToScrapeCSRFLoginSpiderV1(scrapy.Spider): 5 | name = 'toscrape-csrf-login-v1' 6 | start_urls = [ 7 | 'http://quotes.toscrape.com/login' 8 | ] 9 | 10 | def parse(self, response): 11 | # Forms with CSRF verification generates a CSRF token for each request 12 | # and they require that same value in the data the client sends back. 13 | # WARNING: 14 | # This could be done automatically using FormRequest.from_response() 15 | # check toscrape-csrf-login-v2.py for reference 16 | token = response.css("input[name=csrf_token] ::attr(value)").extract_first() 17 | yield scrapy.FormRequest( 18 | self.start_urls[0], 19 | formdata={ 20 | 'csrf_token': token, 21 | 'username': 'valdir', 22 | 'password': 'abc' 23 | }, 24 | callback=self.after_login 25 | ) 26 | 27 | def after_login(self, response): 28 | authenticated = response.css('div.header-box p > a::text').extract_first() == 'Logout' 29 | for quote in response.css('div.quote'): 30 | yield { 31 | 'text': quote.css('span::text').extract_first(), 32 | 'author': quote.css('small::text').extract_first(), 33 | 'tags': quote.css('.tags a::text').extract(), 34 | 'authenticated': authenticated, 35 | } 36 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ToScrapeCSRFLoginSpiderV2(scrapy.Spider): 5 | name = 'toscrape-csrf-login-v2' 6 | start_urls = [ 7 | 'http://quotes.toscrape.com/login' 8 | ] 9 | 10 | def parse(self, response): 11 | # FormRequest.from_response automatically loads all the form data that 12 | # is in the form present in the response object. This way, we don't 13 | # have to worry about explicitly loading the CSRF token in the data we 14 | # will POST to the server. 
15 | yield scrapy.FormRequest.from_response( 16 | response, 17 | formdata={ 18 | 'username': 'any', 19 | 'password': 'doesnt matter' 20 | }, 21 | callback=self.after_login, 22 | ) 23 | 24 | def after_login(self, response): 25 | authenticated = response.css('div.header-box p > a::text').extract_first() == 'Logout' 26 | for quote in response.css('div.quote'): 27 | yield { 28 | 'text': quote.css('span::text').extract_first(), 29 | 'author': quote.css('small::text').extract_first(), 30 | 'tags': quote.css('.tags a::text').extract(), 31 | 'authenticated': authenticated, 32 | } 33 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-css.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class ToScrapeCSSSpider(scrapy.Spider): 6 | name = "toscrape-css" 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/', 9 | ] 10 | 11 | def parse(self, response): 12 | for quote in response.css("div.quote"): 13 | yield { 14 | 'text': quote.css("span.text::text").extract_first(), 15 | 'author': quote.css("small.author::text").extract_first(), 16 | 'tags': quote.css("div.tags > a.tag::text").extract() 17 | } 18 | 19 | next_page_url = response.css("li.next > a::attr(href)").extract_first() 20 | if next_page_url is not None: 21 | yield scrapy.Request(response.urljoin(next_page_url)) 22 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-infinite-scrolling.py: -------------------------------------------------------------------------------- 1 | import json 2 | import scrapy 3 | 4 | 5 | # Most AJAX based websites can be scraped by reproducing the API calls made 6 | # by the browser, as we do in this simple example that scrapes 7 | # a website paginated via infinite scrolling (quotes.toscrape.com/scroll) 8 | class ToScrapeInfiniteScrollingSpider(scrapy.Spider): 9 | name = 'toscrape-infinite-scrolling' 10 | base_url = 'http://quotes.toscrape.com/api/quotes?page=%d' 11 | start_urls = [base_url % 1] 12 | 13 | def parse(self, response): 14 | json_data = json.loads(response.text) 15 | for quote in json_data['quotes']: 16 | yield quote 17 | if json_data['has_next']: 18 | yield scrapy.Request(self.base_url % (int(json_data['page']) + 1)) 19 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-js.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import js2xml 3 | 4 | 5 | class ToScrapeJSSpider(scrapy.Spider): 6 | name = 'toscrape-js' 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/js/' 9 | ] 10 | 11 | def parse(self, response): 12 | script = response.xpath('//script[contains(., "var data =")]/text()').extract_first() 13 | sel = scrapy.Selector(_root=js2xml.parse(script)) 14 | for quote in sel.xpath('//var[@name="data"]/array/object'): 15 | yield { 16 | 'text': quote.xpath('string(./property[@name="text"])').extract_first(), 17 | 'author': quote.xpath( 18 | 'string(./property[@name="author"]//property[@name="name"])' 19 | ).extract_first(), 20 | 'tags': quote.xpath('./property[@name="tags"]//string/text()').extract(), 21 | } 22 | 23 | link_next = response.css('li.next a::attr("href")').extract_first() 24 | if link_next: 25 | yield scrapy.Request(response.urljoin(link_next)) 26 | 
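As a side note on the technique used by `toscrape-js` above: [js2xml](https://github.com/scrapinghub/js2xml) turns JavaScript source into an lxml document, so the embedded `var data = [...]` assignment can be queried with XPath. A minimal standalone sketch (assuming only that `js2xml` is installed):

    import js2xml

    js_code = 'var data = [{"text": "An example quote.", "author": {"name": "Jane Doe"}}];'
    parsed = js2xml.parse(js_code)      # lxml tree representing the JavaScript code
    print(js2xml.pretty_print(parsed))  # inspect the XML that the XPath queries run against
    # the same kind of query the spider uses:
    print(parsed.xpath('//var[@name="data"]/array/object/property[@name="text"]/string/text()'))
    # ['An example quote.']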
-------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-microdata.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from extruct.w3cmicrodata import LxmlMicrodataExtractor 3 | 4 | 5 | class ToScrapeMicrodataSpider(scrapy.Spider): 6 | name = "toscrape-microdata" 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/' 9 | ] 10 | 11 | def parse(self, response): 12 | extractor = LxmlMicrodataExtractor() 13 | items = extractor.extract(response.text, response.url)['items'] 14 | for it in items: 15 | yield it['properties'] 16 | 17 | next_page_url = response.css("li.next > a::attr(href)").extract_first() 18 | if next_page_url is not None: 19 | yield scrapy.Request(response.urljoin(next_page_url)) 20 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-selenium.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import scrapy 3 | from selenium import webdriver 4 | 5 | 6 | # this spider needs PhantomJS (http://phantomjs.org/) installed somewhere in your PATH 7 | class ToScrapeSeleniumSpider(scrapy.Spider): 8 | name = 'toscrape-selenium' 9 | start_urls = [ 10 | 'http://quotes.toscrape.com/js' 11 | ] 12 | 13 | def __init__(self, *args, **kwargs): 14 | self.driver = webdriver.PhantomJS() 15 | super(ToScrapeSeleniumSpider, self).__init__(*args, **kwargs) 16 | 17 | def parse(self, response): 18 | self.driver.get(response.url) 19 | for quote in self.driver.find_elements_by_css_selector('div.quote'): 20 | yield { 21 | 'quote': quote.find_element_by_css_selector("span.text").text, 22 | 'author': quote.find_element_by_css_selector("small.author").text, 23 | 'tags': [e.text for e in quote.find_elements_by_class_name('tag')], 24 | } 25 | # pagination links are not generated by JS code in this page 26 | next_page_url = response.css("li.next > a::attr(href)").extract_first() 27 | if next_page_url is not None: 28 | yield scrapy.Request(response.urljoin(next_page_url)) 29 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-xpath.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class ToScrapeSpiderXPath(scrapy.Spider): 6 | name = 'toscrape-xpath' 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/', 9 | ] 10 | 11 | def parse(self, response): 12 | for quote in response.xpath('//div[@class="quote"]'): 13 | yield { 14 | 'text': quote.xpath('./span[@class="text"]/text()').extract_first(), 15 | 'author': quote.xpath('.//small[@class="author"]/text()').extract_first(), 16 | 'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract() 17 | } 18 | 19 | next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first() 20 | if next_page_url is not None: 21 | yield scrapy.Request(response.urljoin(next_page_url)) 22 | -------------------------------------------------------------------------------- /quotes_crawler/requirements.txt: -------------------------------------------------------------------------------- 1 | extruct 2 | js2xml 3 | selenium 4 | -------------------------------------------------------------------------------- /quotes_crawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created 
by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = quotes_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = quotes_crawler 12 | -------------------------------------------------------------------------------- /sc_custom_image/README.md: -------------------------------------------------------------------------------- 1 | ## Scrapy Cloud Custom Image 2 | 3 | Sample Scrapy project demonstrating using PhantomJS and 4 | deploying it to Scrapy Cloud using a custom Docker image. 5 | -------------------------------------------------------------------------------- /sc_custom_image/requirements.txt: -------------------------------------------------------------------------------- 1 | selenium 2 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/sc_custom_image/sc_custom_image/__init__.py -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ScCustomImageItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ScCustomImagePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for sc_custom_image project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'sc_custom_image' 13 | 14 | SPIDER_MODULES = ['sc_custom_image.spiders'] 15 | NEWSPIDER_MODULE = 'sc_custom_image.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'sc_custom_image (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'sc_custom_image.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'sc_custom_image.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'sc_custom_image.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/spiders/demo.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from selenium import webdriver 3 | 4 | 5 | class DemoSpider(scrapy.Spider): 6 | name = 'demo' 7 | start_urls = ['http://quotes.toscrape.com/js'] 8 | 9 | def __init__(self, *args, **kwargs): 10 | # XXX: needs phantomjs binary available in PATH 11 | self.driver = webdriver.PhantomJS() 12 | super(DemoSpider, self).__init__(*args, **kwargs) 13 | 14 | def parse(self, response): 15 | self.driver.get(response.url) 16 | for quote in self.driver.find_elements_by_css_selector('div.quote'): 17 | yield { 18 | 'quote': quote.find_element_by_css_selector('span').text, 19 | 'author': quote.find_element_by_css_selector('small').text, 20 | } 21 | next_page_url = response.css('nav li.next a ::attr(href)').extract_first() 22 | if next_page_url: 23 | yield scrapy.Request(response.urljoin(next_page_url)) 24 | -------------------------------------------------------------------------------- /sc_custom_image/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: PUT_YOUR_PROJECT_ID_HERE 2 | requirements_file: ./requirements.txt 3 | image: true 4 | -------------------------------------------------------------------------------- /sc_custom_image/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sc_custom_image.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sc_custom_image 12 | -------------------------------------------------------------------------------- /sc_scripts_demo/bin/check_jobs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Simple monitor jobs checker for the last 24 hours 4 | """ 5 | 6 | from __future__ import print_function 7 | 8 | import argparse 9 | import os 10 | 11 | import boto 12 | from datetime import datetime 13 | from datetime import timedelta 14 | from scrapinghub import Project, Connection 15 | 16 | # Configure your SES credentials here 17 | AWS_ACCESS_KEY = '' 18 | AWS_SECRET_KEY = '' 19 | 20 | # Configure the Mail-from here 21 | DEFAULT_MAIL_FROM = 'Custom Notification ' 22 | 23 | 24 | def send_email(recipients, subject, body, mail_from=DEFAULT_MAIL_FROM): 25 | """Send an email using AWS Simple Email Service 26 | """ 27 | ses = boto.connect_ses(AWS_ACCESS_KEY, AWS_SECRET_KEY) 28 | ses.send_email(mail_from, subject, body, recipients) 29 | print('Email sent to %s' % ', '.join(recipients)) 30 | 31 | 32 | def parse_date(date_str): 33 | return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') 34 | 35 | 36 | def has_job_error(job): 37 | success_reason = 'no_reason' 38 | return (job.info.get('errors_count', 0) > 0 39 | or job.info.get('close_reason') != success_reason) 40 | 41 | 42 | def is_job_newer_than(job, since_time): 43 | cancelled_before_starting = ('updated_time' not in 
job.info 44 | and job.info.get('close_reason') == 'cancelled') 45 | if cancelled_before_starting: 46 | return False 47 | return since_time <= parse_date(job.info['updated_time']) 48 | 49 | 50 | def get_last_24h_jobs(apikey, project_id): 51 | """Fetch jobs that finished in the last 24 hours 52 | """ 53 | project = Project(Connection(apikey), project_id) 54 | since_time = datetime.utcnow() - timedelta(hours=24) 55 | jobs = [ 56 | job for job in project.jobs(state='finished') 57 | if is_job_newer_than(job, since_time) 58 | ] 59 | return jobs 60 | 61 | 62 | def render_report(jobs_with_error): 63 | """Build a text report for the jobs with errors 64 | """ 65 | output = [] 66 | for job in jobs_with_error: 67 | errors_count = job.info.get('errors_count', 0) 68 | close_reason = job.info.get('close_reason') 69 | 70 | job_id = job.info["id"].split('/') 71 | url = 'https://app.scrapinghub.com/p/{0}/job/{1}/{2}'.format( 72 | job_id[0], job_id[1], job_id[2]) 73 | 74 | error_message = ['Errors found for job "{0}" ({1}):'.format( 75 | job.info['spider'], url)] 76 | if errors_count > 0: 77 | error_message.append(' There were {} error{}.'.format( 78 | errors_count, '' if errors_count == 1 else 's')) 79 | 80 | success_reasons = ('no_reason', 'finished') 81 | if close_reason not in success_reasons: 82 | error_message.append(' Close reason should not be "{}".'.format( 83 | close_reason)) 84 | output.append('\n'.join(error_message)) 85 | 86 | return '\n\n'.join(output) 87 | 88 | 89 | def main(args): 90 | job_list = get_last_24h_jobs(args.apikey, args.project_id) 91 | jobs_with_errors = [job for job in job_list if has_job_error(job)] 92 | 93 | if jobs_with_errors: 94 | report = render_report(jobs_with_errors) 95 | if args.mail: 96 | subject = 'Scrapy Cloud - jobs with errors' 97 | send_email(args.mail, subject, body=report) 98 | else: 99 | print(report) 100 | else: 101 | print('No errors found.') 102 | 103 | 104 | def parse_args(): 105 | parser = argparse.ArgumentParser(description=__doc__) 106 | 107 | parser.add_argument('--apikey', default=os.getenv('SHUB_APIKEY', None), 108 | help='API key to use for scrapinghub (will fallback ' 109 | 'to SHUB_APIKEY variable)') 110 | parser.add_argument('project_id', type=int, 111 | help='Project ID to get info from.') 112 | parser.add_argument('--mail', action='append', help='Send output as email') 113 | args = parser.parse_args() 114 | 115 | if not args.apikey: 116 | parser.error('Please provide an API key with --apikey option') 117 | return args 118 | 119 | 120 | if '__main__' == __name__: 121 | main(parse_args()) 122 | -------------------------------------------------------------------------------- /sc_scripts_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapinghub 2 | -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/sc_scripts_demo/sc_scripts_demo/__init__.py -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'sc_scripts_demo' 4 | 5 | SPIDER_MODULES = ['sc_scripts_demo.spiders'] 6 | NEWSPIDER_MODULE = 'sc_scripts_demo.spiders' 7 | 8 | USER_AGENT = 
'sc_scripts_demo (http://scrapinghub.com)' 9 | 10 | ROBOTSTXT_OBEY = True 11 | -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/spiders/bad_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class SpiderWithErrors(scrapy.Spider): 6 | name = "bad" 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/', 9 | ] 10 | 11 | def parse(self, response): 12 | raise ValueError('Oops, this spider has errors') 13 | -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/spiders/good_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class NiceWorkingSpider(scrapy.Spider): 6 | name = "good" 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/', 9 | ] 10 | 11 | def parse(self, response): 12 | for quote in response.css("div.quote"): 13 | yield { 14 | 'text': quote.css("span.text::text").extract_first(), 15 | 'author': quote.css("small.author::text").extract_first(), 16 | 'tags': quote.css("div.tags > a.tag::text").extract() 17 | } 18 | 19 | next_page_url = response.css("li.next > a::attr(href)").extract_first() 20 | if next_page_url is not None: 21 | yield scrapy.Request(response.urljoin(next_page_url)) 22 | -------------------------------------------------------------------------------- /sc_scripts_demo/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | projects: 2 | default: 105217 3 | requirements_file: requirements.txt 4 | -------------------------------------------------------------------------------- /sc_scripts_demo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sc_scripts_demo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sc_scripts_demo 12 | -------------------------------------------------------------------------------- /sc_scripts_demo/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name='sc_scripts_demo', 6 | version='1.0', 7 | packages=find_packages(), 8 | scripts=[ 9 | 'bin/check_jobs.py', 10 | ], 11 | entry_points={ 12 | 'scrapy': ['settings = sc_scripts_demo.settings'], 13 | }, 14 | ) 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 
19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | .venv/ 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | .scrapy -------------------------------------------------------------------------------- /scrapy_price_monitor/README.md: -------------------------------------------------------------------------------- 1 | Scrapy Price Monitor 2 | ==================== 3 | 4 | This is a simple price monitor built with [Scrapy](https://github.com/scrapy/scrapy) 5 | and [Scrapy Cloud](https://scrapinghub.com/scrapy-cloud). 6 | 7 | It is basically a Scrapy project with one spider for each online retailer that 8 | we want to monitor prices from. In addition to the spiders, there's a Python 9 | Script that is scheduled to run periodically on Scrapy Cloud, checking whether 10 | the latest prices are the best ones in a given time span. If so, the monitor 11 | sends an email alerting you about the price drops. 12 | 13 | 14 | ## Including Products to Monitor 15 | 16 | There's a `resources/urls.json` file that lists the URLs from the products that 17 | we want to monitor. If you just want to include a new product to monitor from 18 | the already supported retailers, just add a new key for that product and add 19 | the URL list as its value, such as: 20 | 21 | { 22 | "headsetlogitech": [ 23 | "https://www.amazon.com/.../B005GTO07O/", 24 | "http://www.bestbuy.com/.../3436118.p", 25 | "http://www.ebay.com/.../110985874014" 26 | ], 27 | "NewProduct": [ 28 | "http://url.for.retailer.x", 29 | "http://url.for.retailer.y", 30 | "http://url.for.retailer.z" 31 | ] 32 | } 33 | 34 | 35 | ## Supporting Further Retailers 36 | 37 | This project currently only works with 3 online retailers, and you can list them 38 | running: 39 | 40 | $ scrapy list 41 | amazon.com 42 | bestbuy.com 43 | ebay.com 44 | 45 | If the retailer that you want to monitor is not yet supported, just create a spider 46 | to handle the product pages from it. To include a spider for samsclub.com, you 47 | could run: 48 | 49 | $ scrapy genspider samsclub.com samsclub.com 50 | 51 | And then, open the spider and add the extraction rules: 52 | 53 | $ scrapy edit samsclub.com 54 | 55 | Have a look at the current spiders and implement the new ones using the same 56 | structure, subclassing `BaseSpider` instead of `scrapy.Spider`. This way, your 57 | spiders will automatically read the URLs list from `resources/urls.json`. 
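For illustration, a new retailer spider only needs a `name` matching the retailer's domain (so that `BaseSpider` picks the right URLs from `resources/urls.json`) and a `parse` method that fills in `url`, `title` and `price`. The sketch below shows the general shape; the samsclub.com selectors are placeholders for illustration, not selectors taken from the real site:

    # price_monitor/spiders/samsclub.py -- hypothetical example
    from .base_spider import BaseSpider


    class SamsclubSpider(BaseSpider):
        name = "samsclub.com"  # must match the domain used in resources/urls.json

        def parse(self, response):
            item = response.meta.get('item', {})
            item['url'] = response.url
            # placeholder selectors -- adjust to the retailer's actual markup
            item['title'] = response.css("h1.product-title::text").extract_first("").strip()
            item['price'] = float(
                response.css("span.product-price::text").re_first(r"[\d.]+") or 0
            )
            yield item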
58 | 59 | 60 | ## Customizing the Price Monitor 61 | 62 | The price monitor sends an email using Amazon SES service, so to run it you 63 | have to set both `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` variables in 64 | `price_monitor/settings.py`. If you want to use another email service, 65 | you have to rewrite the `send_email_alert` function in 66 | `price_monitor/bin/monitor.py`. 67 | 68 | The price monitor can be further customized via parameters to the 69 | `price_monitor/bin/monitor.py` script. We will dig on those parameters 70 | later when showing how to schedule the project on Scrapy Cloud. 71 | 72 | 73 | ## Installing and Running 74 | 75 | 1. Clone this repo: 76 | 77 | $ git clone git@github.com:stummjr/scrapy_price_monitor.git 78 | 79 | 2. Enter the folder and install the project dependencies: 80 | 81 | $ cd scrapy_price_monitor 82 | $ pip install -r requirements.txt 83 | 84 | 3. Create a free forever account on Scrapy Cloud: 85 | https://app.scrapinghub.com/account/signup/. 86 | 87 | 4. Create a Scrapy project on Scrapy Cloud and copy the project id from the project URL. 88 | 89 | 5. Install [Scrapinghub command line tool (shub)](https://github.com/scrapinghub/shub): 90 | 91 | $ pip install shub 92 | 93 | 6. Authenticate using your Scrapinghub API key: 94 | 95 | $ shub login 96 | 97 | 7. Finally, deploy the local project to your Scrapy Cloud project: 98 | 99 | $ shub deploy 100 | 101 | This video also explains how to deploy a Scrapy project to Scrapy Cloud: 102 | https://youtu.be/JYch0zRmcgU 103 | 104 | 105 | ## How to Schedule on Scrapy Cloud 106 | 107 | After you have deployed the project to Scrapy Cloud, it's time to schedule its 108 | execution on Scrapy Cloud. 109 | 110 | This project has two main components: 111 | 112 | - the [**spiders**](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/price_monitor/spiders) that collect prices from the retailers' websites 113 | - the [**price monitor script**](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/bin/monitor.py) that checks whether there's a new deal in the latest prices 114 | 115 | You have to schedule both the spiders and the monitor to run periodically on 116 | Scrapy Cloud. It's a good idea to schedule all the spiders to run at the same 117 | time and schedule the monitor to run about 15 minutes after the spiders. 118 | 119 | Take a look at this video to learn how to schedule periodic jobs on Scrapy Cloud: 120 | https://youtu.be/JYch0zRmcgU?t=1m51s 121 | 122 | 123 | ### Parameters for the Monitor Script 124 | 125 | The monitor script takes these parameters and you can pass them via the parameters box in the 126 | scheduling dialog: 127 | 128 | - `--days`: how many days of data we want to compare with the scraped prices. 129 | - `--threshold`: a margin that you can set to avoid getting alerts from minor price changes. For example, if you set it to 1.0, you will only get alerts when the price drop is bigger than $1.00. 130 | - `--apikey`: your Scrapy Cloud API key. You can get it in: https://app.scrapinghub.com/account/apikey. 131 | - `--project`: the Scrapy Cloud project where the monitor is deployed (you can grab it from your project URL at Scrapy Cloud). 132 | 133 | 134 | ## Running in a Local Environment 135 | 136 | You can run this project on Scrapy Cloud or on your local environment. 
The only dependency 137 | from Scrapy Cloud is the [Collections API](https://doc.scrapinghub.com/api/collections.html), 138 | but the spiders and the monitor can be executed locally. 139 | 140 | To do that, first add your Scrapy Cloud project id to [settings.py `SHUB_PROJ_ID` variable](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/price_monitor/settings.py#L11). 141 | 142 | Then run the spiders via command line: 143 | 144 | $ scrapy crawl bestbuy.com 145 | 146 | This will run the spider named as `bestbuy.com` and store the scraped data into 147 | a Scrapy Cloud collection, under the project you set in the last step. 148 | 149 | You can also run the price monitor via command line: 150 | 151 | $ python bin/monitor.py --apikey --days 2 --threshold 1 --project 152 | -------------------------------------------------------------------------------- /scrapy_price_monitor/bin/monitor.py: -------------------------------------------------------------------------------- 1 | """Simple price monitor built with Scrapy and Scrapy Cloud 2 | """ 3 | import argparse 4 | import os 5 | from datetime import datetime, timedelta 6 | 7 | import boto 8 | from hubstorage import HubstorageClient 9 | from jinja2 import Environment, PackageLoader 10 | from price_monitor import settings 11 | from price_monitor.utils import get_product_names, get_retailers_for_product 12 | from w3lib.html import remove_tags 13 | 14 | jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates')) 15 | 16 | 17 | class DealsChecker(object): 18 | 19 | def __init__(self, latest_deals, previous_deals, price_threshold=0): 20 | self.price_threshold = price_threshold 21 | self.latest_deals = latest_deals 22 | self.previous_deals = previous_deals 23 | 24 | def is_from_latest_crawl(self, deal): 25 | """Checks whether the given deal is from the most recent execution. 26 | """ 27 | return deal in self.latest_deals 28 | 29 | def get_best_deal(self): 30 | """Returns the item with the best overall price. self.price_threshold can be set to avoid 31 | considering minor price drops. 32 | """ 33 | best_so_far = min(self.previous_deals, key=lambda x: x.get('price')) 34 | best_from_last = min(self.latest_deals, key=lambda x: x.get('price')) 35 | if best_from_last.get('price') + self.price_threshold < best_so_far.get('price'): 36 | return best_from_last 37 | else: 38 | return best_so_far 39 | 40 | 41 | class DealsFetcher(object): 42 | 43 | def __init__(self, product_name, apikey, project_id, hours): 44 | self.product_name = product_name 45 | project = HubstorageClient(apikey).get_project(project_id) 46 | self.item_store = project.collections.new_store(product_name) 47 | self.load_items_from_last_n_hours(hours) 48 | 49 | def load_items_from_last_n_hours(self, n=24): 50 | """Load items from the last n hours, from the newest to the oldest. 51 | """ 52 | since_time = int((datetime.now() - timedelta(hours=n)).timestamp() * 1000) 53 | self.deals = [item.get('value') for item in self.fetch_deals_newer_than(since_time)] 54 | 55 | def fetch_deals_newer_than(self, since_time): 56 | return list(self.item_store.get(meta=['_key', '_ts'], startts=since_time)) 57 | 58 | def get_latest_deal_from_retailer(self, retailer): 59 | """Returns the most recently extracted deal from a given retailer. 
60 | """ 61 | for deals in self.deals: 62 | if retailer in deals.get('url'): 63 | return deals 64 | 65 | def get_deals(self): 66 | """Returns a tuple with (deals from latest crawl, deals from previous crawls) 67 | """ 68 | latest_deals = [ 69 | self.get_latest_deal_from_retailer(retailer) 70 | for retailer in get_retailers_for_product(self.product_name) 71 | ] 72 | previous_deals = [ 73 | deal for deal in self.deals if deal not in latest_deals 74 | ] 75 | return latest_deals, previous_deals 76 | 77 | 78 | def send_email_alert(items): 79 | ses = boto.connect_ses(settings.AWS_ACCESS_KEY, settings.AWS_SECRET_KEY) 80 | html_body = jinja_env.get_template('email.html').render(items=items) 81 | 82 | ses.send_email( 83 | settings.EMAIL_ALERT_FROM, 84 | 'Price drop alert', 85 | remove_tags(html_body), 86 | settings.EMAIL_ALERT_TO, 87 | html_body=html_body 88 | ) 89 | 90 | 91 | def main(args): 92 | items = [] 93 | for prod_name in get_product_names(): 94 | fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) 95 | checker = DealsChecker(*fetcher.get_deals(), args.threshold) 96 | best_deal = checker.get_best_deal() 97 | if checker.is_from_latest_crawl(best_deal): 98 | items.append(best_deal) 99 | 100 | if items: 101 | send_email_alert(items) 102 | 103 | 104 | def parse_args(): 105 | parser = argparse.ArgumentParser(description=__doc__) 106 | parser.add_argument('--apikey', default=settings.SHUB_KEY or os.getenv('SHUB_KEY'), 107 | help='API key to use for scrapinghub (fallbacks to SHUB_KEY variable)') 108 | parser.add_argument('--days', type=int, default=1, 109 | help='How many days back to compare with the last price') 110 | parser.add_argument('--threshold', type=float, default=0, 111 | help='A margin to avoid raising alerts with minor price drops') 112 | parser.add_argument('--project', type=int, default=settings.SHUB_PROJ_ID, 113 | help='Project ID to get info from') 114 | 115 | return parser.parse_args() 116 | 117 | 118 | if __name__ == '__main__': 119 | main(parse_args()) 120 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/scrapy_price_monitor/price_monitor/__init__.py -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class PriceMonitorItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from price_monitor import settings 3 | from hubstorage import HubstorageClient 4 | from price_monitor.utils import reversed_timestamp, get_product_names 5 | 6 | 7 | class CollectionStoragePipeline(object): 8 | 9 | def open_spider(self, spider): 10 | client = HubstorageClient(auth=settings.SHUB_KEY) 11 | project = client.get_project(settings.SHUB_PROJ_ID) 12 | 
self.data_stores = {} 13 | for product_name in get_product_names(): 14 | self.data_stores[product_name] = project.collections.new_store(product_name) 15 | 16 | def process_item(self, item, spider): 17 | key = "{}-{}-{}".format( 18 | reversed_timestamp(), item.get('product_name'), item.get('retailer') 19 | ) 20 | self.data_stores[item['product_name']].set({'_key': key, 'value': item}) 21 | return item 22 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/resources/urls.json: -------------------------------------------------------------------------------- 1 | { 2 | "headsetlogitech": [ 3 | "https://www.amazon.com/Logitech-Wireless-Headset-Over-Design/dp/B005GTO07O/", 4 | "http://www.bestbuy.com/site/logitech-h600-wireless-headset-black/3436118.p", 5 | "http://www.ebay.com/itm/N-Logitech-Wireless-Headset-H600-Over-The-Head-Design-981-000341-/110985874014" 6 | ], 7 | "webcamlogitech": [ 8 | "https://www.amazon.com/Logitech-Widescreen-Calling-Recording-Desktop/dp/B006JH8T3S/", 9 | "http://www.bestbuy.com/site/logitech-c920-pro-webcam-black/4612476.p?skuId=4612476", 10 | "http://www.ebay.com/itm/Logitech-HD-Pro-Webcam-C920-1080p-Widescreen-Video-Calling-and-Recording-/272381890214" 11 | ], 12 | "amazonechodot": [ 13 | "https://www.amazon.com/dp/B01DFKC2SO", 14 | "http://www.bestbuy.com/site/amazon-echo-dot/5578851.p?skuId=5578851", 15 | "http://www.ebay.com/itm/Amazon-Echo-Dot-2nd-Generation-w-Alexa-Voice-Media-Device-All-New-2016-/201668562192" 16 | ], 17 | "nikoncoolpix": [ 18 | "https://www.amazon.com/Nikon-COOLPIX-B500-Digital-Camera/dp/B01C3LEE9G/", 19 | "http://www.bestbuy.com/site/nikon-coolpix-b500-16-0-megapixel-digital-camera-red/4997500.p?skuId=4997500", 20 | "http://www.ebay.com/itm/Nikon-COOLPIX-B500-Digital-Camera-Red-/162225974018" 21 | ], 22 | "bluemicrophone": [ 23 | "https://www.amazon.com/Blue-Snowball-iCE-Condenser-Microphone/dp/B014PYGTUQ/", 24 | "http://www.bestbuy.com/site/blue-microphones-snowball-usb-cardioid-and-omnidirectional-electret-condenser-vocal-microphone-black/9918056.p?skuId=9918056", 25 | "http://www.ebay.com/itm/Blue-Microphones-Snowball-Black-iCE-Condenser-Microphone-/172260373002" 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | BOT_NAME = 'price_monitor' 5 | SPIDER_MODULES = ['price_monitor.spiders'] 6 | NEWSPIDER_MODULE = 'price_monitor.spiders' 7 | 8 | ROBOTSTXT_OBEY = True 9 | 10 | SHUB_KEY = os.getenv('SHUB_KEY') 11 | # if you want to run it locally, replace '999999' with your Scrapy Cloud project ID below 12 | SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '999999').split('/')[0] 13 | 14 | 15 | # settings for Amazon SES email service 16 | AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY') 17 | AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY') 18 | EMAIL_ALERT_FROM = 'Price Monitor <SENDER_EMAIL@provider.com>' 19 | EMAIL_ALERT_TO = ['RECEIVER_EMAIL@provider.com'] 20 | 21 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 22 | ITEM_PIPELINES = { 23 | 'price_monitor.pipelines.CollectionStoragePipeline': 400, 24 | } 25 | 26 | AUTOTHROTTLE_ENABLED = True 27 | # HTTPCACHE_ENABLED = True 28 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/amazon.py: -------------------------------------------------------------------------------- 1 | from .base_spider import BaseSpider 2 | 3 | 4 | class AmazonSpider(BaseSpider): 5 | name = "amazon.com" 6 | 7 | def parse(self, response): 8 | item = response.meta.get('item', {}) 9 | item['url'] = response.url 10 | item['title'] = response.css("span#productTitle::text").extract_first("").strip() 11 | item['price'] = float( 12 | response.css("span#priceblock_ourprice::text").re_first("\$(.*)") or 0 13 | ) 14 | yield item 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/base_spider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pkgutil 3 | import scrapy 4 | from datetime import datetime 5 | 6 | 7 | class BaseSpider(scrapy.Spider): 8 | 9 | def start_requests(self): 10 | products = json.loads(pkgutil.get_data('price_monitor', 'resources/urls.json').decode()) 11 | for name, urls in products.items(): 12 | for url in urls: 13 | if self.name in url: 14 | now = datetime.now().strftime('%Y/%m/%d %H:%M:%S') 15 | item = {'product_name': name, 'retailer': self.name, 'when': now} 16 | yield scrapy.Request(url, meta={'item': item}) 17 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/bestbuy.py: -------------------------------------------------------------------------------- 1 | from .base_spider import BaseSpider 2 | 3 | 4 | class BestbuySpider(BaseSpider): 5 | name = "bestbuy.com" 6 | 7 | def parse(self, response): 8 | item = response.meta.get('item', {}) 9 | item['url'] = response.url 10 | item['title'] = response.css("div#sku-title > h1 ::text").extract_first().strip() 11 | item['price'] = float( 12 | response.css('div.price-block ::attr(data-customer-price)').extract_first(default=0) 13 | ) 14 | yield item 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/ebay.py: -------------------------------------------------------------------------------- 1 | from extruct.w3cmicrodata import MicrodataExtractor 2 | from .base_spider import BaseSpider 3 | 4 | 5 | class EbaySpider(BaseSpider): 6 | name = "ebay.com" 7 | 8 | def parse(self, response): 9 | extractor = MicrodataExtractor() 10 | properties = extractor.extract(response.body_as_unicode()).get('items')[0].get('properties', {}) 11 | item = response.meta.get('item', {}) 12 | item['url'] = response.url 13 | item['title'] = properties.get('name').replace('Details about', '').strip() 14 | item['price'] = float( 15 | properties.get('offers', {}).get('properties', {}).get('price', 0) 16 | ) 17 | yield item 18 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/templates/email.html: -------------------------------------------------------------------------------- 1 |

<h2>🎉 Hey, we found a good deal! 🎁</h2> 2 | 3 | <table> 4 | {% for item in items %} 5 | <tr> 6 | <td>Product: {{item.title}}</td> 7 | <td>Price: {{item.price}}</td> 8 | <td>Store: {{item.retailer}}</td> 9 | <td>Price obtained at: {{item.when}}</td> 10 | <td>Visit the product page at {{item.retailer}}: {{item.url}}</td> 11 | </tr> 12 | {% endfor %} 13 | </table>
14 | 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pkgutil 3 | from datetime import datetime, timedelta 4 | 5 | 6 | def timestamp_from_reversed(reversed): 7 | return datetime(5000, 1, 1) - timedelta(seconds=float(reversed)) 8 | 9 | 10 | def reversed_timestamp(): 11 | return str((datetime(5000, 1, 1) - datetime.now()).total_seconds()) 12 | 13 | 14 | def normalize_name(name): 15 | return name.replace('-', '') 16 | 17 | 18 | def get_product_names(): 19 | return [ 20 | normalize_name(name) 21 | for name in json.loads( 22 | pkgutil.get_data("price_monitor", "resources/urls.json").decode() 23 | ).keys() 24 | ] 25 | 26 | 27 | def get_retailer_name_from_url(url): 28 | return url.split("://")[1].split("/")[0].replace("www.", "") 29 | 30 | 31 | def get_retailers_for_product(product_name): 32 | data = json.loads( 33 | pkgutil.get_data("price_monitor", "resources/urls.json").decode() 34 | ) 35 | return {get_retailer_name_from_url(url) for url in data[product_name]} 36 | -------------------------------------------------------------------------------- /scrapy_price_monitor/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | boto 3 | extruct 4 | w3lib 5 | jinja2 6 | -------------------------------------------------------------------------------- /scrapy_price_monitor/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | requirements_file: requirements.txt 2 | stacks: 3 | default: scrapy:1.1-py3 4 | -------------------------------------------------------------------------------- /scrapy_price_monitor/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = price_monitor.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = price_monitor 12 | -------------------------------------------------------------------------------- /scrapy_price_monitor/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name='project', 7 | version='1.0', 8 | packages=find_packages(), 9 | package_data={'price_monitor': ['resources/*.json', 'templates/*.html']}, 10 | scripts=['bin/monitor.py'], 11 | entry_points={'scrapy': ['settings = price_monitor.settings']}, 12 | ) 13 | -------------------------------------------------------------------------------- /splash_based_project/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = splash_based_project.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = splash_based_project 12 | -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/__init__.py: -------------------------------------------------------------------------------- 
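Looping back to `price_monitor/utils.py` above, a quick worked example of how `get_retailer_name_from_url()` ties the entries of `resources/urls.json` to the per-retailer spiders (the helper is copied here only so the snippet is self-contained; the URL is one of the entries from `urls.json`):

```python
# Re-implementation of get_retailer_name_from_url from price_monitor/utils.py,
# copied here only so the example runs standalone.
def get_retailer_name_from_url(url):
    return url.split("://")[1].split("/")[0].replace("www.", "")

url = "https://www.amazon.com/dp/B01DFKC2SO"  # one of the entries in resources/urls.json
print(get_retailer_name_from_url(url))        # -> 'amazon.com'

# 'amazon.com' is exactly AmazonSpider.name, and BaseSpider.start_requests()
# uses a simple substring check ("if self.name in url") to route each URL in
# urls.json to the spider responsible for that retailer.
```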
https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/splash_based_project/splash_based_project/__init__.py -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for splash_based_project project 4 | 5 | BOT_NAME = 'splash_based_project' 6 | SPIDER_MODULES = ['splash_based_project.spiders'] 7 | NEWSPIDER_MODULE = 'splash_based_project.spiders' 8 | 9 | # Splash settings 10 | SPLASH_URL = '' # <-- Splash instance URL from Scrapy Cloud 11 | APIKEY = '' # <-- your API key 12 | SPIDER_MIDDLEWARES = { 13 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 14 | } 15 | DOWNLOADER_MIDDLEWARES = { 16 | 'scrapy_splash.SplashCookiesMiddleware': 723, 17 | 'scrapy_splash.SplashMiddleware': 725, 18 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 19 | } 20 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 21 | -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/spiders/quotes-js-1.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy_splash import SplashRequest 3 | 4 | 5 | class QuotesJs1Spider(scrapy.Spider): 6 | """Example spider using Splash to render JavaScript-based pages. 7 | Make sure you configure settings.py according to your Splash 8 | credentials (available on Scrapy Cloud). 9 | """ 10 | 11 | name = 'quotes-js-1' 12 | http_user = '' # <-- your API key goes here 13 | 14 | def start_requests(self): 15 | yield SplashRequest('http://quotes.toscrape.com/js') 16 | 17 | def parse(self, response): 18 | for quote in response.css('div.quote'): 19 | yield { 20 | 'text': quote.css('span.text::text').extract_first(), 21 | 'author': quote.css('span small::text').extract_first(), 22 | 'tags': quote.css('div.tags a.tag::text').extract(), 23 | } 24 | 25 | next_page = response.css('li.next > a::attr(href)').extract_first() 26 | if next_page: 27 | yield SplashRequest(response.urljoin(next_page)) 28 | -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/spiders/quotes-js-2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy_splash import SplashRequest 3 | from w3lib.http import basic_auth_header 4 | 5 | 6 | class QuotesJs2Spider(scrapy.Spider): 7 | """Example spider using Splash to render JavaScript-based pages. 8 | Make sure you configure settings.py with your Splash 9 | credentials (available on Scrapy Cloud). 
10 | """ 11 | name = 'quotes-js-2' 12 | 13 | def start_requests(self): 14 | yield SplashRequest( 15 | 'http://quotes.toscrape.com/js', 16 | splash_headers={ 17 | 'Authorization': basic_auth_header(self.settings['APIKEY'], ''), 18 | }, 19 | ) 20 | 21 | def parse(self, response): 22 | for quote in response.css('div.quote'): 23 | yield { 24 | 'text': quote.css('span.text::text').extract_first(), 25 | 'author': quote.css('span small::text').extract_first(), 26 | 'tags': quote.css('div.tags a.tag::text').extract(), 27 | } 28 | 29 | next_page = response.css('li.next > a::attr(href)').extract_first() 30 | if next_page: 31 | yield SplashRequest( 32 | response.urljoin(next_page), 33 | splash_headers={ 34 | 'Authorization': basic_auth_header(self.settings['APIKEY'], ''), 35 | }, 36 | ) 37 | -------------------------------------------------------------------------------- /splash_crawlera_example/README.md: -------------------------------------------------------------------------------- 1 | # Splash + Crawlera Example Project 2 | 3 | This example project shows how to use [Crawlera](http://scrapinghub.com/crawlera) 4 | (a smart downloader) and [Splash](https://scrapinghub.com/splash) (a JavaScript 5 | rendering service) with Scrapy spiders. 6 | 7 | 8 | ## How does it work? 9 | 10 | The integration between Splash and Crawlera is done by a 11 | [Lua script](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua) 12 | that is sent to Splash with every request created by the spider. This script configures 13 | Splash to use Crawlera as its proxy and also defines a couple rules to avoid doing 14 | useless requests, such as analytics ones, stylesheets, images, etc. 15 | 16 | 17 | ## What do I need to run this project? 
18 | 19 | Here's what you'll need: 20 | 21 | - a Splash instance and a Crawlera account: you can get both via Scrapy Cloud billing page 22 | - you can also run Splash in your own machine following the [instructions here](http://splash.readthedocs.io/en/stable/install.html) 23 | - set your Splash settings this project's [settings.py](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/settings.py) 24 | file: 25 | - `SPLASH_URL`: the URL where your Splash instance is available 26 | - `SPLASH_APIKEY`: your Splash API key (required if you're using an instance from Scrapy Cloud) 27 | - set your Crawlera settings in the same file: 28 | - `CRAWLERA_APIKEY`: the API key for your Crawlera user 29 | -------------------------------------------------------------------------------- /splash_crawlera_example/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy-splash 2 | -------------------------------------------------------------------------------- /splash_crawlera_example/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | requirements_file: requirements.txt 2 | stacks: 3 | default: scrapy:1.3-py3 4 | -------------------------------------------------------------------------------- /splash_crawlera_example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = splash_crawlera_example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = splash_crawlera_example 12 | -------------------------------------------------------------------------------- /splash_crawlera_example/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name = 'project', 6 | version = '1.0', 7 | packages = find_packages(), 8 | package_data = {'splash_crawlera_example': ['scripts/*.lua',]}, 9 | entry_points = {'scrapy': ['settings = splash_crawlera_example.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/splash_crawlera_example/splash_crawlera_example/__init__.py -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua: -------------------------------------------------------------------------------- 1 | function use_crawlera(splash) 2 | -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg. 3 | -- Have a look at the file spiders/quotes-js.py to see how to do it. 4 | -- Find your Crawlera credentials in https://app.scrapinghub.com/ 5 | local user = splash.args.crawlera_user 6 | 7 | local host = 'proxy.crawlera.com' 8 | local port = 8010 9 | local session_header = 'X-Crawlera-Session' 10 | local session_id = 'create' 11 | 12 | splash:on_request(function (request) 13 | -- The commented code below can be used to speed up the crawling 14 | -- process. 
They filter requests to undesired domains and useless 15 | -- resources. Uncomment the ones that make sense to your use case 16 | -- and add your own rules. 17 | 18 | -- Discard requests to advertising and tracking domains. 19 | -- if string.find(request.url, 'doubleclick%.net') or 20 | -- string.find(request.url, 'analytics%.google%.com') then 21 | -- request.abort() 22 | -- return 23 | -- end 24 | 25 | -- Avoid using Crawlera for subresources fetching to increase crawling 26 | -- speed. The example below avoids using Crawlera for URLS starting 27 | -- with 'static.' and the ones ending with '.png'. 28 | -- if string.find(request.url, '://static%.') ~= nil or 29 | -- string.find(request.url, '%.png$') ~= nil then 30 | -- return 31 | -- end 32 | 33 | request:set_header('X-Crawlera-Cookies', 'disable') 34 | request:set_header(session_header, session_id) 35 | request:set_proxy{host, port, username=user, password=''} 36 | end) 37 | 38 | splash:on_response_headers(function (response) 39 | if type(response.headers[session_header]) ~= nil then 40 | session_id = response.headers[session_header] 41 | end 42 | end) 43 | end 44 | 45 | function main(splash) 46 | use_crawlera(splash) 47 | splash:go(splash.args.url) 48 | return splash:html() 49 | end 50 | -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'splash_crawlera_example' 4 | SPIDER_MODULES = ['splash_crawlera_example.spiders'] 5 | NEWSPIDER_MODULE = 'splash_crawlera_example.spiders' 6 | 7 | SPIDER_MIDDLEWARES = { 8 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 9 | } 10 | 11 | DOWNLOADER_MIDDLEWARES = { 12 | 'scrapy_splash.SplashCookiesMiddleware': 723, 13 | 'scrapy_splash.SplashMiddleware': 725, 14 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 15 | } 16 | 17 | CRAWLERA_APIKEY = '' # Your crawlera API key 18 | 19 | # Splash settings 20 | SPLASH_URL = '' # Splash instance URL from Scrapy Cloud 21 | SPLASH_APIKEY = '' # Your API key for the Splash instance hosted on Scrapy Cloud 22 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 23 | -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
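Before wiring `scripts/crawlera.lua` into the spider that follows, it can help to sanity-check the script against Splash directly. The sketch below is not part of the project: it assumes the `requests` library, a Splash instance reachable at `http://localhost:8050` (a Scrapy Cloud Splash instance would additionally need HTTP basic auth with `SPLASH_APIKEY`), and a placeholder Crawlera API key. Splash runs the script's `main()` function via its `/execute` endpoint, and every extra JSON field is exposed to the script as `splash.args.*`:

```python
import requests

# Hypothetical standalone check of scripts/crawlera.lua -- adjust the URL, path and key.
SPLASH_URL = 'http://localhost:8050'  # or your Splash instance from Scrapy Cloud
with open('splash_crawlera_example/scripts/crawlera.lua') as f:
    lua_source = f.read()

resp = requests.post(
    SPLASH_URL + '/execute',
    json={
        'lua_source': lua_source,                 # the script defining main(splash)
        'url': 'http://quotes.toscrape.com/js/',  # available to the script as splash.args.url
        'crawlera_user': 'YOUR_CRAWLERA_APIKEY',  # read by the script as splash.args.crawlera_user
    },
    timeout=90,
)
print(resp.status_code)
print(resp.text[:300])  # main() returns splash:html(), i.e. the rendered page source
```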
5 | -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/spiders/quotes-js.py: -------------------------------------------------------------------------------- 1 | from pkgutil import get_data 2 | import scrapy 3 | from scrapy_splash import SplashRequest 4 | from w3lib.http import basic_auth_header 5 | 6 | 7 | class QuotesJsSpider(scrapy.Spider): 8 | name = 'quotes-js' 9 | 10 | def __init__(self, *args, **kwargs): 11 | # to be able to load the Lua script on Scrapy Cloud, make sure your 12 | # project's setup.py file contains the "package_data" setting, similar 13 | # to this project's setup.py 14 | self.LUA_SOURCE = get_data( 15 | 'splash_crawlera_example', 'scripts/crawlera.lua' 16 | ).decode('utf-8') 17 | super(QuotesJsSpider, self).__init__(*args, **kwargs) 18 | 19 | def start_requests(self): 20 | yield SplashRequest( 21 | url='http://quotes.toscrape.com/js/', 22 | endpoint='execute', 23 | splash_headers={ 24 | 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), 25 | }, 26 | args={ 27 | 'lua_source': self.LUA_SOURCE, 28 | 'crawlera_user': self.settings['CRAWLERA_APIKEY'], 29 | }, 30 | # tell Splash to cache the lua script, to avoid sending it for every request 31 | cache_args=['lua_source'], 32 | ) 33 | 34 | def parse(self, response): 35 | for quote in response.css('div.quote'): 36 | yield { 37 | 'text': quote.css('span.text::text').extract_first(), 38 | 'author': quote.css('span small::text').extract_first(), 39 | 'tags': quote.css('div.tags a.tag::text').extract(), 40 | } 41 | next_page = response.css('li.next > a::attr(href)').extract_first() 42 | if next_page: 43 | yield SplashRequest( 44 | url=response.urljoin(next_page), 45 | endpoint='execute', 46 | splash_headers={ 47 | 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), 48 | }, 49 | args={ 50 | 'lua_source': self.LUA_SOURCE, 51 | 'crawlera_user': self.settings['CRAWLERA_APIKEY'], 52 | }, 53 | cache_args=['lua_source'], 54 | ) 55 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/README.md: -------------------------------------------------------------------------------- 1 | # Zyte Smart Proxy Headless Proxy + Splash + Scrapy Example Project 2 | 3 | This example project shows how to use [Smart Proxy Manager (Formally Crawlera)](https://www.zyte.com/smart-proxy-manager/) 4 | with [Zyte Smart Proxy Headless Proxy](https://github.com/zytedata/zyte-smartproxy-headless-proxy) 5 | and [Splash](https://www.zyte.com/splash/) (a JavaScript 6 | rendering service) with Scrapy spiders. 7 | 8 | 9 | ## How does it work? 10 | 11 | The integration between Splash and Zyte Smart Proxy Headless Proxy is done by a 12 | [Lua script](https://github.com/scrapinghub/sample-projects/blob/master/splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/scripts/smart_proxy_manager.lua) 13 | that is sent to Splash with every request created by the spider. This script configures 14 | Splash to use Zyte Smart Proxy Headless Proxy as its proxy and also defines a couple rules to avoid 15 | doing useless requests, such as analytics ones, stylesheets, images, etc. 16 | 17 | 18 | ## What do I need to run this project? 
19 | 20 | Here's what you'll need: 21 | 22 | - a Splash instance and a Smart Proxy Manager account: you can get both via Scrapy Cloud billing page 23 | - you can also run Splash in your own machine following the [instructions here](http://splash.readthedocs.io/en/stable/install.html) 24 | - Setup and run Zyte Smart Proxy Headless Proxy using this [documentation](https://docs.zyte.com/smart-proxy-manager/headless.html) 25 | - set your Splash settings this project's [settings.py](https://github.com/scrapinghub/sample-projects/blob/master/splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/settings.py) 26 | file: 27 | - `SPLASH_URL`: the URL where your Splash instance is available, by default this is set to `http://localhost:8050`. 28 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | scrapy-splash 3 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | requirements_file: requirements.txt 2 | stacks: 3 | default: scrapy:1.3-py3 4 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = splash_scrapy_spm_headless_proxy_example.settings 8 | 9 | [deploy] 10 | project = splash_scrapy_spm_headless_proxy_example 11 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name = 'project', 6 | version = '1.0', 7 | packages = find_packages(), 8 | package_data = {'splash_scrapy_spm_headless_proxy_example': ['scripts/*.lua',]}, 9 | entry_points = {'scrapy': ['settings = splash_scrapy_spm_headless_proxy_example.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/__init__.py -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/scripts/smart_proxy_manager.lua: -------------------------------------------------------------------------------- 1 | function use_crawlera(splash) 2 | local session_header = 'X-Crawlera-Session' 3 | local session_id = 'create' 4 | 5 | splash:on_request(function (request) 6 | -- The commented code below can be used to speed up the crawling 7 | -- process. They filter requests to undesired domains and useless 8 | -- resources. 
Uncomment the ones that make sense to your use case 9 | -- and add your own rules. 10 | 11 | -- Discard requests to advertising and tracking domains. 12 | -- if string.find(request.url, 'doubleclick%.net') or 13 | -- string.find(request.url, 'analytics%.google%.com') then 14 | -- request.abort() 15 | -- return 16 | -- end 17 | 18 | -- Avoid using Smart Proxy Manager for subresources fetching to increase crawling 19 | -- speed. The example below avoids using Smart Proxy Manager for URLS starting 20 | -- with 'static.' and the ones ending with '.png'. 21 | -- if string.find(request.url, '://static%.') ~= nil or 22 | -- string.find(request.url, '%.png$') ~= nil then 23 | -- return 24 | -- end 25 | 26 | -- Here, Splash will communicate with Zyte SmartProxy (formerly Crawlera) Headless Proxy. 27 | -- Zyte SmartProxy (formerly Crawlera) Headless Proxy should be up and running. 28 | request:set_proxy{"host.docker.internal", 3128} 29 | request:set_header('X-Crawlera-Profile', 'desktop') 30 | request:set_header('X-Crawlera-Cookies', 'disable') 31 | request:set_header(session_header, session_id) 32 | end) 33 | 34 | splash:on_response_headers(function (response) 35 | if type(response.headers[session_header]) ~= nil then 36 | session_id = response.headers[session_header] 37 | end 38 | end) 39 | end 40 | 41 | function main(splash) 42 | use_crawlera(splash) 43 | splash:go(splash.args.url) 44 | splash:wait(1) 45 | return splash:html() 46 | end 47 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'splash_scrapy_spm_headless_proxy_example' 4 | SPIDER_MODULES = ['splash_scrapy_spm_headless_proxy_example.spiders'] 5 | NEWSPIDER_MODULE = 'splash_scrapy_spm_headless_proxy_example.spiders' 6 | 7 | SPIDER_MIDDLEWARES = { 8 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 9 | } 10 | 11 | DOWNLOADER_MIDDLEWARES = { 12 | 'scrapy_splash.SplashCookiesMiddleware': 723, 13 | 'scrapy_splash.SplashMiddleware': 725, 14 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 15 | } 16 | 17 | # Splash settings 18 | SPLASH_URL = 'http://localhost:8050' # Splash instance URL from Scrapy Cloud 19 | SPLASH_APIKEY = '' # Your API key for the Splash instance hosted on Scrapy Cloud 20 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 21 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
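A quick smoke test (not part of the project) to confirm the Zyte Smart Proxy Headless Proxy is up before pointing Splash at it: send a plain request through it. This assumes the `requests` library and the proxy's default port 3128; the Lua script above addresses it as `host.docker.internal:3128` only because Splash runs inside Docker and has to reach the proxy on the host machine:

```python
import requests

# Hypothetical smoke test for the locally running Zyte Smart Proxy Headless Proxy.
proxies = {'http': 'http://localhost:3128'}   # 3128 is assumed to be the headless proxy's port
resp = requests.get('http://quotes.toscrape.com/', proxies=proxies, timeout=60)
print(resp.status_code)  # 200 means the proxy accepted and forwarded the request
```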
5 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/spiders/quotes-js.py: -------------------------------------------------------------------------------- 1 | from pkgutil import get_data 2 | import scrapy 3 | from scrapy_splash import SplashRequest 4 | 5 | 6 | class QuotesJsSpider(scrapy.Spider): 7 | name = 'quotes-js' 8 | custom_settings = { 9 | 'RETRY_TIMES': 10, 10 | } 11 | 12 | def __init__(self, *args, **kwargs): 13 | # to be able to load the Lua script on Scrapy Cloud, make sure your 14 | # project's setup.py file contains the "package_data" setting, similar 15 | # to this project's setup.py 16 | self.LUA_SOURCE = get_data( 17 | 'splash_scrapy_spm_headless_proxy_example', 'scripts/smart_proxy_manager.lua' 18 | ).decode('utf-8') 19 | super(QuotesJsSpider, self).__init__(*args, **kwargs) 20 | 21 | def start_requests(self): 22 | yield SplashRequest( 23 | url='http://quotes.toscrape.com/js/', 24 | endpoint='execute', 25 | args={ 26 | 'lua_source': self.LUA_SOURCE, 27 | 'timeout': 60, 28 | }, 29 | # tell Splash to cache the lua script, to avoid sending it for every request 30 | cache_args=['lua_source'], 31 | meta={ 32 | 'max_retry_times': 10, 33 | } 34 | ) 35 | 36 | def parse(self, response): 37 | for quote in response.css('div.quote'): 38 | yield { 39 | 'text': quote.css('span.text::text').extract_first(), 40 | 'author': quote.css('span small::text').extract_first(), 41 | 'tags': quote.css('div.tags a.tag::text').extract(), 42 | } 43 | next_page = response.css('li.next > a::attr(href)').extract_first() 44 | if next_page: 45 | yield SplashRequest( 46 | url=response.urljoin(next_page), 47 | endpoint='execute', 48 | args={'lua_source': self.LUA_SOURCE}, 49 | cache_args=['lua_source'], 50 | ) 51 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/README.md: -------------------------------------------------------------------------------- 1 | # Splash + Smart Proxy Manager Example Project 2 | 3 | This example project shows how to use [Smart Proxy Manager (Formally Crawlera)](https://www.zyte.com/smart-proxy-manager/) and [Splash](https://www.zyte.com/splash/) (a JavaScript 4 | rendering service) with Scrapy spiders. 5 | 6 | 7 | ## How does it work? 8 | 9 | The integration between Splash and Smart Proxy Manager is done by a 10 | [Lua script](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua) 11 | that is sent to Splash with every request created by the spider. This script configures 12 | Splash to use Smart Proxy Manager as its proxy and also defines a couple rules to avoid doing 13 | useless requests, such as analytics ones, stylesheets, images, etc. 14 | 15 | 16 | ## What do I need to run this project? 
17 | 18 | Here's what you'll need: 19 | 20 | - a Splash instance and a Smart Proxy Manager account: you can get both via Scrapy Cloud billing page 21 | - you can also run Splash in your own machine following the [instructions here](http://splash.readthedocs.io/en/stable/install.html) 22 | - set your Splash settings this project's [settings.py](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/settings.py) 23 | file: 24 | - `SPLASH_URL`: the URL where your Splash instance is available 25 | - `SPLASH_APIKEY`: your Splash API key (required if you're using an instance from Scrapy Cloud) 26 | - set your Crawlera settings in the same file: 27 | - `ZYTE_SMARTPROXY_APIKEY`: the API key for your Smart Proxy Manager user 28 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | scrapy-splash 3 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | requirements_file: requirements.txt 2 | stacks: 3 | default: scrapy:1.3-py3 4 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = splash_smart_proxy_manager_example.settings 8 | 9 | [deploy] 10 | project = splash_smart_proxy_manager_example 11 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name = 'project', 6 | version = '1.0', 7 | packages = find_packages(), 8 | package_data = {'splash_smart_proxy_manager_example': ['scripts/*.lua',]}, 9 | entry_points = {'scrapy': ['settings = splash_smart_proxy_manager_example.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/__init__.py -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/scripts/smart_proxy_manager.lua: -------------------------------------------------------------------------------- 1 | function use_crawlera(splash) 2 | -- Make sure you pass your Smart Proxy Manager API key in the 'crawlera_user' arg. 3 | -- Have a look at the file spiders/quotes-js.py to see how to do it. 
4 | -- Find your Smart Proxy Manager credentials in https://app.zyte.com/ 5 | local user = splash.args.crawlera_user 6 | local password = '' 7 | 8 | local host = 'proxy.zyte.com' 9 | local port = 8011 10 | local session_header = 'X-Crawlera-Session' 11 | local session_id = 'create' 12 | 13 | splash:on_request(function (request) 14 | -- The commented code below can be used to speed up the crawling 15 | -- process. They filter requests to undesired domains and useless 16 | -- resources. Uncomment the ones that make sense to your use case 17 | -- and add your own rules. 18 | 19 | -- Discard requests to advertising and tracking domains. 20 | -- if string.find(request.url, 'doubleclick%.net') or 21 | -- string.find(request.url, 'analytics%.google%.com') then 22 | -- request.abort() 23 | -- return 24 | -- end 25 | 26 | -- Avoid using Smart Proxy Manager for subresources fetching to increase crawling 27 | -- speed. The example below avoids using Smart Proxy Manager for URLS starting 28 | -- with 'static.' and the ones ending with '.png'. 29 | -- if string.find(request.url, '://static%.') ~= nil or 30 | -- string.find(request.url, '%.png$') ~= nil then 31 | -- return 32 | -- end 33 | request:set_proxy(host, port, user, password) 34 | request:set_header('X-Crawlera-Profile', 'desktop') 35 | request:set_header('X-Crawlera-Cookies', 'disable') 36 | request:set_header(session_header, session_id) 37 | end) 38 | 39 | splash:on_response_headers(function (response) 40 | if type(response.headers[session_header]) ~= nil then 41 | session_id = response.headers[session_header] 42 | end 43 | end) 44 | end 45 | 46 | function main(splash) 47 | use_crawlera(splash) 48 | splash:go(splash.args.url) 49 | splash:wait(1) 50 | return splash:html() 51 | end 52 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'splash_smart_proxy_manager_example' 4 | SPIDER_MODULES = ['splash_smart_proxy_manager_example.spiders'] 5 | NEWSPIDER_MODULE = 'splash_smart_proxy_manager_example.spiders' 6 | 7 | SPIDER_MIDDLEWARES = { 8 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 9 | } 10 | 11 | DOWNLOADER_MIDDLEWARES = { 12 | 'scrapy_splash.SplashCookiesMiddleware': 723, 13 | 'scrapy_splash.SplashMiddleware': 725, 14 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 15 | } 16 | 17 | 18 | ZYTE_SMARTPROXY_APIKEY = 'API-KEY' 19 | 20 | # Splash settings 21 | SPLASH_URL = '' # Splash instance URL from Scrapy Cloud 22 | SPLASH_APIKEY = '' # Your API key for the Splash instance hosted on Scrapy Cloud 23 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 24 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
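Before plugging `ZYTE_SMARTPROXY_APIKEY` into Splash, the key can be verified with a plain proxied request. This sketch is not part of the project; it assumes the `requests` library and reuses the endpoint (`proxy.zyte.com:8011`) and empty password that `scripts/smart_proxy_manager.lua` above hard-codes:

```python
import requests

# Hypothetical credential check for Smart Proxy Manager (formerly Crawlera).
apikey = 'YOUR_ZYTE_SMARTPROXY_APIKEY'  # placeholder -- use your real key
proxies = {'http': 'http://{}:@proxy.zyte.com:8011'.format(apikey)}

resp = requests.get('http://quotes.toscrape.com/', proxies=proxies, timeout=60)
print(resp.status_code)  # 200 means the key was accepted and the request was proxied
```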
5 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/spiders/quotes-js.py: -------------------------------------------------------------------------------- 1 | from pkgutil import get_data 2 | import scrapy 3 | from scrapy_splash import SplashRequest 4 | from w3lib.http import basic_auth_header 5 | 6 | 7 | class QuotesJsSpider(scrapy.Spider): 8 | name = 'quotes-js' 9 | custom_settings = { 10 | 'RETRY_TIMES': 10, 11 | } 12 | 13 | 14 | def __init__(self, *args, **kwargs): 15 | # to be able to load the Lua script on Scrapy Cloud, make sure your 16 | # project's setup.py file contains the "package_data" setting, similar 17 | # to this project's setup.py 18 | self.LUA_SOURCE = get_data( 19 | 'splash_smart_proxy_manager_example', 'scripts/smart_proxy_manager.lua' 20 | ).decode('utf-8') 21 | super(QuotesJsSpider, self).__init__(*args, **kwargs) 22 | 23 | def start_requests(self): 24 | yield SplashRequest( 25 | url='http://quotes.toscrape.com/js/', 26 | endpoint='execute', 27 | splash_headers={ 28 | 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), 29 | }, 30 | args={ 31 | 'lua_source': self.LUA_SOURCE, 32 | 'crawlera_user': self.settings['ZYTE_SMARTPROXY_APIKEY'], 33 | 'timeout': 60, 34 | }, 35 | # tell Splash to cache the lua script, to avoid sending it for every request 36 | cache_args=['lua_source'], 37 | meta={ 38 | 'max_retry_times': 10, 39 | } 40 | ) 41 | 42 | def parse(self, response): 43 | for quote in response.css('div.quote'): 44 | yield { 45 | 'text': quote.css('span.text::text').extract_first(), 46 | 'author': quote.css('span small::text').extract_first(), 47 | 'tags': quote.css('div.tags a.tag::text').extract(), 48 | } 49 | next_page = response.css('li.next > a::attr(href)').extract_first() 50 | if next_page: 51 | yield SplashRequest( 52 | url=response.urljoin(next_page), 53 | endpoint='execute', 54 | splash_headers={ 55 | 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), 56 | }, 57 | args={ 58 | 'lua_source': self.LUA_SOURCE, 59 | 'crawlera_user': self.settings['ZYTE_SMARTPROXY_APIKEY'], 60 | }, 61 | cache_args=['lua_source'], 62 | ) 63 | --------------------------------------------------------------------------------