├── .gitignore ├── README.md ├── quotes_crawler ├── README.md ├── quotes_crawler │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── toscrape-csrf-login-v1.py │ │ ├── toscrape-csrf-login-v2.py │ │ ├── toscrape-css.py │ │ ├── toscrape-infinite-scrolling.py │ │ ├── toscrape-js.py │ │ ├── toscrape-microdata.py │ │ ├── toscrape-selenium.py │ │ └── toscrape-xpath.py ├── requirements.txt └── scrapy.cfg ├── sc_custom_image ├── README.md ├── requirements.txt ├── sc_custom_image │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── demo.py ├── scrapinghub.yml └── scrapy.cfg ├── sc_scripts_demo ├── bin │ └── check_jobs.py ├── requirements.txt ├── sc_scripts_demo │ ├── __init__.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── bad_spider.py │ │ └── good_spider.py ├── scrapinghub.yml ├── scrapy.cfg └── setup.py ├── scrapy_price_monitor ├── .gitignore ├── README.md ├── bin │ └── monitor.py ├── price_monitor │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── resources │ │ └── urls.json │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── amazon.py │ │ ├── base_spider.py │ │ ├── bestbuy.py │ │ └── ebay.py │ ├── templates │ │ └── email.html │ └── utils.py ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg └── setup.py ├── splash_based_project ├── scrapy.cfg └── splash_based_project │ ├── __init__.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── quotes-js-1.py │ └── quotes-js-2.py ├── splash_crawlera_example ├── README.md ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg ├── setup.py └── splash_crawlera_example │ ├── __init__.py │ ├── scripts │ └── crawlera.lua │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── quotes-js.py ├── splash_scrapy_spm_headless_proxy_example ├── README.md ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg ├── setup.py └── splash_scrapy_spm_headless_proxy_example │ ├── __init__.py │ ├── scripts │ └── smart_proxy_manager.lua │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── quotes-js.py └── splash_smart_proxy_manager_example ├── README.md ├── requirements.txt ├── scrapinghub.yml ├── scrapy.cfg ├── setup.py └── splash_smart_proxy_manager_example ├── __init__.py ├── scripts └── smart_proxy_manager.lua ├── settings.py └── spiders ├── __init__.py └── quotes-js.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | build 3 | *.egg 4 | *.egg-info 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Zyte (Formerly Scrapinghub) Sample Projects 2 | 3 | This repo contains a few sample projects demonstrating capabilities 4 | of [Zyte (Formerly Scrapinghub)](https://zyte.com) technologies. 5 | 6 | There is not much to see here yet, but stay tuned, we're just getting started! 
7 | -------------------------------------------------------------------------------- /quotes_crawler/README.md: -------------------------------------------------------------------------------- 1 | # Spiders for Quotes.Toscrape.com 2 | 3 | This project contains spiders to scrape many variations of [quotes.toscrape.com](https://quotes.toscrape.com), such as: 4 | 5 | * `toscrape-css`: scrapes [quotes.toscrape.com](https://quotes.toscrape.com) using CSS selectors; 6 | * `toscrape-xpath`: scrapes [quotes.toscrape.com](https://quotes.toscrape.com) using XPath; 7 | * `toscrape-microdata`: reads the semantic markup data from [quotes.toscrape.com](https://quotes.toscrape.com) using [extruct](https://github.com/scrapinghub/extruct); 8 | * `toscrape-js`: scrapes the JavaScript-powered version of `Quotes to Scrape` ([quotes.toscrape.com/js](https://quotes.toscrape.com/js)) using [js2xml](https://github.com/scrapinghub/js2xml) to parse the data from inside the JavaScript code; 9 | * `toscrape-selenium`: scrapes the JavaScript-powered version of `Quotes to Scrape` ([quotes.toscrape.com/js](https://quotes.toscrape.com/js)) using Selenium + PhantomJS to render the page; 10 | * `toscrape-infinite-scrolling`: scrapes the infinite scrolling version ([quotes.toscrape.com/scroll](https://quotes.toscrape.com/scroll)) via AJAX API calls; 11 | * `toscrape-csrf-login-v1`: authenticates into [quotes.toscrape.com/login](https://quotes.toscrape.com/login) by manually loading the CSRF token into the request; 12 | * `toscrape-csrf-login-v2`: authenticates into [quotes.toscrape.com/login](https://quotes.toscrape.com/login) using `FormRequest.from_response()` to automatically load the CSRF token; 13 | 14 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/quotes_crawler/quotes_crawler/__init__.py -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QuotesCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class QuotesCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for quotes_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'quotes_crawler' 13 | 14 | SPIDER_MODULES = ['quotes_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'quotes_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'quotes_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'quotes_crawler.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'quotes_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'quotes_crawler.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v1.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ToScrapeCSRFLoginSpiderV1(scrapy.Spider): 5 | name = 'toscrape-csrf-login-v1' 6 | start_urls = [ 7 | 'http://quotes.toscrape.com/login' 8 | ] 9 | 10 | def parse(self, response): 11 | # Forms with CSRF verification generates a CSRF token for each request 12 | # and they require that same value in the data the client sends back. 13 | # WARNING: 14 | # This could be done automatically using FormRequest.from_response() 15 | # check toscrape-csrf-login-v2.py for reference 16 | token = response.css("input[name=csrf_token] ::attr(value)").extract_first() 17 | yield scrapy.FormRequest( 18 | self.start_urls[0], 19 | formdata={ 20 | 'csrf_token': token, 21 | 'username': 'valdir', 22 | 'password': 'abc' 23 | }, 24 | callback=self.after_login 25 | ) 26 | 27 | def after_login(self, response): 28 | authenticated = response.css('div.header-box p > a::text').extract_first() == 'Logout' 29 | for quote in response.css('div.quote'): 30 | yield { 31 | 'text': quote.css('span::text').extract_first(), 32 | 'author': quote.css('small::text').extract_first(), 33 | 'tags': quote.css('.tags a::text').extract(), 34 | 'authenticated': authenticated, 35 | } 36 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-csrf-login-v2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class ToScrapeCSRFLoginSpiderV2(scrapy.Spider): 5 | name = 'toscrape-csrf-login-v2' 6 | start_urls = [ 7 | 'http://quotes.toscrape.com/login' 8 | ] 9 | 10 | def parse(self, response): 11 | # FormRequest.from_response automatically loads all the form data that 12 | # is in the form present in the response object. This way, we don't 13 | # have to worry about explicitly loading the CSRF token in the data we 14 | # will POST to the server. 
15 | yield scrapy.FormRequest.from_response( 16 | response, 17 | formdata={ 18 | 'username': 'any', 19 | 'password': 'doesnt matter' 20 | }, 21 | callback=self.after_login, 22 | ) 23 | 24 | def after_login(self, response): 25 | authenticated = response.css('div.header-box p > a::text').extract_first() == 'Logout' 26 | for quote in response.css('div.quote'): 27 | yield { 28 | 'text': quote.css('span::text').extract_first(), 29 | 'author': quote.css('small::text').extract_first(), 30 | 'tags': quote.css('.tags a::text').extract(), 31 | 'authenticated': authenticated, 32 | } 33 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-css.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class ToScrapeCSSSpider(scrapy.Spider): 6 | name = "toscrape-css" 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/', 9 | ] 10 | 11 | def parse(self, response): 12 | for quote in response.css("div.quote"): 13 | yield { 14 | 'text': quote.css("span.text::text").extract_first(), 15 | 'author': quote.css("small.author::text").extract_first(), 16 | 'tags': quote.css("div.tags > a.tag::text").extract() 17 | } 18 | 19 | next_page_url = response.css("li.next > a::attr(href)").extract_first() 20 | if next_page_url is not None: 21 | yield scrapy.Request(response.urljoin(next_page_url)) 22 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-infinite-scrolling.py: -------------------------------------------------------------------------------- 1 | import json 2 | import scrapy 3 | 4 | 5 | # Most AJAX based websites can be scraped by reproducing the API calls made 6 | # by the browser, as we do in this simple example that scrapes 7 | # a website paginated via infinite scrolling (quotes.toscrape.com/scroll) 8 | class ToScrapeInfiniteScrollingSpider(scrapy.Spider): 9 | name = 'toscrape-infinite-scrolling' 10 | base_url = 'http://quotes.toscrape.com/api/quotes?page=%d' 11 | start_urls = [base_url % 1] 12 | 13 | def parse(self, response): 14 | json_data = json.loads(response.text) 15 | for quote in json_data['quotes']: 16 | yield quote 17 | if json_data['has_next']: 18 | yield scrapy.Request(self.base_url % (int(json_data['page']) + 1)) 19 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-js.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import js2xml 3 | 4 | 5 | class ToScrapeJSSpider(scrapy.Spider): 6 | name = 'toscrape-js' 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/js/' 9 | ] 10 | 11 | def parse(self, response): 12 | script = response.xpath('//script[contains(., "var data =")]/text()').extract_first() 13 | sel = scrapy.Selector(_root=js2xml.parse(script)) 14 | for quote in sel.xpath('//var[@name="data"]/array/object'): 15 | yield { 16 | 'text': quote.xpath('string(./property[@name="text"])').extract_first(), 17 | 'author': quote.xpath( 18 | 'string(./property[@name="author"]//property[@name="name"])' 19 | ).extract_first(), 20 | 'tags': quote.xpath('./property[@name="tags"]//string/text()').extract(), 21 | } 22 | 23 | link_next = response.css('li.next a::attr("href")').extract_first() 24 | if link_next: 25 | yield scrapy.Request(response.urljoin(link_next)) 26 | 
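As a side note on the technique used by `toscrape-js` above: [js2xml](https://github.com/scrapinghub/js2xml) turns JavaScript source into an lxml document, so the embedded `var data = [...]` assignment can be queried with XPath. A minimal standalone sketch (assuming only that `js2xml` is installed):

    import js2xml

    js_code = 'var data = [{"text": "An example quote.", "author": {"name": "Jane Doe"}}];'
    parsed = js2xml.parse(js_code)      # lxml tree representing the JavaScript code
    print(js2xml.pretty_print(parsed))  # inspect the XML that the XPath queries run against
    # the same kind of query the spider uses:
    print(parsed.xpath('//var[@name="data"]/array/object/property[@name="text"]/string/text()'))
    # ['An example quote.']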
-------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-microdata.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from extruct.w3cmicrodata import LxmlMicrodataExtractor 3 | 4 | 5 | class ToScrapeMicrodataSpider(scrapy.Spider): 6 | name = "toscrape-microdata" 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/' 9 | ] 10 | 11 | def parse(self, response): 12 | extractor = LxmlMicrodataExtractor() 13 | items = extractor.extract(response.text, response.url)['items'] 14 | for it in items: 15 | yield it['properties'] 16 | 17 | next_page_url = response.css("li.next > a::attr(href)").extract_first() 18 | if next_page_url is not None: 19 | yield scrapy.Request(response.urljoin(next_page_url)) 20 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-selenium.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import scrapy 3 | from selenium import webdriver 4 | 5 | 6 | # this spider needs PhantomJS (http://phantomjs.org/) installed somewhere in your PATH 7 | class ToScrapeSeleniumSpider(scrapy.Spider): 8 | name = 'toscrape-selenium' 9 | start_urls = [ 10 | 'http://quotes.toscrape.com/js' 11 | ] 12 | 13 | def __init__(self, *args, **kwargs): 14 | self.driver = webdriver.PhantomJS() 15 | super(ToScrapeSeleniumSpider, self).__init__(*args, **kwargs) 16 | 17 | def parse(self, response): 18 | self.driver.get(response.url) 19 | for quote in self.driver.find_elements_by_css_selector('div.quote'): 20 | yield { 21 | 'quote': quote.find_element_by_css_selector("span.text").text, 22 | 'author': quote.find_element_by_css_selector("small.author").text, 23 | 'tags': [e.text for e in quote.find_elements_by_class_name('tag')], 24 | } 25 | # pagination links are not generated by JS code in this page 26 | next_page_url = response.css("li.next > a::attr(href)").extract_first() 27 | if next_page_url is not None: 28 | yield scrapy.Request(response.urljoin(next_page_url)) 29 | -------------------------------------------------------------------------------- /quotes_crawler/quotes_crawler/spiders/toscrape-xpath.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class ToScrapeSpiderXPath(scrapy.Spider): 6 | name = 'toscrape-xpath' 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/', 9 | ] 10 | 11 | def parse(self, response): 12 | for quote in response.xpath('//div[@class="quote"]'): 13 | yield { 14 | 'text': quote.xpath('./span[@class="text"]/text()').extract_first(), 15 | 'author': quote.xpath('.//small[@class="author"]/text()').extract_first(), 16 | 'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract() 17 | } 18 | 19 | next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first() 20 | if next_page_url is not None: 21 | yield scrapy.Request(response.urljoin(next_page_url)) 22 | -------------------------------------------------------------------------------- /quotes_crawler/requirements.txt: -------------------------------------------------------------------------------- 1 | extruct 2 | js2xml 3 | selenium 4 | -------------------------------------------------------------------------------- /quotes_crawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created 
by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = quotes_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = quotes_crawler 12 | -------------------------------------------------------------------------------- /sc_custom_image/README.md: -------------------------------------------------------------------------------- 1 | ## Scrapy Cloud Custom Image 2 | 3 | Sample Scrapy project demonstrating using PhantomJS and 4 | deploying it to Scrapy Cloud using a custom Docker image. 5 | -------------------------------------------------------------------------------- /sc_custom_image/requirements.txt: -------------------------------------------------------------------------------- 1 | selenium 2 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/sc_custom_image/sc_custom_image/__init__.py -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ScCustomImageItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ScCustomImagePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for sc_custom_image project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'sc_custom_image' 13 | 14 | SPIDER_MODULES = ['sc_custom_image.spiders'] 15 | NEWSPIDER_MODULE = 'sc_custom_image.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'sc_custom_image (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'sc_custom_image.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'sc_custom_image.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'sc_custom_image.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sc_custom_image/sc_custom_image/spiders/demo.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from selenium import webdriver 3 | 4 | 5 | class DemoSpider(scrapy.Spider): 6 | name = 'demo' 7 | start_urls = ['http://quotes.toscrape.com/js'] 8 | 9 | def __init__(self, *args, **kwargs): 10 | # XXX: needs phantomjs binary available in PATH 11 | self.driver = webdriver.PhantomJS() 12 | super(DemoSpider, self).__init__(*args, **kwargs) 13 | 14 | def parse(self, response): 15 | self.driver.get(response.url) 16 | for quote in self.driver.find_elements_by_css_selector('div.quote'): 17 | yield { 18 | 'quote': quote.find_element_by_css_selector('span').text, 19 | 'author': quote.find_element_by_css_selector('small').text, 20 | } 21 | next_page_url = response.css('nav li.next a ::attr(href)').extract_first() 22 | if next_page_url: 23 | yield scrapy.Request(response.urljoin(next_page_url)) 24 | -------------------------------------------------------------------------------- /sc_custom_image/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: PUT_YOUR_PROJECT_ID_HERE 2 | requirements_file: ./requirements.txt 3 | image: true 4 | -------------------------------------------------------------------------------- /sc_custom_image/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sc_custom_image.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sc_custom_image 12 | -------------------------------------------------------------------------------- /sc_scripts_demo/bin/check_jobs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Simple monitor jobs checker for the last 24 hours 4 | """ 5 | 6 | from __future__ import print_function 7 | 8 | import argparse 9 | import os 10 | 11 | import boto 12 | from datetime import datetime 13 | from datetime import timedelta 14 | from scrapinghub import Project, Connection 15 | 16 | # Configure your SES credentials here 17 | AWS_ACCESS_KEY = '' 18 | AWS_SECRET_KEY = '' 19 | 20 | # Configure the Mail-from here 21 | DEFAULT_MAIL_FROM = 'Custom Notification ' 22 | 23 | 24 | def send_email(recipients, subject, body, mail_from=DEFAULT_MAIL_FROM): 25 | """Send an email using AWS Simple Email Service 26 | """ 27 | ses = boto.connect_ses(AWS_ACCESS_KEY, AWS_SECRET_KEY) 28 | ses.send_email(mail_from, subject, body, recipients) 29 | print('Email sent to %s' % ', '.join(recipients)) 30 | 31 | 32 | def parse_date(date_str): 33 | return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') 34 | 35 | 36 | def has_job_error(job): 37 | success_reason = 'no_reason' 38 | return (job.info.get('errors_count', 0) > 0 39 | or job.info.get('close_reason') != success_reason) 40 | 41 | 42 | def is_job_newer_than(job, since_time): 43 | cancelled_before_starting = ('updated_time' not in 
job.info 44 | and job.info.get('close_reason') == 'cancelled') 45 | if cancelled_before_starting: 46 | return False 47 | return since_time <= parse_date(job.info['updated_time']) 48 | 49 | 50 | def get_last_24h_jobs(apikey, project_id): 51 | """Fetch jobs that finished in the last 24 hours 52 | """ 53 | project = Project(Connection(apikey), project_id) 54 | since_time = datetime.utcnow() - timedelta(hours=24) 55 | jobs = [ 56 | job for job in project.jobs(state='finished') 57 | if is_job_newer_than(job, since_time) 58 | ] 59 | return jobs 60 | 61 | 62 | def render_report(jobs_with_error): 63 | """Build a text report for the jobs with errors 64 | """ 65 | output = [] 66 | for job in jobs_with_error: 67 | errors_count = job.info.get('errors_count', 0) 68 | close_reason = job.info.get('close_reason') 69 | 70 | job_id = job.info["id"].split('/') 71 | url = 'https://app.scrapinghub.com/p/{0}/job/{1}/{2}'.format( 72 | job_id[0], job_id[1], job_id[2]) 73 | 74 | error_message = ['Errors found for job "{0}" ({1}):'.format( 75 | job.info['spider'], url)] 76 | if errors_count > 0: 77 | error_message.append(' There were {} error{}.'.format( 78 | errors_count, '' if errors_count == 1 else 's')) 79 | 80 | success_reasons = ('no_reason', 'finished') 81 | if close_reason not in success_reasons: 82 | error_message.append(' Close reason should not be "{}".'.format( 83 | close_reason)) 84 | output.append('\n'.join(error_message)) 85 | 86 | return '\n\n'.join(output) 87 | 88 | 89 | def main(args): 90 | job_list = get_last_24h_jobs(args.apikey, args.project_id) 91 | jobs_with_errors = [job for job in job_list if has_job_error(job)] 92 | 93 | if jobs_with_errors: 94 | report = render_report(jobs_with_errors) 95 | if args.mail: 96 | subject = 'Scrapy Cloud - jobs with errors' 97 | send_email(args.mail, subject, body=report) 98 | else: 99 | print(report) 100 | else: 101 | print('No errors found.') 102 | 103 | 104 | def parse_args(): 105 | parser = argparse.ArgumentParser(description=__doc__) 106 | 107 | parser.add_argument('--apikey', default=os.getenv('SHUB_APIKEY', None), 108 | help='API key to use for scrapinghub (will fallback ' 109 | 'to SHUB_APIKEY variable)') 110 | parser.add_argument('project_id', type=int, 111 | help='Project ID to get info from.') 112 | parser.add_argument('--mail', action='append', help='Send output as email') 113 | args = parser.parse_args() 114 | 115 | if not args.apikey: 116 | parser.error('Please provide an API key with --apikey option') 117 | return args 118 | 119 | 120 | if '__main__' == __name__: 121 | main(parse_args()) 122 | -------------------------------------------------------------------------------- /sc_scripts_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapinghub 2 | -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/sc_scripts_demo/sc_scripts_demo/__init__.py -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'sc_scripts_demo' 4 | 5 | SPIDER_MODULES = ['sc_scripts_demo.spiders'] 6 | NEWSPIDER_MODULE = 'sc_scripts_demo.spiders' 7 | 8 | USER_AGENT = 
'sc_scripts_demo (http://scrapinghub.com)' 9 | 10 | ROBOTSTXT_OBEY = True 11 | -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/spiders/bad_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class SpiderWithErrors(scrapy.Spider): 6 | name = "bad" 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/', 9 | ] 10 | 11 | def parse(self, response): 12 | raise ValueError('Oops, this spider has errors') 13 | -------------------------------------------------------------------------------- /sc_scripts_demo/sc_scripts_demo/spiders/good_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class NiceWorkingSpider(scrapy.Spider): 6 | name = "good" 7 | start_urls = [ 8 | 'http://quotes.toscrape.com/', 9 | ] 10 | 11 | def parse(self, response): 12 | for quote in response.css("div.quote"): 13 | yield { 14 | 'text': quote.css("span.text::text").extract_first(), 15 | 'author': quote.css("small.author::text").extract_first(), 16 | 'tags': quote.css("div.tags > a.tag::text").extract() 17 | } 18 | 19 | next_page_url = response.css("li.next > a::attr(href)").extract_first() 20 | if next_page_url is not None: 21 | yield scrapy.Request(response.urljoin(next_page_url)) 22 | -------------------------------------------------------------------------------- /sc_scripts_demo/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | projects: 2 | default: 105217 3 | requirements_file: requirements.txt 4 | -------------------------------------------------------------------------------- /sc_scripts_demo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sc_scripts_demo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sc_scripts_demo 12 | -------------------------------------------------------------------------------- /sc_scripts_demo/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name='sc_scripts_demo', 6 | version='1.0', 7 | packages=find_packages(), 8 | scripts=[ 9 | 'bin/check_jobs.py', 10 | ], 11 | entry_points={ 12 | 'scrapy': ['settings = sc_scripts_demo.settings'], 13 | }, 14 | ) 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 
19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | .venv/ 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | .scrapy -------------------------------------------------------------------------------- /scrapy_price_monitor/README.md: -------------------------------------------------------------------------------- 1 | Scrapy Price Monitor 2 | ==================== 3 | 4 | This is a simple price monitor built with [Scrapy](https://github.com/scrapy/scrapy) 5 | and [Scrapy Cloud](https://scrapinghub.com/scrapy-cloud). 6 | 7 | It is basically a Scrapy project with one spider for each online retailer that 8 | we want to monitor prices from. In addition to the spiders, there's a Python 9 | Script that is scheduled to run periodically on Scrapy Cloud, checking whether 10 | the latest prices are the best ones in a given time span. If so, the monitor 11 | sends an email alerting you about the price drops. 12 | 13 | 14 | ## Including Products to Monitor 15 | 16 | There's a `resources/urls.json` file that lists the URLs from the products that 17 | we want to monitor. If you just want to include a new product to monitor from 18 | the already supported retailers, just add a new key for that product and add 19 | the URL list as its value, such as: 20 | 21 | { 22 | "headsetlogitech": [ 23 | "https://www.amazon.com/.../B005GTO07O/", 24 | "http://www.bestbuy.com/.../3436118.p", 25 | "http://www.ebay.com/.../110985874014" 26 | ], 27 | "NewProduct": [ 28 | "http://url.for.retailer.x", 29 | "http://url.for.retailer.y", 30 | "http://url.for.retailer.z" 31 | ] 32 | } 33 | 34 | 35 | ## Supporting Further Retailers 36 | 37 | This project currently only works with 3 online retailers, and you can list them 38 | running: 39 | 40 | $ scrapy list 41 | amazon.com 42 | bestbuy.com 43 | ebay.com 44 | 45 | If the retailer that you want to monitor is not yet supported, just create a spider 46 | to handle the product pages from it. To include a spider for samsclub.com, you 47 | could run: 48 | 49 | $ scrapy genspider samsclub.com samsclub.com 50 | 51 | And then, open the spider and add the extraction rules: 52 | 53 | $ scrapy edit samsclub.com 54 | 55 | Have a look at the current spiders and implement the new ones using the same 56 | structure, subclassing `BaseSpider` instead of `scrapy.Spider`. This way, your 57 | spiders will automatically read the URLs list from `resources/urls.json`. 
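For illustration, a new retailer spider only needs a `name` matching the retailer's domain (so that `BaseSpider` picks the right URLs from `resources/urls.json`) and a `parse` method that fills in `url`, `title` and `price`. The sketch below shows the general shape; the samsclub.com selectors are placeholders for illustration, not selectors taken from the real site:

    # price_monitor/spiders/samsclub.py -- hypothetical example
    from .base_spider import BaseSpider


    class SamsclubSpider(BaseSpider):
        name = "samsclub.com"  # must match the domain used in resources/urls.json

        def parse(self, response):
            item = response.meta.get('item', {})
            item['url'] = response.url
            # placeholder selectors -- adjust to the retailer's actual markup
            item['title'] = response.css("h1.product-title::text").extract_first("").strip()
            item['price'] = float(
                response.css("span.product-price::text").re_first(r"[\d.]+") or 0
            )
            yield item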
58 | 59 | 60 | ## Customizing the Price Monitor 61 | 62 | The price monitor sends an email using Amazon SES service, so to run it you 63 | have to set both `AWS_ACCESS_KEY` and `AWS_SECRET_KEY` variables in 64 | `price_monitor/settings.py`. If you want to use another email service, 65 | you have to rewrite the `send_email_alert` function in 66 | `price_monitor/bin/monitor.py`. 67 | 68 | The price monitor can be further customized via parameters to the 69 | `price_monitor/bin/monitor.py` script. We will dig on those parameters 70 | later when showing how to schedule the project on Scrapy Cloud. 71 | 72 | 73 | ## Installing and Running 74 | 75 | 1. Clone this repo: 76 | 77 | $ git clone git@github.com:stummjr/scrapy_price_monitor.git 78 | 79 | 2. Enter the folder and install the project dependencies: 80 | 81 | $ cd scrapy_price_monitor 82 | $ pip install -r requirements.txt 83 | 84 | 3. Create a free forever account on Scrapy Cloud: 85 | https://app.scrapinghub.com/account/signup/. 86 | 87 | 4. Create a Scrapy project on Scrapy Cloud and copy the project id from the project URL. 88 | 89 | 5. Install [Scrapinghub command line tool (shub)](https://github.com/scrapinghub/shub): 90 | 91 | $ pip install shub 92 | 93 | 6. Authenticate using your Scrapinghub API key: 94 | 95 | $ shub login 96 | 97 | 7. Finally, deploy the local project to your Scrapy Cloud project: 98 | 99 | $ shub deploy 100 | 101 | This video also explains how to deploy a Scrapy project to Scrapy Cloud: 102 | https://youtu.be/JYch0zRmcgU 103 | 104 | 105 | ## How to Schedule on Scrapy Cloud 106 | 107 | After you have deployed the project to Scrapy Cloud, it's time to schedule its 108 | execution on Scrapy Cloud. 109 | 110 | This project has two main components: 111 | 112 | - the [**spiders**](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/price_monitor/spiders) that collect prices from the retailers' websites 113 | - the [**price monitor script**](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/bin/monitor.py) that checks whether there's a new deal in the latest prices 114 | 115 | You have to schedule both the spiders and the monitor to run periodically on 116 | Scrapy Cloud. It's a good idea to schedule all the spiders to run at the same 117 | time and schedule the monitor to run about 15 minutes after the spiders. 118 | 119 | Take a look at this video to learn how to schedule periodic jobs on Scrapy Cloud: 120 | https://youtu.be/JYch0zRmcgU?t=1m51s 121 | 122 | 123 | ### Parameters for the Monitor Script 124 | 125 | The monitor script takes these parameters and you can pass them via the parameters box in the 126 | scheduling dialog: 127 | 128 | - `--days`: how many days of data we want to compare with the scraped prices. 129 | - `--threshold`: a margin that you can set to avoid getting alerts from minor price changes. For example, if you set it to 1.0, you will only get alerts when the price drop is bigger than $1.00. 130 | - `--apikey`: your Scrapy Cloud API key. You can get it in: https://app.scrapinghub.com/account/apikey. 131 | - `--project`: the Scrapy Cloud project where the monitor is deployed (you can grab it from your project URL at Scrapy Cloud). 132 | 133 | 134 | ## Running in a Local Environment 135 | 136 | You can run this project on Scrapy Cloud or on your local environment. 
The only dependency 137 | from Scrapy Cloud is the [Collections API](https://doc.scrapinghub.com/api/collections.html), 138 | but the spiders and the monitor can be executed locally. 139 | 140 | To do that, first add your Scrapy Cloud project id to [settings.py `SHUB_PROJ_ID` variable](https://github.com/scrapinghub/sample-projects/blob/master/scrapy_price_monitor/price_monitor/settings.py#L11). 141 | 142 | Then run the spiders via command line: 143 | 144 | $ scrapy crawl bestbuy.com 145 | 146 | This will run the spider named as `bestbuy.com` and store the scraped data into 147 | a Scrapy Cloud collection, under the project you set in the last step. 148 | 149 | You can also run the price monitor via command line: 150 | 151 | $ python bin/monitor.py --apikey --days 2 --threshold 1 --project 152 | -------------------------------------------------------------------------------- /scrapy_price_monitor/bin/monitor.py: -------------------------------------------------------------------------------- 1 | """Simple price monitor built with Scrapy and Scrapy Cloud 2 | """ 3 | import argparse 4 | import os 5 | from datetime import datetime, timedelta 6 | 7 | import boto 8 | from hubstorage import HubstorageClient 9 | from jinja2 import Environment, PackageLoader 10 | from price_monitor import settings 11 | from price_monitor.utils import get_product_names, get_retailers_for_product 12 | from w3lib.html import remove_tags 13 | 14 | jinja_env = Environment(loader=PackageLoader('price_monitor', 'templates')) 15 | 16 | 17 | class DealsChecker(object): 18 | 19 | def __init__(self, latest_deals, previous_deals, price_threshold=0): 20 | self.price_threshold = price_threshold 21 | self.latest_deals = latest_deals 22 | self.previous_deals = previous_deals 23 | 24 | def is_from_latest_crawl(self, deal): 25 | """Checks whether the given deal is from the most recent execution. 26 | """ 27 | return deal in self.latest_deals 28 | 29 | def get_best_deal(self): 30 | """Returns the item with the best overall price. self.price_threshold can be set to avoid 31 | considering minor price drops. 32 | """ 33 | best_so_far = min(self.previous_deals, key=lambda x: x.get('price')) 34 | best_from_last = min(self.latest_deals, key=lambda x: x.get('price')) 35 | if best_from_last.get('price') + self.price_threshold < best_so_far.get('price'): 36 | return best_from_last 37 | else: 38 | return best_so_far 39 | 40 | 41 | class DealsFetcher(object): 42 | 43 | def __init__(self, product_name, apikey, project_id, hours): 44 | self.product_name = product_name 45 | project = HubstorageClient(apikey).get_project(project_id) 46 | self.item_store = project.collections.new_store(product_name) 47 | self.load_items_from_last_n_hours(hours) 48 | 49 | def load_items_from_last_n_hours(self, n=24): 50 | """Load items from the last n hours, from the newest to the oldest. 51 | """ 52 | since_time = int((datetime.now() - timedelta(hours=n)).timestamp() * 1000) 53 | self.deals = [item.get('value') for item in self.fetch_deals_newer_than(since_time)] 54 | 55 | def fetch_deals_newer_than(self, since_time): 56 | return list(self.item_store.get(meta=['_key', '_ts'], startts=since_time)) 57 | 58 | def get_latest_deal_from_retailer(self, retailer): 59 | """Returns the most recently extracted deal from a given retailer. 
60 | """ 61 | for deals in self.deals: 62 | if retailer in deals.get('url'): 63 | return deals 64 | 65 | def get_deals(self): 66 | """Returns a tuple with (deals from latest crawl, deals from previous crawls) 67 | """ 68 | latest_deals = [ 69 | self.get_latest_deal_from_retailer(retailer) 70 | for retailer in get_retailers_for_product(self.product_name) 71 | ] 72 | previous_deals = [ 73 | deal for deal in self.deals if deal not in latest_deals 74 | ] 75 | return latest_deals, previous_deals 76 | 77 | 78 | def send_email_alert(items): 79 | ses = boto.connect_ses(settings.AWS_ACCESS_KEY, settings.AWS_SECRET_KEY) 80 | html_body = jinja_env.get_template('email.html').render(items=items) 81 | 82 | ses.send_email( 83 | settings.EMAIL_ALERT_FROM, 84 | 'Price drop alert', 85 | remove_tags(html_body), 86 | settings.EMAIL_ALERT_TO, 87 | html_body=html_body 88 | ) 89 | 90 | 91 | def main(args): 92 | items = [] 93 | for prod_name in get_product_names(): 94 | fetcher = DealsFetcher(prod_name, args.apikey, args.project, args.days * 24) 95 | checker = DealsChecker(*fetcher.get_deals(), args.threshold) 96 | best_deal = checker.get_best_deal() 97 | if checker.is_from_latest_crawl(best_deal): 98 | items.append(best_deal) 99 | 100 | if items: 101 | send_email_alert(items) 102 | 103 | 104 | def parse_args(): 105 | parser = argparse.ArgumentParser(description=__doc__) 106 | parser.add_argument('--apikey', default=settings.SHUB_KEY or os.getenv('SHUB_KEY'), 107 | help='API key to use for scrapinghub (fallbacks to SHUB_KEY variable)') 108 | parser.add_argument('--days', type=int, default=1, 109 | help='How many days back to compare with the last price') 110 | parser.add_argument('--threshold', type=float, default=0, 111 | help='A margin to avoid raising alerts with minor price drops') 112 | parser.add_argument('--project', type=int, default=settings.SHUB_PROJ_ID, 113 | help='Project ID to get info from') 114 | 115 | return parser.parse_args() 116 | 117 | 118 | if __name__ == '__main__': 119 | main(parse_args()) 120 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/scrapy_price_monitor/price_monitor/__init__.py -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class PriceMonitorItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from price_monitor import settings 3 | from hubstorage import HubstorageClient 4 | from price_monitor.utils import reversed_timestamp, get_product_names 5 | 6 | 7 | class CollectionStoragePipeline(object): 8 | 9 | def open_spider(self, spider): 10 | client = HubstorageClient(auth=settings.SHUB_KEY) 11 | project = client.get_project(settings.SHUB_PROJ_ID) 12 | 
self.data_stores = {} 13 | for product_name in get_product_names(): 14 | self.data_stores[product_name] = project.collections.new_store(product_name) 15 | 16 | def process_item(self, item, spider): 17 | key = "{}-{}-{}".format( 18 | reversed_timestamp(), item.get('product_name'), item.get('retailer') 19 | ) 20 | self.data_stores[item['product_name']].set({'_key': key, 'value': item}) 21 | return item 22 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/resources/urls.json: -------------------------------------------------------------------------------- 1 | { 2 | "headsetlogitech": [ 3 | "https://www.amazon.com/Logitech-Wireless-Headset-Over-Design/dp/B005GTO07O/", 4 | "http://www.bestbuy.com/site/logitech-h600-wireless-headset-black/3436118.p", 5 | "http://www.ebay.com/itm/N-Logitech-Wireless-Headset-H600-Over-The-Head-Design-981-000341-/110985874014" 6 | ], 7 | "webcamlogitech": [ 8 | "https://www.amazon.com/Logitech-Widescreen-Calling-Recording-Desktop/dp/B006JH8T3S/", 9 | "http://www.bestbuy.com/site/logitech-c920-pro-webcam-black/4612476.p?skuId=4612476", 10 | "http://www.ebay.com/itm/Logitech-HD-Pro-Webcam-C920-1080p-Widescreen-Video-Calling-and-Recording-/272381890214" 11 | ], 12 | "amazonechodot": [ 13 | "https://www.amazon.com/dp/B01DFKC2SO", 14 | "http://www.bestbuy.com/site/amazon-echo-dot/5578851.p?skuId=5578851", 15 | "http://www.ebay.com/itm/Amazon-Echo-Dot-2nd-Generation-w-Alexa-Voice-Media-Device-All-New-2016-/201668562192" 16 | ], 17 | "nikoncoolpix": [ 18 | "https://www.amazon.com/Nikon-COOLPIX-B500-Digital-Camera/dp/B01C3LEE9G/", 19 | "http://www.bestbuy.com/site/nikon-coolpix-b500-16-0-megapixel-digital-camera-red/4997500.p?skuId=4997500", 20 | "http://www.ebay.com/itm/Nikon-COOLPIX-B500-Digital-Camera-Red-/162225974018" 21 | ], 22 | "bluemicrophone": [ 23 | "https://www.amazon.com/Blue-Snowball-iCE-Condenser-Microphone/dp/B014PYGTUQ/", 24 | "http://www.bestbuy.com/site/blue-microphones-snowball-usb-cardioid-and-omnidirectional-electret-condenser-vocal-microphone-black/9918056.p?skuId=9918056", 25 | "http://www.ebay.com/itm/Blue-Microphones-Snowball-Black-iCE-Condenser-Microphone-/172260373002" 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | BOT_NAME = 'price_monitor' 5 | SPIDER_MODULES = ['price_monitor.spiders'] 6 | NEWSPIDER_MODULE = 'price_monitor.spiders' 7 | 8 | ROBOTSTXT_OBEY = True 9 | 10 | SHUB_KEY = os.getenv('SHUB_KEY') 11 | # if you want to run it locally, replace '999999' with your Scrapy Cloud project ID below 12 | SHUB_PROJ_ID = os.getenv('SHUB_JOBKEY', '999999').split('/')[0] 13 | 14 | 15 | # settings for Amazon SES email service 16 | AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY') 17 | AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY') 18 | EMAIL_ALERT_FROM = 'Price Monitor <SENDER_EMAIL@provider.com>' 19 | EMAIL_ALERT_TO = ['RECEIVER_EMAIL@provider.com'] 20 | 21 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 22 | ITEM_PIPELINES = { 23 | 'price_monitor.pipelines.CollectionStoragePipeline': 400, 24 | } 25 | 26 | AUTOTHROTTLE_ENABLED = True 27 | # HTTPCACHE_ENABLED = True 28 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/amazon.py: -------------------------------------------------------------------------------- 1 | from .base_spider import BaseSpider 2 | 3 | 4 | class AmazonSpider(BaseSpider): 5 | name = "amazon.com" 6 | 7 | def parse(self, response): 8 | item = response.meta.get('item', {}) 9 | item['url'] = response.url 10 | item['title'] = response.css("span#productTitle::text").extract_first("").strip() 11 | item['price'] = float( 12 | response.css("span#priceblock_ourprice::text").re_first("\$(.*)") or 0 13 | ) 14 | yield item 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/base_spider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pkgutil 3 | import scrapy 4 | from datetime import datetime 5 | 6 | 7 | class BaseSpider(scrapy.Spider): 8 | 9 | def start_requests(self): 10 | products = json.loads(pkgutil.get_data('price_monitor', 'resources/urls.json').decode()) 11 | for name, urls in products.items(): 12 | for url in urls: 13 | if self.name in url: 14 | now = datetime.now().strftime('%Y/%m/%d %H:%M:%S') 15 | item = {'product_name': name, 'retailer': self.name, 'when': now} 16 | yield scrapy.Request(url, meta={'item': item}) 17 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/bestbuy.py: -------------------------------------------------------------------------------- 1 | from .base_spider import BaseSpider 2 | 3 | 4 | class BestbuySpider(BaseSpider): 5 | name = "bestbuy.com" 6 | 7 | def parse(self, response): 8 | item = response.meta.get('item', {}) 9 | item['url'] = response.url 10 | item['title'] = response.css("div#sku-title > h1 ::text").extract_first().strip() 11 | item['price'] = float( 12 | response.css('div.price-block ::attr(data-customer-price)').extract_first(default=0) 13 | ) 14 | yield item 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/spiders/ebay.py: -------------------------------------------------------------------------------- 1 | from extruct.w3cmicrodata import MicrodataExtractor 2 | from .base_spider import BaseSpider 3 | 4 | 5 | class EbaySpider(BaseSpider): 6 | name = "ebay.com" 7 | 8 | def parse(self, response): 9 | extractor = MicrodataExtractor() 10 | properties = extractor.extract(response.body_as_unicode()).get('items')[0].get('properties', {}) 11 | item = response.meta.get('item', {}) 12 | item['url'] = response.url 13 | item['title'] = properties.get('name').replace('Details about', '').strip() 14 | item['price'] = float( 15 | properties.get('offers', {}).get('properties', {}).get('price', 0) 16 | ) 17 | yield item 18 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/templates/email.html: -------------------------------------------------------------------------------- 1 |

<h2>🎉 Hey, we found a good deal! 🎁</h2> 2 | 3 | <table> 4 | {% for item in items %} 5 | <tr> 6 | <td>Product: {{item.title}}</td> 7 | <td>Price: {{item.price}}</td> 8 | <td>Store: {{item.retailer}}</td> 9 | <td>Price obtained at: {{item.when}}</td> 10 | <td>Visit the product page at {{item.retailer}}: {{item.url}}</td> 11 | </tr> 12 | {% endfor %} 13 | </table>
14 | 15 | -------------------------------------------------------------------------------- /scrapy_price_monitor/price_monitor/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pkgutil 3 | from datetime import datetime, timedelta 4 | 5 | 6 | def timestamp_from_reversed(reversed): 7 | return datetime(5000, 1, 1) - timedelta(seconds=float(reversed)) 8 | 9 | 10 | def reversed_timestamp(): 11 | return str((datetime(5000, 1, 1) - datetime.now()).total_seconds()) 12 | 13 | 14 | def normalize_name(name): 15 | return name.replace('-', '') 16 | 17 | 18 | def get_product_names(): 19 | return [ 20 | normalize_name(name) 21 | for name in json.loads( 22 | pkgutil.get_data("price_monitor", "resources/urls.json").decode() 23 | ).keys() 24 | ] 25 | 26 | 27 | def get_retailer_name_from_url(url): 28 | return url.split("://")[1].split("/")[0].replace("www.", "") 29 | 30 | 31 | def get_retailers_for_product(product_name): 32 | data = json.loads( 33 | pkgutil.get_data("price_monitor", "resources/urls.json").decode() 34 | ) 35 | return {get_retailer_name_from_url(url) for url in data[product_name]} 36 | -------------------------------------------------------------------------------- /scrapy_price_monitor/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | boto 3 | extruct 4 | w3lib 5 | jinja2 6 | -------------------------------------------------------------------------------- /scrapy_price_monitor/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | requirements_file: requirements.txt 2 | stacks: 3 | default: scrapy:1.1-py3 4 | -------------------------------------------------------------------------------- /scrapy_price_monitor/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = price_monitor.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = price_monitor 12 | -------------------------------------------------------------------------------- /scrapy_price_monitor/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name='project', 7 | version='1.0', 8 | packages=find_packages(), 9 | package_data={'price_monitor': ['resources/*.json', 'templates/*.html']}, 10 | scripts=['bin/monitor.py'], 11 | entry_points={'scrapy': ['settings = price_monitor.settings']}, 12 | ) 13 | -------------------------------------------------------------------------------- /splash_based_project/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = splash_based_project.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = splash_based_project 12 | -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/__init__.py: -------------------------------------------------------------------------------- 
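Looping back to `price_monitor/utils.py` above, a quick worked example of how `get_retailer_name_from_url()` ties the entries of `resources/urls.json` to the per-retailer spiders (the helper is copied here only so the snippet is self-contained; the URL is one of the entries from `urls.json`):

```python
# Re-implementation of get_retailer_name_from_url from price_monitor/utils.py,
# copied here only so the example runs standalone.
def get_retailer_name_from_url(url):
    return url.split("://")[1].split("/")[0].replace("www.", "")

url = "https://www.amazon.com/dp/B01DFKC2SO"  # one of the entries in resources/urls.json
print(get_retailer_name_from_url(url))        # -> 'amazon.com'

# 'amazon.com' is exactly AmazonSpider.name, and BaseSpider.start_requests()
# uses a simple substring check ("if self.name in url") to route each URL in
# urls.json to the spider responsible for that retailer.
```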
https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/splash_based_project/splash_based_project/__init__.py -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for splash_based_project project 4 | 5 | BOT_NAME = 'splash_based_project' 6 | SPIDER_MODULES = ['splash_based_project.spiders'] 7 | NEWSPIDER_MODULE = 'splash_based_project.spiders' 8 | 9 | # Splash settings 10 | SPLASH_URL = '' # <-- Splash instance URL from Scrapy Cloud 11 | APIKEY = '' # <-- your API key 12 | SPIDER_MIDDLEWARES = { 13 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 14 | } 15 | DOWNLOADER_MIDDLEWARES = { 16 | 'scrapy_splash.SplashCookiesMiddleware': 723, 17 | 'scrapy_splash.SplashMiddleware': 725, 18 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 19 | } 20 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 21 | -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/spiders/quotes-js-1.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy_splash import SplashRequest 3 | 4 | 5 | class QuotesJs1Spider(scrapy.Spider): 6 | """Example spider using Splash to render JavaScript-based pages. 7 | Make sure you configure settings.py according to your Splash 8 | credentials (available on Scrapy Cloud). 9 | """ 10 | 11 | name = 'quotes-js-1' 12 | http_user = '' # <-- your API key goes here 13 | 14 | def start_requests(self): 15 | yield SplashRequest('http://quotes.toscrape.com/js') 16 | 17 | def parse(self, response): 18 | for quote in response.css('div.quote'): 19 | yield { 20 | 'text': quote.css('span.text::text').extract_first(), 21 | 'author': quote.css('span small::text').extract_first(), 22 | 'tags': quote.css('div.tags a.tag::text').extract(), 23 | } 24 | 25 | next_page = response.css('li.next > a::attr(href)').extract_first() 26 | if next_page: 27 | yield SplashRequest(response.urljoin(next_page)) 28 | -------------------------------------------------------------------------------- /splash_based_project/splash_based_project/spiders/quotes-js-2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy_splash import SplashRequest 3 | from w3lib.http import basic_auth_header 4 | 5 | 6 | class QuotesJs2Spider(scrapy.Spider): 7 | """Example spider using Splash to render JavaScript-based pages. 8 | Make sure you configure settings.py with your Splash 9 | credentials (available on Scrapy Cloud). 
10 | """ 11 | name = 'quotes-js-2' 12 | 13 | def start_requests(self): 14 | yield SplashRequest( 15 | 'http://quotes.toscrape.com/js', 16 | splash_headers={ 17 | 'Authorization': basic_auth_header(self.settings['APIKEY'], ''), 18 | }, 19 | ) 20 | 21 | def parse(self, response): 22 | for quote in response.css('div.quote'): 23 | yield { 24 | 'text': quote.css('span.text::text').extract_first(), 25 | 'author': quote.css('span small::text').extract_first(), 26 | 'tags': quote.css('div.tags a.tag::text').extract(), 27 | } 28 | 29 | next_page = response.css('li.next > a::attr(href)').extract_first() 30 | if next_page: 31 | yield SplashRequest( 32 | response.urljoin(next_page), 33 | splash_headers={ 34 | 'Authorization': basic_auth_header(self.settings['APIKEY'], ''), 35 | }, 36 | ) 37 | -------------------------------------------------------------------------------- /splash_crawlera_example/README.md: -------------------------------------------------------------------------------- 1 | # Splash + Crawlera Example Project 2 | 3 | This example project shows how to use [Crawlera](http://scrapinghub.com/crawlera) 4 | (a smart downloader) and [Splash](https://scrapinghub.com/splash) (a JavaScript 5 | rendering service) with Scrapy spiders. 6 | 7 | 8 | ## How does it work? 9 | 10 | The integration between Splash and Crawlera is done by a 11 | [Lua script](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua) 12 | that is sent to Splash with every request created by the spider. This script configures 13 | Splash to use Crawlera as its proxy and also defines a couple rules to avoid doing 14 | useless requests, such as analytics ones, stylesheets, images, etc. 15 | 16 | 17 | ## What do I need to run this project? 
18 | 19 | Here's what you'll need: 20 | 21 | - a Splash instance and a Crawlera account: you can get both via Scrapy Cloud billing page 22 | - you can also run Splash in your own machine following the [instructions here](http://splash.readthedocs.io/en/stable/install.html) 23 | - set your Splash settings this project's [settings.py](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/settings.py) 24 | file: 25 | - `SPLASH_URL`: the URL where your Splash instance is available 26 | - `SPLASH_APIKEY`: your Splash API key (required if you're using an instance from Scrapy Cloud) 27 | - set your Crawlera settings in the same file: 28 | - `CRAWLERA_APIKEY`: the API key for your Crawlera user 29 | -------------------------------------------------------------------------------- /splash_crawlera_example/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy-splash 2 | -------------------------------------------------------------------------------- /splash_crawlera_example/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | requirements_file: requirements.txt 2 | stacks: 3 | default: scrapy:1.3-py3 4 | -------------------------------------------------------------------------------- /splash_crawlera_example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = splash_crawlera_example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = splash_crawlera_example 12 | -------------------------------------------------------------------------------- /splash_crawlera_example/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name = 'project', 6 | version = '1.0', 7 | packages = find_packages(), 8 | package_data = {'splash_crawlera_example': ['scripts/*.lua',]}, 9 | entry_points = {'scrapy': ['settings = splash_crawlera_example.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/splash_crawlera_example/splash_crawlera_example/__init__.py -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua: -------------------------------------------------------------------------------- 1 | function use_crawlera(splash) 2 | -- Make sure you pass your Crawlera API key in the 'crawlera_user' arg. 3 | -- Have a look at the file spiders/quotes-js.py to see how to do it. 4 | -- Find your Crawlera credentials in https://app.scrapinghub.com/ 5 | local user = splash.args.crawlera_user 6 | 7 | local host = 'proxy.crawlera.com' 8 | local port = 8010 9 | local session_header = 'X-Crawlera-Session' 10 | local session_id = 'create' 11 | 12 | splash:on_request(function (request) 13 | -- The commented code below can be used to speed up the crawling 14 | -- process. 
They filter requests to undesired domains and useless 15 | -- resources. Uncomment the ones that make sense to your use case 16 | -- and add your own rules. 17 | 18 | -- Discard requests to advertising and tracking domains. 19 | -- if string.find(request.url, 'doubleclick%.net') or 20 | -- string.find(request.url, 'analytics%.google%.com') then 21 | -- request.abort() 22 | -- return 23 | -- end 24 | 25 | -- Avoid using Crawlera for subresources fetching to increase crawling 26 | -- speed. The example below avoids using Crawlera for URLS starting 27 | -- with 'static.' and the ones ending with '.png'. 28 | -- if string.find(request.url, '://static%.') ~= nil or 29 | -- string.find(request.url, '%.png$') ~= nil then 30 | -- return 31 | -- end 32 | 33 | request:set_header('X-Crawlera-Cookies', 'disable') 34 | request:set_header(session_header, session_id) 35 | request:set_proxy{host, port, username=user, password=''} 36 | end) 37 | 38 | splash:on_response_headers(function (response) 39 | if type(response.headers[session_header]) ~= nil then 40 | session_id = response.headers[session_header] 41 | end 42 | end) 43 | end 44 | 45 | function main(splash) 46 | use_crawlera(splash) 47 | splash:go(splash.args.url) 48 | return splash:html() 49 | end 50 | -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'splash_crawlera_example' 4 | SPIDER_MODULES = ['splash_crawlera_example.spiders'] 5 | NEWSPIDER_MODULE = 'splash_crawlera_example.spiders' 6 | 7 | SPIDER_MIDDLEWARES = { 8 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 9 | } 10 | 11 | DOWNLOADER_MIDDLEWARES = { 12 | 'scrapy_splash.SplashCookiesMiddleware': 723, 13 | 'scrapy_splash.SplashMiddleware': 725, 14 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 15 | } 16 | 17 | CRAWLERA_APIKEY = '' # Your crawlera API key 18 | 19 | # Splash settings 20 | SPLASH_URL = '' # Splash instance URL from Scrapy Cloud 21 | SPLASH_APIKEY = '' # Your API key for the Splash instance hosted on Scrapy Cloud 22 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 23 | -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
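Before wiring `scripts/crawlera.lua` into the spider that follows, it can help to sanity-check the script against Splash directly. The sketch below is not part of the project: it assumes the `requests` library, a Splash instance reachable at `http://localhost:8050` (a Scrapy Cloud Splash instance would additionally need HTTP basic auth with `SPLASH_APIKEY`), and a placeholder Crawlera API key. Splash runs the script's `main()` function via its `/execute` endpoint, and every extra JSON field is exposed to the script as `splash.args.*`:

```python
import requests

# Hypothetical standalone check of scripts/crawlera.lua -- adjust the URL, path and key.
SPLASH_URL = 'http://localhost:8050'  # or your Splash instance from Scrapy Cloud
with open('splash_crawlera_example/scripts/crawlera.lua') as f:
    lua_source = f.read()

resp = requests.post(
    SPLASH_URL + '/execute',
    json={
        'lua_source': lua_source,                 # the script defining main(splash)
        'url': 'http://quotes.toscrape.com/js/',  # available to the script as splash.args.url
        'crawlera_user': 'YOUR_CRAWLERA_APIKEY',  # read by the script as splash.args.crawlera_user
    },
    timeout=90,
)
print(resp.status_code)
print(resp.text[:300])  # main() returns splash:html(), i.e. the rendered page source
```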
5 | -------------------------------------------------------------------------------- /splash_crawlera_example/splash_crawlera_example/spiders/quotes-js.py: -------------------------------------------------------------------------------- 1 | from pkgutil import get_data 2 | import scrapy 3 | from scrapy_splash import SplashRequest 4 | from w3lib.http import basic_auth_header 5 | 6 | 7 | class QuotesJsSpider(scrapy.Spider): 8 | name = 'quotes-js' 9 | 10 | def __init__(self, *args, **kwargs): 11 | # to be able to load the Lua script on Scrapy Cloud, make sure your 12 | # project's setup.py file contains the "package_data" setting, similar 13 | # to this project's setup.py 14 | self.LUA_SOURCE = get_data( 15 | 'splash_crawlera_example', 'scripts/crawlera.lua' 16 | ).decode('utf-8') 17 | super(QuotesJsSpider, self).__init__(*args, **kwargs) 18 | 19 | def start_requests(self): 20 | yield SplashRequest( 21 | url='http://quotes.toscrape.com/js/', 22 | endpoint='execute', 23 | splash_headers={ 24 | 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), 25 | }, 26 | args={ 27 | 'lua_source': self.LUA_SOURCE, 28 | 'crawlera_user': self.settings['CRAWLERA_APIKEY'], 29 | }, 30 | # tell Splash to cache the lua script, to avoid sending it for every request 31 | cache_args=['lua_source'], 32 | ) 33 | 34 | def parse(self, response): 35 | for quote in response.css('div.quote'): 36 | yield { 37 | 'text': quote.css('span.text::text').extract_first(), 38 | 'author': quote.css('span small::text').extract_first(), 39 | 'tags': quote.css('div.tags a.tag::text').extract(), 40 | } 41 | next_page = response.css('li.next > a::attr(href)').extract_first() 42 | if next_page: 43 | yield SplashRequest( 44 | url=response.urljoin(next_page), 45 | endpoint='execute', 46 | splash_headers={ 47 | 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), 48 | }, 49 | args={ 50 | 'lua_source': self.LUA_SOURCE, 51 | 'crawlera_user': self.settings['CRAWLERA_APIKEY'], 52 | }, 53 | cache_args=['lua_source'], 54 | ) 55 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/README.md: -------------------------------------------------------------------------------- 1 | # Zyte Smart Proxy Headless Proxy + Splash + Scrapy Example Project 2 | 3 | This example project shows how to use [Smart Proxy Manager (Formally Crawlera)](https://www.zyte.com/smart-proxy-manager/) 4 | with [Zyte Smart Proxy Headless Proxy](https://github.com/zytedata/zyte-smartproxy-headless-proxy) 5 | and [Splash](https://www.zyte.com/splash/) (a JavaScript 6 | rendering service) with Scrapy spiders. 7 | 8 | 9 | ## How does it work? 10 | 11 | The integration between Splash and Zyte Smart Proxy Headless Proxy is done by a 12 | [Lua script](https://github.com/scrapinghub/sample-projects/blob/master/splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/scripts/smart_proxy_manager.lua) 13 | that is sent to Splash with every request created by the spider. This script configures 14 | Splash to use Zyte Smart Proxy Headless Proxy as its proxy and also defines a couple rules to avoid 15 | doing useless requests, such as analytics ones, stylesheets, images, etc. 16 | 17 | 18 | ## What do I need to run this project? 
19 | 20 | Here's what you'll need: 21 | 22 | - a Splash instance and a Smart Proxy Manager account: you can get both via Scrapy Cloud billing page 23 | - you can also run Splash in your own machine following the [instructions here](http://splash.readthedocs.io/en/stable/install.html) 24 | - Setup and run Zyte Smart Proxy Headless Proxy using this [documentation](https://docs.zyte.com/smart-proxy-manager/headless.html) 25 | - set your Splash settings this project's [settings.py](https://github.com/scrapinghub/sample-projects/blob/master/splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/settings.py) 26 | file: 27 | - `SPLASH_URL`: the URL where your Splash instance is available, by default this is set to `http://localhost:8050`. 28 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | scrapy-splash 3 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | requirements_file: requirements.txt 2 | stacks: 3 | default: scrapy:1.3-py3 4 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = splash_scrapy_spm_headless_proxy_example.settings 8 | 9 | [deploy] 10 | project = splash_scrapy_spm_headless_proxy_example 11 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name = 'project', 6 | version = '1.0', 7 | packages = find_packages(), 8 | package_data = {'splash_scrapy_spm_headless_proxy_example': ['scripts/*.lua',]}, 9 | entry_points = {'scrapy': ['settings = splash_scrapy_spm_headless_proxy_example.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/__init__.py -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/scripts/smart_proxy_manager.lua: -------------------------------------------------------------------------------- 1 | function use_crawlera(splash) 2 | local session_header = 'X-Crawlera-Session' 3 | local session_id = 'create' 4 | 5 | splash:on_request(function (request) 6 | -- The commented code below can be used to speed up the crawling 7 | -- process. They filter requests to undesired domains and useless 8 | -- resources. 
Uncomment the ones that make sense to your use case 9 | -- and add your own rules. 10 | 11 | -- Discard requests to advertising and tracking domains. 12 | -- if string.find(request.url, 'doubleclick%.net') or 13 | -- string.find(request.url, 'analytics%.google%.com') then 14 | -- request.abort() 15 | -- return 16 | -- end 17 | 18 | -- Avoid using Smart Proxy Manager for subresources fetching to increase crawling 19 | -- speed. The example below avoids using Smart Proxy Manager for URLS starting 20 | -- with 'static.' and the ones ending with '.png'. 21 | -- if string.find(request.url, '://static%.') ~= nil or 22 | -- string.find(request.url, '%.png$') ~= nil then 23 | -- return 24 | -- end 25 | 26 | -- Here, Splash will communicate with Zyte SmartProxy (formerly Crawlera) Headless Proxy. 27 | -- Zyte SmartProxy (formerly Crawlera) Headless Proxy should be up and running. 28 | request:set_proxy{"host.docker.internal", 3128} 29 | request:set_header('X-Crawlera-Profile', 'desktop') 30 | request:set_header('X-Crawlera-Cookies', 'disable') 31 | request:set_header(session_header, session_id) 32 | end) 33 | 34 | splash:on_response_headers(function (response) 35 | if type(response.headers[session_header]) ~= nil then 36 | session_id = response.headers[session_header] 37 | end 38 | end) 39 | end 40 | 41 | function main(splash) 42 | use_crawlera(splash) 43 | splash:go(splash.args.url) 44 | splash:wait(1) 45 | return splash:html() 46 | end 47 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'splash_scrapy_spm_headless_proxy_example' 4 | SPIDER_MODULES = ['splash_scrapy_spm_headless_proxy_example.spiders'] 5 | NEWSPIDER_MODULE = 'splash_scrapy_spm_headless_proxy_example.spiders' 6 | 7 | SPIDER_MIDDLEWARES = { 8 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 9 | } 10 | 11 | DOWNLOADER_MIDDLEWARES = { 12 | 'scrapy_splash.SplashCookiesMiddleware': 723, 13 | 'scrapy_splash.SplashMiddleware': 725, 14 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 15 | } 16 | 17 | # Splash settings 18 | SPLASH_URL = 'http://localhost:8050' # Splash instance URL from Scrapy Cloud 19 | SPLASH_APIKEY = '' # Your API key for the Splash instance hosted on Scrapy Cloud 20 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 21 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
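A quick smoke test (not part of the project) to confirm the Zyte Smart Proxy Headless Proxy is up before pointing Splash at it: send a plain request through it. This assumes the `requests` library and the proxy's default port 3128; the Lua script above addresses it as `host.docker.internal:3128` only because Splash runs inside Docker and has to reach the proxy on the host machine:

```python
import requests

# Hypothetical smoke test for the locally running Zyte Smart Proxy Headless Proxy.
proxies = {'http': 'http://localhost:3128'}   # 3128 is assumed to be the headless proxy's port
resp = requests.get('http://quotes.toscrape.com/', proxies=proxies, timeout=60)
print(resp.status_code)  # 200 means the proxy accepted and forwarded the request
```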
5 | -------------------------------------------------------------------------------- /splash_scrapy_spm_headless_proxy_example/splash_scrapy_spm_headless_proxy_example/spiders/quotes-js.py: -------------------------------------------------------------------------------- 1 | from pkgutil import get_data 2 | import scrapy 3 | from scrapy_splash import SplashRequest 4 | 5 | 6 | class QuotesJsSpider(scrapy.Spider): 7 | name = 'quotes-js' 8 | custom_settings = { 9 | 'RETRY_TIMES': 10, 10 | } 11 | 12 | def __init__(self, *args, **kwargs): 13 | # to be able to load the Lua script on Scrapy Cloud, make sure your 14 | # project's setup.py file contains the "package_data" setting, similar 15 | # to this project's setup.py 16 | self.LUA_SOURCE = get_data( 17 | 'splash_scrapy_spm_headless_proxy_example', 'scripts/smart_proxy_manager.lua' 18 | ).decode('utf-8') 19 | super(QuotesJsSpider, self).__init__(*args, **kwargs) 20 | 21 | def start_requests(self): 22 | yield SplashRequest( 23 | url='http://quotes.toscrape.com/js/', 24 | endpoint='execute', 25 | args={ 26 | 'lua_source': self.LUA_SOURCE, 27 | 'timeout': 60, 28 | }, 29 | # tell Splash to cache the lua script, to avoid sending it for every request 30 | cache_args=['lua_source'], 31 | meta={ 32 | 'max_retry_times': 10, 33 | } 34 | ) 35 | 36 | def parse(self, response): 37 | for quote in response.css('div.quote'): 38 | yield { 39 | 'text': quote.css('span.text::text').extract_first(), 40 | 'author': quote.css('span small::text').extract_first(), 41 | 'tags': quote.css('div.tags a.tag::text').extract(), 42 | } 43 | next_page = response.css('li.next > a::attr(href)').extract_first() 44 | if next_page: 45 | yield SplashRequest( 46 | url=response.urljoin(next_page), 47 | endpoint='execute', 48 | args={'lua_source': self.LUA_SOURCE}, 49 | cache_args=['lua_source'], 50 | ) 51 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/README.md: -------------------------------------------------------------------------------- 1 | # Splash + Smart Proxy Manager Example Project 2 | 3 | This example project shows how to use [Smart Proxy Manager (Formally Crawlera)](https://www.zyte.com/smart-proxy-manager/) and [Splash](https://www.zyte.com/splash/) (a JavaScript 4 | rendering service) with Scrapy spiders. 5 | 6 | 7 | ## How does it work? 8 | 9 | The integration between Splash and Smart Proxy Manager is done by a 10 | [Lua script](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/scripts/crawlera.lua) 11 | that is sent to Splash with every request created by the spider. This script configures 12 | Splash to use Smart Proxy Manager as its proxy and also defines a couple rules to avoid doing 13 | useless requests, such as analytics ones, stylesheets, images, etc. 14 | 15 | 16 | ## What do I need to run this project? 
17 | 18 | Here's what you'll need: 19 | 20 | - a Splash instance and a Smart Proxy Manager account: you can get both via Scrapy Cloud billing page 21 | - you can also run Splash in your own machine following the [instructions here](http://splash.readthedocs.io/en/stable/install.html) 22 | - set your Splash settings this project's [settings.py](https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/splash_crawlera_example/settings.py) 23 | file: 24 | - `SPLASH_URL`: the URL where your Splash instance is available 25 | - `SPLASH_APIKEY`: your Splash API key (required if you're using an instance from Scrapy Cloud) 26 | - set your Crawlera settings in the same file: 27 | - `ZYTE_SMARTPROXY_APIKEY`: the API key for your Smart Proxy Manager user 28 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/requirements.txt: -------------------------------------------------------------------------------- 1 | scrapy 2 | scrapy-splash 3 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | requirements_file: requirements.txt 2 | stacks: 3 | default: scrapy:1.3-py3 4 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = splash_smart_proxy_manager_example.settings 8 | 9 | [deploy] 10 | project = splash_smart_proxy_manager_example 11 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | from setuptools import setup, find_packages 3 | 4 | setup( 5 | name = 'project', 6 | version = '1.0', 7 | packages = find_packages(), 8 | package_data = {'splash_smart_proxy_manager_example': ['scripts/*.lua',]}, 9 | entry_points = {'scrapy': ['settings = splash_smart_proxy_manager_example.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scrapinghub/sample-projects/fdff7a0be50041c30e059568aca0d81d1183ad53/splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/__init__.py -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/scripts/smart_proxy_manager.lua: -------------------------------------------------------------------------------- 1 | function use_crawlera(splash) 2 | -- Make sure you pass your Smart Proxy Manager API key in the 'crawlera_user' arg. 3 | -- Have a look at the file spiders/quotes-js.py to see how to do it. 
4 | -- Find your Smart Proxy Manager credentials in https://app.zyte.com/ 5 | local user = splash.args.crawlera_user 6 | local password = '' 7 | 8 | local host = 'proxy.zyte.com' 9 | local port = 8011 10 | local session_header = 'X-Crawlera-Session' 11 | local session_id = 'create' 12 | 13 | splash:on_request(function (request) 14 | -- The commented code below can be used to speed up the crawling 15 | -- process. They filter requests to undesired domains and useless 16 | -- resources. Uncomment the ones that make sense to your use case 17 | -- and add your own rules. 18 | 19 | -- Discard requests to advertising and tracking domains. 20 | -- if string.find(request.url, 'doubleclick%.net') or 21 | -- string.find(request.url, 'analytics%.google%.com') then 22 | -- request.abort() 23 | -- return 24 | -- end 25 | 26 | -- Avoid using Smart Proxy Manager for subresources fetching to increase crawling 27 | -- speed. The example below avoids using Smart Proxy Manager for URLS starting 28 | -- with 'static.' and the ones ending with '.png'. 29 | -- if string.find(request.url, '://static%.') ~= nil or 30 | -- string.find(request.url, '%.png$') ~= nil then 31 | -- return 32 | -- end 33 | request:set_proxy(host, port, user, password) 34 | request:set_header('X-Crawlera-Profile', 'desktop') 35 | request:set_header('X-Crawlera-Cookies', 'disable') 36 | request:set_header(session_header, session_id) 37 | end) 38 | 39 | splash:on_response_headers(function (response) 40 | if type(response.headers[session_header]) ~= nil then 41 | session_id = response.headers[session_header] 42 | end 43 | end) 44 | end 45 | 46 | function main(splash) 47 | use_crawlera(splash) 48 | splash:go(splash.args.url) 49 | splash:wait(1) 50 | return splash:html() 51 | end 52 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'splash_smart_proxy_manager_example' 4 | SPIDER_MODULES = ['splash_smart_proxy_manager_example.spiders'] 5 | NEWSPIDER_MODULE = 'splash_smart_proxy_manager_example.spiders' 6 | 7 | SPIDER_MIDDLEWARES = { 8 | 'scrapy_splash.SplashDeduplicateArgsMiddleware': 100, 9 | } 10 | 11 | DOWNLOADER_MIDDLEWARES = { 12 | 'scrapy_splash.SplashCookiesMiddleware': 723, 13 | 'scrapy_splash.SplashMiddleware': 725, 14 | 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810, 15 | } 16 | 17 | 18 | ZYTE_SMARTPROXY_APIKEY = 'API-KEY' 19 | 20 | # Splash settings 21 | SPLASH_URL = '' # Splash instance URL from Scrapy Cloud 22 | SPLASH_APIKEY = '' # Your API key for the Splash instance hosted on Scrapy Cloud 23 | DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter' 24 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
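Before plugging `ZYTE_SMARTPROXY_APIKEY` into Splash, the key can be verified with a plain proxied request. This sketch is not part of the project; it assumes the `requests` library and reuses the endpoint (`proxy.zyte.com:8011`) and empty password that `scripts/smart_proxy_manager.lua` above hard-codes:

```python
import requests

# Hypothetical credential check for Smart Proxy Manager (formerly Crawlera).
apikey = 'YOUR_ZYTE_SMARTPROXY_APIKEY'  # placeholder -- use your real key
proxies = {'http': 'http://{}:@proxy.zyte.com:8011'.format(apikey)}

resp = requests.get('http://quotes.toscrape.com/', proxies=proxies, timeout=60)
print(resp.status_code)  # 200 means the key was accepted and the request was proxied
```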
5 | -------------------------------------------------------------------------------- /splash_smart_proxy_manager_example/splash_smart_proxy_manager_example/spiders/quotes-js.py: -------------------------------------------------------------------------------- 1 | from pkgutil import get_data 2 | import scrapy 3 | from scrapy_splash import SplashRequest 4 | from w3lib.http import basic_auth_header 5 | 6 | 7 | class QuotesJsSpider(scrapy.Spider): 8 | name = 'quotes-js' 9 | custom_settings = { 10 | 'RETRY_TIMES': 10, 11 | } 12 | 13 | 14 | def __init__(self, *args, **kwargs): 15 | # to be able to load the Lua script on Scrapy Cloud, make sure your 16 | # project's setup.py file contains the "package_data" setting, similar 17 | # to this project's setup.py 18 | self.LUA_SOURCE = get_data( 19 | 'splash_smart_proxy_manager_example', 'scripts/smart_proxy_manager.lua' 20 | ).decode('utf-8') 21 | super(QuotesJsSpider, self).__init__(*args, **kwargs) 22 | 23 | def start_requests(self): 24 | yield SplashRequest( 25 | url='http://quotes.toscrape.com/js/', 26 | endpoint='execute', 27 | splash_headers={ 28 | 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), 29 | }, 30 | args={ 31 | 'lua_source': self.LUA_SOURCE, 32 | 'crawlera_user': self.settings['ZYTE_SMARTPROXY_APIKEY'], 33 | 'timeout': 60, 34 | }, 35 | # tell Splash to cache the lua script, to avoid sending it for every request 36 | cache_args=['lua_source'], 37 | meta={ 38 | 'max_retry_times': 10, 39 | } 40 | ) 41 | 42 | def parse(self, response): 43 | for quote in response.css('div.quote'): 44 | yield { 45 | 'text': quote.css('span.text::text').extract_first(), 46 | 'author': quote.css('span small::text').extract_first(), 47 | 'tags': quote.css('div.tags a.tag::text').extract(), 48 | } 49 | next_page = response.css('li.next > a::attr(href)').extract_first() 50 | if next_page: 51 | yield SplashRequest( 52 | url=response.urljoin(next_page), 53 | endpoint='execute', 54 | splash_headers={ 55 | 'Authorization': basic_auth_header(self.settings['SPLASH_APIKEY'], ''), 56 | }, 57 | args={ 58 | 'lua_source': self.LUA_SOURCE, 59 | 'crawlera_user': self.settings['ZYTE_SMARTPROXY_APIKEY'], 60 | }, 61 | cache_args=['lua_source'], 62 | ) 63 | --------------------------------------------------------------------------------