├── selenium_demo
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── dribbble_spider.py
│   ├── pipelines.py
│   ├── items.py
│   ├── settings.py
│   └── middlewares.py
├── requirements.txt
├── chromedriver
├── scrapy.cfg
├── LICENSE
├── .gitignore
└── README.md
/selenium_demo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy>=1.7.3 2 | # the spider relies on Selenium 3 APIs (executable_path, find_element_by_css_selector), so stay below Selenium 4 3 | selenium>=3.141.0,<4 4 | -------------------------------------------------------------------------------- /chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/scrapy-selenium-demo/HEAD/chromedriver -------------------------------------------------------------------------------- /selenium_demo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = selenium_demo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = selenium_demo 12 | -------------------------------------------------------------------------------- /selenium_demo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class SeleniumDemoPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /selenium_demo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SeleniumDemoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Harry Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 |
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .DS_Store 107 | 108 | /local_output/*.html 109 | /local_output/*.json 110 | 111 | # sqlite 112 | *.db 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy + Selenium Demo 2 | 3 | This repo contains the code for Part V of my tutorial: A Minimalist End-to-End Scrapy Tutorial (https://medium.com/p/11e350bcdec0). 4 | 5 | The website to crawl is https://dribbble.com/designers, which is an infinite scroll page. 6 | 7 | I borrowed some code from "[Web Scraping: A Less Brief Overview of Scrapy and Selenium, Part II](https://towardsdatascience.com/web-scraping-a-less-brief-overview-of-scrapy-and-selenium-part-ii-3ad290ce7ba1)" - many thanks to the author! 
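In a nutshell, the spider drives a real Chrome instance with Selenium, scrolls to the bottom of the page repeatedly, and re-parses `driver.page_source` with a Scrapy `Selector` after each scroll. Here is a condensed sketch of the pattern used in `spiders/dribbble_spider.py` (the actual spider also caps the number of scrolls, logs its progress, and loads chromedriver from the project root):

```python
from time import sleep

from scrapy.selector import Selector
from selenium import webdriver

driver = webdriver.Chrome()  # simplified: assumes a matching chromedriver is on your PATH
driver.get('https://dribbble.com/designers')

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(5)  # give the infinite scroll time to load more designers
    # re-parse the rendered HTML after every scroll to pick up newly loaded content
    names = Selector(text=driver.page_source).css('.vcard a[data-subject]::text').getall()
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

print('extracted {} designer names'.format(len(names)))
driver.quit()
```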
8 | 9 | ## Setup 10 | Tested with Python 3.6 via virtual environment: 11 | ```shell 12 | $ python3.6 -m venv venv 13 | $ source venv/bin/activate 14 | $ pip install -r requirements.txt 15 | ``` 16 | 17 | Chrome Driver: 18 | 19 | You need to download the Chrome driver from: https://chromedriver.chromium.org/downloads 20 | 21 | Note: the version of the driver must match the version of Chrome installed on your machine for this to work. 22 | 23 | For example, this repo uses chromedriver 77.0.3865.40, which supports Chrome version 77 - make sure your installed Chrome is version 77 (check via Menu --> Chrome --> About Google Chrome). 24 | 25 | ## Run 26 | 27 | Run `scrapy crawl dribbble`, which should start an instance of Chrome and scroll to the bottom of the page automatically. The extracted data is logged to the console. 28 | 29 | ## Use ProxyMesh with Scrapy 30 | 31 | You must set the `http_proxy` environment variable, then activate the HttpProxyMiddleware. 32 | 33 | For HTTP: 34 | 35 | ```bash 36 | $ export http_proxy=http://USERNAME:PASSWORD@HOST:PORT 37 | ``` 38 | 39 | for example: 40 | ```bash 41 | $ export http_proxy=http://harrywang:mypassword@us-wa.proxymesh.com:31280 42 | ``` 43 | 44 | For HTTPS: 45 | 46 | For HTTPS requests, use IP authentication and remove `USERNAME:PASSWORD@` from the `http_proxy` variable. 47 | 48 | To activate the HttpProxyMiddleware, uncomment the following part in `settings.py`: 49 | 50 | ```python 51 | DOWNLOADER_MIDDLEWARES = { 52 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100, 53 | } 54 | ``` 55 | ## Use ProxyMesh with Selenium 56 | 57 | IP authentication must be set up first: add the IP of the machine running this script to your ProxyMesh account. Then, uncomment the following two lines in the spider file. 58 | 59 | ```python 60 | # PROXY = "us-wa.proxymesh.com:31280" 61 | # chrome_options.add_argument('--proxy-server=%s' % PROXY) 62 | ``` 63 | -------------------------------------------------------------------------------- /selenium_demo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for selenium_demo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'selenium_demo' 13 | 14 | SPIDER_MODULES = ['selenium_demo.spiders'] 15 | NEWSPIDER_MODULE = 'selenium_demo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'selenium_demo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'selenium_demo.middlewares.SeleniumDemoSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'selenium_demo.pipelines.SeleniumDemoPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /selenium_demo/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SeleniumDemoSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class SeleniumDemoDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /selenium_demo/spiders/dribbble_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import os 3 | from time import sleep 4 | from scrapy.selector import Selector 5 | from selenium import webdriver 6 | from selenium.webdriver.chrome.options import Options 7 | from selenium.webdriver.common.keys import Keys 8 | 9 | basedir = os.path.dirname(os.path.realpath('__file__'))  # note: '__file__' is a literal string here, so basedir is the current working directory; run scrapy from the project root so chromedriver is found 10 | 11 | class DribbbleSpider(scrapy.Spider): 12 | name = "dribbble" 13 | allowed_domains = ["dribbble.com"] 14 | start_urls = ['https://dribbble.com/designers'] 15 | 16 | 17 | def parse(self, response): 18 | 19 | # download the chrome driver from https://chromedriver.chromium.org/downloads 20 | # the version of the driver must match the version of Chrome installed for this to work 21 | 22 | # instantiate a chrome options object so you can set the size and headless preference 23 | chrome_options = Options() 24 | chrome_options.add_argument("--window-size=1920x1080") 25 | 26 | # uncomment the following line (headless mode) if you don't want to show a Chrome window 27 | # but you can still see that the crawling is working via the console output 28 | 29 | # chrome_options.add_argument("--headless") 30 | 31 | 32 | # uncomment the following two lines to set up the ProxyMesh service 33 | # make sure you add the IP of the machine running this script to your ProxyMesh account for IP authentication 34 | # you get the IP:PORT or HOST:PORT in your account once you pay for a plan 35 | 36 | # PROXY = "us-wa.proxymesh.com:31280" 37 | # chrome_options.add_argument('--proxy-server=%s' % PROXY) 38 | 39 | chrome_driver_path = os.path.join(basedir, 'chromedriver') 40 | driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver_path) 41 | 42 | driver.get('https://dribbble.com/designers') 43 | scrapy_selector = Selector(text = driver.page_source) 44 | 45 | self.logger.info("*********** before scrolling ************") 46 | self.logger.info(scrapy_selector.css('.vcard a[data-subject]::text').getall()) 47 | self.logger.info(len(scrapy_selector.css('.vcard a[data-subject]::text').getall())) 48 | 49 | # designer page with an infinite scroll 50 | last_height = driver.execute_script("return document.body.scrollHeight") 51 | SCROLL_PAUSE_TIME = 5 52 | MAX_SCROLL = 10 53 | i = 0 54 | while i <= MAX_SCROLL: 55 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 56 | i += 1 57 | # IMPORTANT!!!
58 | # you have to get the selector again after each scroll 59 | # in order to pick up the newly loaded content 60 | scrapy_selector = Selector(text = driver.page_source) 61 | self.logger.info("*********** during scrolling ************") 62 | self.logger.info("Total scrolls executed: {}".format(i)) 63 | self.logger.info("Designer names extracted so far: {}".format(scrapy_selector.css('.vcard a[data-subject]::text').getall())) 64 | self.logger.info("Total names extracted: {}".format(len(scrapy_selector.css('.vcard a[data-subject]::text').getall()))) 65 | 66 | sleep(SCROLL_PAUSE_TIME) 67 | new_height = driver.execute_script("return document.body.scrollHeight") 68 | if new_height == last_height: 69 | break 70 | last_height = new_height 71 | 72 | self.logger.info("*********** scrolling done ************") 73 | self.logger.info("Final designer names extracted: {}".format(scrapy_selector.css('.vcard a[data-subject]::text').getall())) 74 | self.logger.info("Final total names extracted: {}".format(len(scrapy_selector.css('.vcard a[data-subject]::text').getall()))) 75 | 76 | # the following demonstrates how to find the location search box, 77 | # enter "New York" and then click the search button 78 | driver.find_element_by_css_selector('#location-selectized').send_keys('New York') 79 | sleep(1) 80 | search_button = driver.find_element_by_css_selector('input[type="submit"]') 81 | search_button.click() 82 | sleep(5) 83 | driver.quit() 84 | --------------------------------------------------------------------------------
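The spider above only logs the extracted designer names; `items.py` and `pipelines.py` are still empty stubs. If you want the names to flow through Scrapy's item pipeline instead, one option is to wrap the Selenium-rendered HTML in an `HtmlResponse` and yield one item per designer. A minimal sketch (not part of the repo; the helper name `designer_items` is illustrative), reusing the same CSS selector as the spider:

```python
from scrapy.http import HtmlResponse

def designer_items(url, page_source):
    """Wrap Selenium's rendered HTML in a Scrapy response and yield one dict per designer."""
    response = HtmlResponse(url=url, body=page_source, encoding='utf-8')
    for name in response.css('.vcard a[data-subject]::text').getall():
        # plain dicts work as items; switch to SeleniumDemoItem once it defines a `name` field
        yield {'designer_name': name.strip()}
```

Inside `parse()`, this could be used as `yield from designer_items(driver.current_url, driver.page_source)` before `driver.quit()`, with `ITEM_PIPELINES` enabled in `settings.py`.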