├── selenium_demo
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── dribbble_spider.py
│   ├── pipelines.py
│   ├── items.py
│   ├── settings.py
│   └── middlewares.py
├── requirements.txt
├── chromedriver
├── scrapy.cfg
├── LICENSE
├── .gitignore
└── README.md
/selenium_demo/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy>=1.7.3 2 | # the spider relies on Selenium 3 APIs (executable_path, find_element_by_css_selector), so stay below Selenium 4 3 | selenium>=3.141.0,<4 4 | -------------------------------------------------------------------------------- /chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/harrywang/scrapy-selenium-demo/HEAD/chromedriver -------------------------------------------------------------------------------- /selenium_demo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = selenium_demo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = selenium_demo 12 | -------------------------------------------------------------------------------- /selenium_demo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class SeleniumDemoPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /selenium_demo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class SeleniumDemoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Harry Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 |
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .DS_Store 107 | 108 | /local_output/*.html 109 | /local_output/*.json 110 | 111 | # sqlite 112 | *.db 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scrapy + Selenium Demo 2 | 3 | This repo contains the code for Part V of my tutorial: A Minimalist End-to-End Scrapy Tutorial (https://medium.com/p/11e350bcdec0). 4 | 5 | The website to crawl is https://dribbble.com/designers, which is an infinite scroll page. 6 | 7 | I borrowed some code from "[Web Scraping: A Less Brief Overview of Scrapy and Selenium, Part II](https://towardsdatascience.com/web-scraping-a-less-brief-overview-of-scrapy-and-selenium-part-ii-3ad290ce7ba1)" - many thanks to the author! 
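In a nutshell, the spider drives a real Chrome instance with Selenium, scrolls to the bottom of the page repeatedly, and re-parses `driver.page_source` with a Scrapy `Selector` after each scroll. Here is a condensed sketch of the pattern used in `spiders/dribbble_spider.py` (the actual spider also caps the number of scrolls, logs its progress, and loads chromedriver from the project root):

```python
from time import sleep

from scrapy.selector import Selector
from selenium import webdriver

driver = webdriver.Chrome()  # simplified: assumes a matching chromedriver is on your PATH
driver.get('https://dribbble.com/designers')

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(5)  # give the infinite scroll time to load more designers
    # re-parse the rendered HTML after every scroll to pick up newly loaded content
    names = Selector(text=driver.page_source).css('.vcard a[data-subject]::text').getall()
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

print('extracted {} designer names'.format(len(names)))
driver.quit()
```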
8 | 9 | ## Setup 10 | Tested with Python 3.6 via virtual environment: 11 | ```shell 12 | $ python3.6 -m venv venv 13 | $ source venv/bin/activate 14 | $ pip install -r requirements.txt 15 | ``` 16 | 17 | Chrome Driver: 18 | 19 | You need to download the Chrome driver from: https://chromedriver.chromium.org/downloads 20 | 21 | Note: the version of the driver must match the version of Chrome installed on your machine for this to work. 22 | 23 | For example, this repo uses chromedriver 77.0.3865.40, which supports Chrome version 77 - make sure your installed Chrome is version 77 (check via Menu --> Chrome --> About Google Chrome). 24 | 25 | ## Run 26 | 27 | Run `scrapy crawl dribbble`, which should start an instance of Chrome and scroll to the bottom of the page automatically. The extracted data is logged to the console. 28 | 29 | ## Use ProxyMesh with Scrapy 30 | 31 | You must set the `http_proxy` environment variable, then activate the HttpProxyMiddleware. 32 | 33 | For HTTP: 34 | 35 | ```bash 36 | $ export http_proxy=http://USERNAME:PASSWORD@HOST:PORT 37 | ``` 38 | 39 | for example: 40 | ```bash 41 | $ export http_proxy=http://harrywang:mypassword@us-wa.proxymesh.com:31280 42 | ``` 43 | 44 | For HTTPS: 45 | 46 | For HTTPS requests, use IP authentication and remove `USERNAME:PASSWORD@` from the `http_proxy` variable. 47 | 48 | To activate the HttpProxyMiddleware, uncomment the following part in `settings.py`: 49 | 50 | ```python 51 | DOWNLOADER_MIDDLEWARES = { 52 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100, 53 | } 54 | ``` 55 | ## Use ProxyMesh with Selenium 56 | 57 | IP authentication must be set up first: add the IP of the machine running this script to your ProxyMesh account. Then, uncomment the following two lines in the spider file. 58 | 59 | ```python 60 | # PROXY = "us-wa.proxymesh.com:31280" 61 | # chrome_options.add_argument('--proxy-server=%s' % PROXY) 62 | ``` 63 | -------------------------------------------------------------------------------- /selenium_demo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for selenium_demo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'selenium_demo' 13 | 14 | SPIDER_MODULES = ['selenium_demo.spiders'] 15 | NEWSPIDER_MODULE = 'selenium_demo.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'selenium_demo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'selenium_demo.middlewares.SeleniumDemoSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 100, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'selenium_demo.pipelines.SeleniumDemoPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /selenium_demo/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SeleniumDemoSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class SeleniumDemoDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /selenium_demo/spiders/dribbble_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import os 3 | from time import sleep 4 | from scrapy.selector import Selector 5 | from selenium import webdriver 6 | from selenium.webdriver.chrome.options import Options 7 | from selenium.webdriver.common.keys import Keys 8 | 9 | basedir = os.path.dirname(os.path.realpath('__file__'))  # note: '__file__' is a literal string here, so basedir is the current working directory; run scrapy from the project root so chromedriver is found 10 | 11 | class DribbbleSpider(scrapy.Spider): 12 | name = "dribbble" 13 | allowed_domains = ["dribbble.com"] 14 | start_urls = ['https://dribbble.com/designers'] 15 | 16 | 17 | def parse(self, response): 18 | 19 | # download the chrome driver from https://chromedriver.chromium.org/downloads 20 | # the version of the driver must match the version of Chrome installed for this to work 21 | 22 | # instantiate a chrome options object so you can set the size and headless preference 23 | chrome_options = Options() 24 | chrome_options.add_argument("--window-size=1920x1080") 25 | 26 | # uncomment the following line (headless mode) if you don't want to show a Chrome window 27 | # but you can still see that the crawling is working via the console output 28 | 29 | # chrome_options.add_argument("--headless") 30 | 31 | 32 | # uncomment the following two lines to set up the ProxyMesh service 33 | # make sure you add the IP of the machine running this script to your ProxyMesh account for IP authentication 34 | # you get the IP:PORT or HOST:PORT in your account once you pay for a plan 35 | 36 | # PROXY = "us-wa.proxymesh.com:31280" 37 | # chrome_options.add_argument('--proxy-server=%s' % PROXY) 38 | 39 | chrome_driver_path = os.path.join(basedir, 'chromedriver') 40 | driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=chrome_driver_path) 41 | 42 | driver.get('https://dribbble.com/designers') 43 | scrapy_selector = Selector(text = driver.page_source) 44 | 45 | self.logger.info("*********** before scrolling ************") 46 | self.logger.info(scrapy_selector.css('.vcard a[data-subject]::text').getall()) 47 | self.logger.info(len(scrapy_selector.css('.vcard a[data-subject]::text').getall())) 48 | 49 | # designer page with an infinite scroll 50 | last_height = driver.execute_script("return document.body.scrollHeight") 51 | SCROLL_PAUSE_TIME = 5 52 | MAX_SCROLL = 10 53 | i = 0 54 | while i <= MAX_SCROLL: 55 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 56 | i += 1 57 | # IMPORTANT!!!
58 | # you have to get the selector again after each scroll 59 | # in order to pick up the newly loaded content 60 | scrapy_selector = Selector(text = driver.page_source) 61 | self.logger.info("*********** during scrolling ************") 62 | self.logger.info("Total scrolls executed: {}".format(i)) 63 | self.logger.info("Designer names extracted so far: {}".format(scrapy_selector.css('.vcard a[data-subject]::text').getall())) 64 | self.logger.info("Total names extracted: {}".format(len(scrapy_selector.css('.vcard a[data-subject]::text').getall()))) 65 | 66 | sleep(SCROLL_PAUSE_TIME) 67 | new_height = driver.execute_script("return document.body.scrollHeight") 68 | if new_height == last_height: 69 | break 70 | last_height = new_height 71 | 72 | self.logger.info("*********** scrolling done ************") 73 | self.logger.info("Final designer names extracted: {}".format(scrapy_selector.css('.vcard a[data-subject]::text').getall())) 74 | self.logger.info("Final total names extracted: {}".format(len(scrapy_selector.css('.vcard a[data-subject]::text').getall()))) 75 | 76 | # the following demonstrates how to find the location search box, 77 | # enter "New York" and then click the search button 78 | driver.find_element_by_css_selector('#location-selectized').send_keys('New York') 79 | sleep(1) 80 | search_button = driver.find_element_by_css_selector('input[type="submit"]') 81 | search_button.click() 82 | sleep(5) 83 | driver.quit() 84 | --------------------------------------------------------------------------------
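The spider above only logs the extracted designer names; `items.py` and `pipelines.py` are still empty stubs. If you want the names to flow through Scrapy's item pipeline instead, one option is to wrap the Selenium-rendered HTML in an `HtmlResponse` and yield one item per designer. A minimal sketch (not part of the repo; the helper name `designer_items` is illustrative), reusing the same CSS selector as the spider:

```python
from scrapy.http import HtmlResponse

def designer_items(url, page_source):
    """Wrap Selenium's rendered HTML in a Scrapy response and yield one dict per designer."""
    response = HtmlResponse(url=url, body=page_source, encoding='utf-8')
    for name in response.css('.vcard a[data-subject]::text').getall():
        # plain dicts work as items; switch to SeleniumDemoItem once it defines a `name` field
        yield {'designer_name': name.strip()}
```

Inside `parse()`, this could be used as `yield from designer_items(driver.current_url, driver.page_source)` before `driver.quit()`, with `ITEM_PIPELINES` enabled in `settings.py`.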