├── .gitignore ├── README.md ├── scrapy.cfg └── tor ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders └── __init__.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scrapy-tor 2 | This is a scrapy project skeleton with Tor integration 3 | 4 | # How to get started 5 | Beacuse scrapy does not work with SOCKS proxy, you'll need to set up a web proxy server that relays requests to Tor. 6 | You can install [Polipo](http://www.pps.univ-paris-diderot.fr/~jch/software/polipo/), a lightweight web proxy. Then point Polipo to Tor's listening port, which is 9050 by default. 7 | 8 | Uncomment or add the following lines to Polipo's config file `etc/polipo/config` to set up Polipo. 9 | ``` 10 | socksParentProxy = localhost:9050 11 | disableLocalInterface=true 12 | diskCacheRoot = "" 13 | ``` 14 | The function `ProxyMiddleware` defined in `middlewares.py` will relay all scrapy's requests to Polipo's default port of 8123 15 | 16 | Don't forget to start Polipo and Tor before scraping! 17 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tor.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = tor 12 | -------------------------------------------------------------------------------- /tor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cvibhagool/scrapy-tor/00bcebdb57bbde478b7e708217b4f9899cf4270c/tor/__init__.py -------------------------------------------------------------------------------- /tor/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TorItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /tor/middlewares.py: -------------------------------------------------------------------------------- 1 | from scrapy.conf import settings 2 | 3 | class ProxyMiddleware(object): 4 | #Overide the request process by making it go through Tor 5 | def process_request(self, request, spider): 6 | request.meta['proxy'] = settings.['HTTP_PROXY'] -------------------------------------------------------------------------------- /tor/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TorPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /tor/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tor project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tor' 13 | 14 | SPIDER_MODULES = ['tor.spiders'] 15 | NEWSPIDER_MODULE = 'tor.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'tor (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'tor.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | 53 | #This points to your local proxy server that talks to Tor 54 | HTTP_PROXY = 'http://127.0.0.1:8123' 55 | DOWNLOADER_MIDDLEWARES = { 56 | #Tor Middleware 57 | 'tor.middlewares.ProxyMiddleware': 400 58 | } 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'tor.pipelines.SomePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 75 | #AUTOTHROTTLE_ENABLED=True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY=5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY=60 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG=False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED=True 86 | #HTTPCACHE_EXPIRATION_SECS=0 87 | #HTTPCACHE_DIR='httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 89 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | -------------------------------------------------------------------------------- /tor/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | --------------------------------------------------------------------------------