├── .gitignore
├── README.md
├── scrapy.cfg
└── tor
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        └── __init__.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # scrapy-tor
 2 | This is a scrapy project skeleton with Tor integration
 3 | 
 4 | # How to get started
 5 | Beacuse scrapy does not work with SOCKS proxy, you'll need to set up a web proxy server that relays requests to Tor. 
 6 | You can install [Polipo](http://www.pps.univ-paris-diderot.fr/~jch/software/polipo/), a lightweight web proxy. Then point Polipo to Tor's listening port, which is 9050 by default.
 7 | 
 8 | Uncomment or add the following lines to Polipo's config file `etc/polipo/config` to set up Polipo.
 9 | ```
10 | socksParentProxy = localhost:9050
11 | disableLocalInterface=true
12 | diskCacheRoot = ""
13 | ```
14 | The function `ProxyMiddleware` defined in `middlewares.py` will relay all scrapy's requests to Polipo's default port of 8123
15 | 
16 | Don't forget to start Polipo and Tor before scraping!
17 | 


--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = tor.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = tor
12 | 


--------------------------------------------------------------------------------
/tor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cvibhagool/scrapy-tor/00bcebdb57bbde478b7e708217b4f9899cf4270c/tor/__init__.py


--------------------------------------------------------------------------------
/tor/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class TorItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/tor/middlewares.py:
--------------------------------------------------------------------------------
1 | from scrapy.conf import settings
2 | 
3 | class ProxyMiddleware(object):
4 |     #Overide the request process by making it go through Tor
5 |     def process_request(self, request, spider):
6 |         request.meta['proxy'] = settings.['HTTP_PROXY']


--------------------------------------------------------------------------------
/tor/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class TorPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/tor/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for tor project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'tor'
13 | 
14 | SPIDER_MODULES = ['tor.spiders']
15 | NEWSPIDER_MODULE = 'tor.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'tor (+http://www.yourdomain.com)'
20 | 
21 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
22 | #CONCURRENT_REQUESTS=32
23 | 
24 | # Configure a delay for requests for the same website (default: 0)
25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
26 | # See also autothrottle settings and docs
27 | #DOWNLOAD_DELAY=3
28 | # The download delay setting will honor only one of:
29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16
30 | #CONCURRENT_REQUESTS_PER_IP=16
31 | 
32 | # Disable cookies (enabled by default)
33 | #COOKIES_ENABLED=False
34 | 
35 | # Disable Telnet Console (enabled by default)
36 | #TELNETCONSOLE_ENABLED=False
37 | 
38 | # Override the default request headers:
39 | #DEFAULT_REQUEST_HEADERS = {
40 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
41 | #   'Accept-Language': 'en',
42 | #}
43 | 
44 | # Enable or disable spider middlewares
45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
46 | #SPIDER_MIDDLEWARES = {
47 | #    'tor.middlewares.MyCustomSpiderMiddleware': 543,
48 | #}
49 | 
50 | # Enable or disable downloader middlewares
51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
52 | 
53 | #This points to your local proxy server that talks to Tor
54 | HTTP_PROXY = 'http://127.0.0.1:8123'
55 | DOWNLOADER_MIDDLEWARES = {
56 |   #Tor Middleware
57 |   'tor.middlewares.ProxyMiddleware': 400
58 | }
59 | 
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | #    'scrapy.telnet.TelnetConsole': None,
64 | #}
65 | 
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | #ITEM_PIPELINES = {
69 | #    'tor.pipelines.SomePipeline': 300,
70 | #}
71 | 
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
75 | #AUTOTHROTTLE_ENABLED=True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY=5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY=60
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG=False
82 | 
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED=True
86 | #HTTPCACHE_EXPIRATION_SECS=0
87 | #HTTPCACHE_DIR='httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES=[]
89 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
90 | 


--------------------------------------------------------------------------------
/tor/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------