├── .gitignore
├── PythonScrapyBasicSetup
│   ├── __init__.py
│   ├── pipelines.py
│   ├── items.py
│   ├── middlewares
│   │   ├── __init__.py
│   │   ├── user_agent.py
│   │   └── proxy.py
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── iptester.py
│   │   └── uatester.py
│   ├── run.py
│   ├── settings.py
│   └── data
│       └── user_agents.xml
├── scrapy.cfg
├── License.md
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
*.pyc

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
[settings]
default = PythonScrapyBasicSetup.settings

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# default pass-through pipeline generated by Scrapy; items are returned unchanged
class PythonscrapybasicsetupPipeline(object):
    def process_item(self, item, spider):
        return item

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import scrapy

# item fields describing a single proxy entry
class ProxyItem(scrapy.Item):
    protocol = scrapy.Field()
    address = scrapy.Field()
    port = scrapy.Field()

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/middlewares/__init__.py:
--------------------------------------------------------------------------------
# This package contains the downloader middlewares of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your middlewares.

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/spiders/iptester.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# just for IP address testing purposes: logs the address the target site sees

import logging
import scrapy
from bs4 import BeautifulSoup

class IPTesterSpider(scrapy.Spider):
    name = 'IPtester'
    allowed_domains = ['icanhazip.com']
    start_urls = (
        'https://icanhazip.com',
    )

    def parse(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        ip_address = soup.get_text().rstrip('\n')
        if ip_address:
            logging.info('IP ADDRESS = %s', ip_address)
        else:
            logging.info('IP ADDRESS NOT FOUND')

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/spiders/uatester.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# just for user agent testing purposes: logs the user agent the target site sees

import logging
import scrapy
from bs4 import BeautifulSoup

class UATesterSpider(scrapy.Spider):
    name = 'UAtester'
    allowed_domains = ['whatsmyuseragent.org']
    start_urls = (
        'http://whatsmyuseragent.org/',
    )

    def parse(self, response):
        soup = BeautifulSoup(response.body, 'html.parser')
        # guard against pages without a <p> element so the spider logs instead of crashing
        user_agent = soup.p.text if soup.p else None
        if user_agent:
            logging.info('USER AGENT = %s', user_agent)
        else:
            logging.info('USER AGENT NOT FOUND')

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/middlewares/user_agent.py:
--------------------------------------------------------------------------------
import os
import random
from xml.dom import minidom
from scrapy.utils.project import get_project_settings

class RandomUserAgentMiddleware(object):
    settings = get_project_settings()
    # resolve the XML list relative to this file so the middleware works
    # no matter which directory the crawl is started from
    source_path = os.path.join(os.path.dirname(__file__), '..', 'data', 'user_agents.xml')

    def __init__(self, *args, **kwargs):
        # load the user agent strings once and expose them through the settings
        xmldoc = minidom.parse(self.source_path)
        items = xmldoc.getElementsByTagName('useragent')
        user_agents = [item.attributes['value'].value for item in items]

        self.settings.set('USER_AGENT_LIST', user_agents)

    def process_request(self, request, spider):
        # pick a random user agent for every outgoing request
        user_agent = random.choice(self.settings.get('USER_AGENT_LIST'))
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/run.py:
--------------------------------------------------------------------------------
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

from spiders.iptester import IPTesterSpider
from spiders.uatester import UATesterSpider

def run():
    configure_logging()
    # import the project settings, mainly so the custom middlewares get picked up
    settings = get_project_settings()
    runner = CrawlerRunner(settings)

    # run the spiders sequentially (non-distributed)
    @defer.inlineCallbacks
    def crawl():
        yield runner.crawl(IPTesterSpider)
        yield runner.crawl(UATesterSpider)
        reactor.stop()

    crawl()
    reactor.run()  # block until the last crawl has finished

if __name__ == '__main__':
    run()
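
# --- Alternative sketch (not part of the original run.py): the same spiders could
# also be started with Scrapy's CrawlerProcess, which manages the reactor itself.
# Note that, unlike the inlineCallbacks chain above, the two crawls would then
# run concurrently rather than sequentially. ---
#
# from scrapy.crawler import CrawlerProcess
#
# process = CrawlerProcess(get_project_settings())
# process.crawl(IPTesterSpider)
# process.crawl(UATesterSpider)
# process.start()  # blocks until both spiders finish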
--------------------------------------------------------------------------------
/License.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Matej Bašić

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

BOT_NAME = 'PythonScrapyBasicSetup'

SPIDER_MODULES = ['PythonScrapyBasicSetup.spiders']
NEWSPIDER_MODULE = 'PythonScrapyBasicSetup.spiders'

# maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# timeout for processing DNS queries, in seconds (float) (default: 60)
DNS_TIMEOUT = 10
# time (in seconds) that the downloader will wait before timing out
DOWNLOAD_TIMEOUT = 24

# disable the telnet console
TELNETCONSOLE_ENABLED = False

# delay for requests to the same website (default: 0)
# DOWNLOAD_DELAY = 3
# the download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# maximum number of times to retry a failed request
RETRY_TIMES = 2
# HTTP response codes to retry
RETRY_HTTP_CODES = [500, 502, 503, 504]

# disable cookies
COOKIES_ENABLED = False

# TOR SETTINGS
HTTP_PROXY = 'http://127.0.0.1:8118'
AUTH_PASSWORD = 'secretPassword'
CONTROL_PORT = 9051

# if defined, it will force exit nodes to be from the given country/countries
# http://www.b3rn3d.com/blog/2014/03/05/tor-country-codes/
# EXIT_NODES = '{br}'

# number of HTTP requests before the IP address is changed
# delete or set to None if you don't want to use it
MAX_REQ_PER_IP = 1000

# downloader middlewares
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'PythonScrapyBasicSetup.middlewares.user_agent.RandomUserAgentMiddleware': 400,
    # 'PythonScrapyBasicSetup.middlewares.proxy.HttpProxyMiddleware': 410,
    'PythonScrapyBasicSetup.middlewares.proxy.TorProxyMiddleware': 410
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PythonScrapyBasicSetup
Basic setup with random user agents and proxy addresses for the [Python Scrapy Framework](http://scrapy.org/).

### Setup
##### 1. Install the Scrapy framework

```
pip install Scrapy
```
[Detailed installation guide](https://doc.scrapy.org/en/1.3/intro/install.html)

##### 2. Install [Beautiful Soup 4](https://www.crummy.com/software/BeautifulSoup), used by the proxy middleware that parses the proxydocker lists

```
pip install beautifulsoup4
```
[Detailed installation guide](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup)
##### 3. Install [Tor](https://www.torproject.org/), [Stem](https://stem.torproject.org/) (controller library for Tor) and [Privoxy](https://www.privoxy.org/) (HTTP proxy server)

```
apt-get install tor python-stem privoxy
```
Hash a password with Tor:
```
tor --hash-password secretPassword
```
Then copy the hashed password and add it, together with the control port, to ```/etc/tor/torrc```:
```
ControlPort 9051
HashedControlPassword 16:72C8ADB0E34F8DA1606BB154886604F708236C0D0A54557A07B00CAB73
```
Restart Tor:
```
sudo /etc/init.d/tor restart
```
Enable Privoxy forwarding by adding the following line to ```/etc/privoxy/config```:
```
forward-socks5 / localhost:9050 .
```
Restart Privoxy:
```
sudo /etc/init.d/privoxy restart
```
Both Tor and Privoxy should now be up and running (check ```netstat -l```). If you used a different password or control port, update ```settings.py```.

If you get errors related to pyOpenSSL (see this [issue](https://github.com/scrapy/scrapy/issues/2473)), try downgrading Twisted:
```
pip install Twisted==16.4.1
```

### Usage
To see what it does, just run:
```
python run.py
```
The project contains three middleware classes in the ```middlewares``` directory. ```HttpProxyMiddleware``` downloads a list of proxy addresses and picks one at random for every request. ```TorProxyMiddleware``` has a similar purpose, but routes requests through the Tor network (via Privoxy) and periodically asks Tor for a new identity. ```RandomUserAgentMiddleware``` loads user agent strings from ```data/user_agents.xml```, saves them into the ```USER_AGENT_LIST``` setting and picks one at random for every request. The middlewares are activated in the ```settings.py``` file; by default ```RandomUserAgentMiddleware``` and ```TorProxyMiddleware``` are enabled, while ```HttpProxyMiddleware``` is commented out.
This project also contains two spiders just for testing purposes, ```spiders/iptester.py``` and ```spiders/uatester.py```. You can run them individually:
```
scrapy crawl UAtester
scrapy crawl IPtester
```
The ```run.py``` file is also a good example of how to include and run your spiders sequentially from one script.

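To add your own spider, drop it into the ```spiders``` directory and chain it into ```run.py```. A minimal sketch (the ```QuotesSpider``` class, the ```quotes.py``` file name and the quotes.toscrape.com site are illustrative placeholders, not files that ship with this project):
```
# PythonScrapyBasicSetup/spiders/quotes.py  (hypothetical example file)
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # every request made by this spider goes through the same
        # user agent and proxy middlewares configured in settings.py
        for quote in response.css('div.quote'):
            yield {'text': quote.css('span.text::text').extract_first()}
```
Then import it in ```run.py``` and add another ```yield runner.crawl(QuotesSpider)``` line before ```reactor.stop()```, or run it on its own with ```scrapy crawl quotes```.
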
If you have any questions or problems, feel free to create a new issue.
Scrape responsibly!

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/middlewares/proxy.py:
--------------------------------------------------------------------------------
import random
import logging
import urllib2
from stem import Signal
from stem.control import Controller
from bs4 import BeautifulSoup
from scrapy.utils.project import get_project_settings

class TorProxyMiddleware(object):
    # routes every request through the local Privoxy/Tor proxy and periodically
    # asks Tor for a new circuit (i.e. a new exit IP address)

    def __init__(self):
        self.import_settings()
        self.req_counter = 0

    def change_ip_address(self):
        # signal Tor over its control port to switch to a new identity
        with Controller.from_port(port=self.control_port) as controller:
            controller.authenticate(self.password)
            controller.signal(Signal.NEWNYM)
            controller.close()

    def import_settings(self):
        settings = get_project_settings()
        self.password = settings['AUTH_PASSWORD']
        self.http_proxy = settings['HTTP_PROXY']
        self.control_port = settings['CONTROL_PORT']
        self.max_req_per_ip = settings['MAX_REQ_PER_IP']

        self.exit_nodes = settings['EXIT_NODES']
        if self.exit_nodes:
            with Controller.from_port(port=self.control_port) as controller:
                controller.authenticate(self.password)
                controller.set_conf('ExitNodes', self.exit_nodes)
                controller.close()

    def process_request(self, request, spider):
        self.req_counter += 1
        if self.max_req_per_ip is not None and self.req_counter > self.max_req_per_ip:
            self.req_counter = 0
            self.change_ip_address()

        request.meta['proxy'] = self.http_proxy
        logging.info('Using proxy: %s', request.meta['proxy'])
        return None

class HttpProxyMiddleware(object):
    proxies = []
    max_proxies = 100
    source = {
        'port': 8080,
        'type': 'HTTP',
        'url': 'https://www.proxydocker.com/search?port=%d&type=%s&anonymity=All&country=All&city=All'
    }

    def __init__(self):
        self.query_proxies()

    def _build_source_url(self):
        return self.source['url'] % (self.source['port'], self.source['type'])

    def query_proxies(self):
        # scrape up to max_proxies address/protocol pairs from the proxydocker listing
        request = urllib2.urlopen(self._build_source_url())
        if request.getcode() == 200:
            i = 0
            soup = BeautifulSoup(request, 'html.parser')
            for row in soup.find_all('tr'):
                cells = row.findAll('td')
                if len(cells) > 2:
                    self.proxies.append({
                        'address': cells[0].text.strip(),
                        'protocol': cells[1].text.lower().strip()
                    })
                    i += 1
                    if i == self.max_proxies:
                        break
        request.close()

    def process_request(self, request, spider):
        # pick a random proxy for every outgoing request
        proxy = random.choice(self.proxies)
        request.meta['proxy'] = proxy['protocol'] + '://' + proxy['address']
        logging.info('Using proxy: %s', request.meta['proxy'])

    def remove_failed_proxy(self, request, spider):
        failed_proxy = request.meta['proxy']
        logging.log(logging.DEBUG, 'Removing failed proxy...')
        try:
            i = 0
            for proxy in self.proxies:
                if proxy['address'] in failed_proxy:
                    del self.proxies[i]
                    proxies_num = len(self.proxies)
                    logging.log(logging.DEBUG,
                                'Removed failed proxy <%s>, %d proxies left', failed_proxy, proxies_num)
                    if proxies_num == 0:
                        # refill the pool once the last proxy has been dropped
                        self.query_proxies()
                    return True
                i += 1
        except KeyError:
            logging.log(logging.ERROR, 'Error while removing failed proxy')
        return False

    def process_exception(self, request, exception, spider):
        if self.remove_failed_proxy(request, spider):
            return request
        return None

    def process_response(self, request, response, spider):
        # really brutal filter: anything other than a 200 gets rescheduled
        if response.status == 200:
            return response
        return request

--------------------------------------------------------------------------------
/PythonScrapyBasicSetup/data/user_agents.xml:
--------------------------------------------------------------------------------
<!-- list of <useragent value="..."/> entries read by RandomUserAgentMiddleware (user agent strings omitted) -->
--------------------------------------------------------------------------------