├── .gitignore
├── README.md
├── example-project
│   ├── example
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── dealsplus.py
│   └── scrapy.cfg
├── run_example.sh
├── scrapy_proxynova
│   ├── __init__.py
│   ├── middleware.py
│   └── proxies.py
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
.*.swp

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scrapy-proxynova
================

Use Scrapy with a list of proxies generated from proxynova.com.

The first run generates the list of proxies from proxynova.com and stores it in a cache file.

Each proxy is checked individually, and any that time out or refuse connections are dropped from the list.

Example:

    ./run_example.sh

To regenerate the proxy list, run: `python proxies.py <country> <timeout> <max_proxies> <proxy_file>`

In settings.py add the following lines:

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_proxynova.middleware.HttpProxyMiddleware': 543
    }

Options
-------

Set these options in `settings.py`.

* PROXY_SERVER_LIST_CACHE_FILE - the file in which the proxy list is cached. Default: `proxies.txt`.
* PROXY_BYPASS_PERCENT - percentage of requests that bypass the proxy list and use a direct connection (0-99). Default: `0`.

--------------------------------------------------------------------------------
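For reference, a minimal `settings.py` sketch that ties the options above together; the middleware path and the 543 priority come from the README, while the cache path and the bypass value of 10 are illustrative choices rather than defaults:

    # enable the proxy middleware in a Scrapy project
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_proxynova.middleware.HttpProxyMiddleware': 543,
    }

    # file where the checked proxy list is cached (defaults to proxies.txt)
    PROXY_SERVER_LIST_CACHE_FILE = '/tmp/__proxy_servers.txt'

    # send roughly 10% of requests directly, without a proxy (defaults to 0)
    PROXY_BYPASS_PERCENT = 10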
--------------------------------------------------------------------------------
/example-project/example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/darthbear/scrapy-proxynova/da654a7352936fd9f2240969b3e178774803d92f/example-project/example/__init__.py

--------------------------------------------------------------------------------
/example-project/example/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class Deal(Item):
    title = Field()
    url = Field()

--------------------------------------------------------------------------------
/example-project/example/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class ExamplePipeline(object):
    def process_item(self, item, spider):
        return item

--------------------------------------------------------------------------------
/example-project/example/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'example'

SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

ITEM_PIPELINES = [
    'example.pipelines.ExamplePipeline',
]

DOWNLOADER_MIDDLEWARES = {
    'scrapy_proxynova.middleware.HttpProxyMiddleware': 543,
    'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 100
}

PROXY_SERVER_LIST_CACHE_FILE = '/tmp/__proxy_servers.txt'

DOWNLOAD_TIMEOUT = 30

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'example (+http://www.yourdomain.com)'

--------------------------------------------------------------------------------
/example-project/example/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/example-project/example/spiders/dealsplus.py:
--------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request

from example.items import Deal


class DealsplusSpider(BaseSpider):
    name = "dealsplus"
    allowed_domains = ["dealspl.us"]
    start_urls = [
        "http://dealspl.us/deals/hot/recent",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = hxs.select("//table[@id='allDealTable']/tr/td")

        for item in items:
            deal = Deal()
            deal['title'] = item.select(".//div[@class='deal_img_span']/a/@title").extract()[0]
            deal['url'] = item.select(".//div[@class='deal_img_span']/a/@href").extract()[0]
            yield deal

        # follow the "Next" pagination link when one is present
        nextPage = hxs.select("//a[@class='box_a' and contains(text(), 'Next')]/@href")
        if nextPage:
            yield Request("http://dealspl.us%s" % nextPage.extract()[0], self.parse)

--------------------------------------------------------------------------------
/example-project/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = example.settings

[deploy]
#url = http://localhost:6800/
project = example

--------------------------------------------------------------------------------
/run_example.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# build the proxy cache: country code, per-proxy timeout (seconds), max proxies, output file
(cd scrapy_proxynova; python proxies.py us 5 10 /tmp/__proxy_servers.txt)
export PYTHONPATH=$PYTHONPATH:$PWD
(cd example-project; scrapy crawl dealsplus)

--------------------------------------------------------------------------------
/scrapy_proxynova/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/darthbear/scrapy-proxynova/da654a7352936fd9f2240969b3e178774803d92f/scrapy_proxynova/__init__.py

--------------------------------------------------------------------------------
/scrapy_proxynova/middleware.py:
--------------------------------------------------------------------------------
from proxies import Proxies
from scrapy import log
import random

class HttpProxyMiddleware(object):
    def __init__(self, proxy_file, proxy_bypass_percent, **kwargs):
        self.bypass_percent = int(proxy_bypass_percent)
        self.proxies = Proxies(proxy_file, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            crawler.settings.get(
                'PROXY_SERVER_LIST_CACHE_FILE',
                'proxies.txt'
            ),
            crawler.settings.get(
                'PROXY_BYPASS_PERCENT',
                0
            ),
            logger=lambda message: log.msg(message),
        )

    def process_request(self, request, spider):
        # values below the bypass percentage make the request go out without a proxy
        n = random.randint(0, 99)
        if n >= self.bypass_percent:
            proxy = self.proxies.get_proxy()
            log.msg('Using proxy ' + proxy, spider=spider)
            request.meta['proxy'] = 'http://' + proxy
        else:
            if 'proxy' in request.meta:
                del request.meta['proxy']
            log.msg('No proxy used', spider=spider)

--------------------------------------------------------------------------------
/scrapy_proxynova/proxies.py:
--------------------------------------------------------------------------------
import os
import random
import re
import requests
import sys
import StringIO

countries = {
    'br': 'Brazil',
    'cn': 'China',
    'id': 'Indonesia',
    'th': 'Thailand',
    've': 'Venezuela',
    'eg': 'Egypt',
    'us': 'United States',
    'pe': 'Peru',
    'ru': 'Russia',
    'tw': 'Taiwan',
    'ae': 'United Arab Emirates',
    'in': 'India',
    'ar': 'Argentina',
    'za': 'South Africa',
    'co': 'Colombia',
    'de': 'Germany',
    'ua': 'Ukraine',
    'hk': 'Hong Kong',
    'fr': 'France',
    'mx': 'Mexico',
    'pl': 'Poland',
    'bd': 'Bangladesh',
    'it': 'Italy',
    'ec': 'Ecuador',
    'gb': 'United Kingdom',
    'jp': 'Japan',
    'nl': 'Netherlands',
    'tr': 'Turkey',
    'cl': 'Chile',
    'pk': 'Pakistan',
    'ca': 'Canada',
    'mn': 'Mongolia',
    'cz': 'Czech Republic',
    'kr': 'South Korea',
    'my': 'Malaysia',
    'kh': 'Cambodia',
    'ma': 'Morocco',
    'rs': 'Serbia',
    'bn': 'Brunei Darussalam',
    'ir': 'Iran',
    'iq': 'Iraq',
    'hu': 'Hungary',
    'bg': 'Bulgaria',
    'es': 'Spain',
    'vn': 'Vietnam',
    'lb': 'Lebanon',
    'ng': 'Nigeria',
    'ro': 'Romania',
    'eu': 'European Union',
    'ph': 'Philippines',
}


def get_proxies(country=None, timeout=None, limit=None, logger=None):
    country = 'us' if country is None else country
    timeout = 1 if timeout is None else timeout
    limit = 10 if limit is None else limit
    if logger is None:
        def logger(msg):
            print msg

    if country not in countries:
        raise RuntimeError('Unsupported country code: %s' % country)

    base_url = ('http://www.proxynova.com/proxy_list.txt?country={country}')

    proxies = []

    url = base_url.format(**locals())
    response = requests.get(url)

    ip_port_pattern = re.compile(r"(\d+\.\d+\.\d+\.\d+:\d+)")
    buf = StringIO.StringIO(response.content)
    line = buf.readline()
    while line:
        if len(proxies) == limit:
            break

        match = ip_port_pattern.match(line)
        if match:
            server = match.group(0)
            try:
                # check the proxy is alive by fetching a known page through it
                response = requests.get(
                    'http://www.linkedin.com',
                    proxies=dict(http=server),
                    timeout=timeout,
                )
                if 'Company Directory' in response.content:
                    logger('Found alive proxy: ' + server)
                    proxies.append(server)
                else:
                    logger(
                        'Error while reading data from '
                        'proxy {0}. Skipping...'.format(server)
                    )
            except Exception, e:
                logger('An error occurred: {}. Skipping server {}'.format(
                    e,
                    server
                ))
        line = buf.readline()

    return proxies


class Proxies(object):
    def __init__(self, proxy_file, **kwargs):
        self.proxy_file = proxy_file
        if os.path.exists(self.proxy_file):
            with open(self.proxy_file) as f:
                self.proxies = []
                for line in iter(f):
                    server = line.strip()
                    if len(server) > 0:
                        self.proxies.append(server)
            if len(self.proxies) == 0:
                raise IOError("Proxy file '%s' is empty" % self.proxy_file)
        else:
            raise IOError("Cannot find proxy file '%s'" % self.proxy_file)

    def get_proxy(self):
        return random.choice(self.proxies)


if __name__ == '__main__':
    if len(sys.argv) < 5:
        sys.exit('Usage: %s <country> <timeout> <max_proxies> <proxy_file>' % sys.argv[0])

    country = sys.argv[1].lower()
    timeout = int(sys.argv[2])
    max_proxies = int(sys.argv[3])
    proxy_file = sys.argv[4]

    try:
        proxies = get_proxies(country, timeout, max_proxies)
    except Exception, e:
        sys.stderr.write("Error while fetching proxies from proxynova.com: %s\n" % e)
        sys.exit(1)

    if len(proxies) == 0:
        sys.stderr.write("Error: Cannot find any available proxies\n")
        sys.exit(1)
    else:
        with open(proxy_file, 'w+') as f:
            f.write('\n'.join(proxies))

--------------------------------------------------------------------------------
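proxies.py also works as a plain library, not only as the command-line tool that run_example.sh invokes. A small sketch of driving it from Python instead; it assumes Python 2 (the module itself uses Python 2 syntax), that the snippet runs from the scrapy_proxynova directory so the `proxies` module is importable, and the country code, timeout, limit and cache path are illustrative values:

    from proxies import get_proxies, Proxies

    # fetch up to 5 working UK proxies, allowing 5 seconds per liveness check
    servers = get_proxies(country='gb', timeout=5, limit=5)

    # cache them in the same file the example project points at
    with open('/tmp/__proxy_servers.txt', 'w') as f:
        f.write('\n'.join(servers))

    # later, draw a random proxy from the cache
    pool = Proxies('/tmp/__proxy_servers.txt')
    print(pool.get_proxy())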
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='scrapy-proxynova',
    version='0.1.0',
    description='Allows Scrapy to use a proxy list from proxynova.com',
    keywords='scrapy proxy',
    license='New BSD License',
    author="Alexander Artemenko, Francois Dang Ngoc",
    author_email='svetlyak.40wt@gmail.com, francois.dangngoc@gmail.com',
    url='http://github.com/darthbear/scrapy-proxynova/',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Programming Language :: Python',
    ],
    packages=[
        'scrapy_proxynova',
    ],
    install_requires=[
        'requests'
    ],
)
--------------------------------------------------------------------------------