├── .gitignore
├── README.md
├── example-project
│   ├── example
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── dealsplus.py
│   └── scrapy.cfg
├── run_example.sh
├── scrapy_proxynova
│   ├── __init__.py
│   ├── middleware.py
│   └── proxies.py
└── setup.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject
.*.swp

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scrapy-proxynova
================

Use Scrapy with a list of proxies generated from proxynova.com.

The first run generates the list of proxies from proxynova.com and stores it in a cache file.

Each proxy is checked individually, and any that time out or refuse connections are dropped from the list.

Example:

    ./run_example.sh

To regenerate the proxy list, run: `python proxies.py <country> <timeout> <max_proxies> <proxy_file>`

In settings.py add the following lines:

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_proxynova.middleware.HttpProxyMiddleware': 543
    }

Options
-------

Set these options in `settings.py`.

* PROXY_SERVER_LIST_CACHE_FILE - the file in which the proxy list is cached. Default: `proxies.txt`.
* PROXY_BYPASS_PERCENT - percentage of requests that bypass the proxy list and use a direct connection (0-99). Default: `0`.

--------------------------------------------------------------------------------
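For reference, a minimal `settings.py` sketch that ties the options above together; the middleware path and the 543 priority come from the README, while the cache path and the bypass value of 10 are illustrative choices rather than defaults:

    # enable the proxy middleware in a Scrapy project
    DOWNLOADER_MIDDLEWARES = {
        'scrapy_proxynova.middleware.HttpProxyMiddleware': 543,
    }

    # file where the checked proxy list is cached (defaults to proxies.txt)
    PROXY_SERVER_LIST_CACHE_FILE = '/tmp/__proxy_servers.txt'

    # send roughly 10% of requests directly, without a proxy (defaults to 0)
    PROXY_BYPASS_PERCENT = 10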
--------------------------------------------------------------------------------
/example-project/example/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/darthbear/scrapy-proxynova/da654a7352936fd9f2240969b3e178774803d92f/example-project/example/__init__.py

--------------------------------------------------------------------------------
/example-project/example/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class Deal(Item):
    title = Field()
    url = Field()

--------------------------------------------------------------------------------
/example-project/example/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

class ExamplePipeline(object):
    def process_item(self, item, spider):
        return item

--------------------------------------------------------------------------------
/example-project/example/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for example project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/topics/settings.html
#

BOT_NAME = 'example'

SPIDER_MODULES = ['example.spiders']
NEWSPIDER_MODULE = 'example.spiders'

ITEM_PIPELINES = [
    'example.pipelines.ExamplePipeline',
]

DOWNLOADER_MIDDLEWARES = {
    'scrapy_proxynova.middleware.HttpProxyMiddleware': 543,
    'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 100
}

PROXY_SERVER_LIST_CACHE_FILE = '/tmp/__proxy_servers.txt'

DOWNLOAD_TIMEOUT = 30

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'example (+http://www.yourdomain.com)'

--------------------------------------------------------------------------------
/example-project/example/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/example-project/example/spiders/dealsplus.py:
--------------------------------------------------------------------------------
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request

from example.items import Deal


class DealsplusSpider(BaseSpider):
    name = "dealsplus"
    allowed_domains = ["dealspl.us"]
    start_urls = [
        "http://dealspl.us/deals/hot/recent",
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        items = hxs.select("//table[@id='allDealTable']/tr/td")

        for item in items:
            deal = Deal()
            deal['title'] = item.select(".//div[@class='deal_img_span']/a/@title").extract()[0]
            deal['url'] = item.select(".//div[@class='deal_img_span']/a/@href").extract()[0]
            yield deal

        # follow the "Next" pagination link when one is present
        nextPage = hxs.select("//a[@class='box_a' and contains(text(), 'Next')]/@href")
        if nextPage:
            yield Request("http://dealspl.us%s" % nextPage.extract()[0], self.parse)

--------------------------------------------------------------------------------
/example-project/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = example.settings

[deploy]
#url = http://localhost:6800/
project = example

--------------------------------------------------------------------------------
/run_example.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# build the proxy cache: country code, per-proxy timeout (seconds), max proxies, output file
(cd scrapy_proxynova; python proxies.py us 5 10 /tmp/__proxy_servers.txt)
export PYTHONPATH=$PYTHONPATH:$PWD
(cd example-project; scrapy crawl dealsplus)

--------------------------------------------------------------------------------
/scrapy_proxynova/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/darthbear/scrapy-proxynova/da654a7352936fd9f2240969b3e178774803d92f/scrapy_proxynova/__init__.py

--------------------------------------------------------------------------------
/scrapy_proxynova/middleware.py:
--------------------------------------------------------------------------------
from proxies import Proxies
from scrapy import log
import random

class HttpProxyMiddleware(object):
    def __init__(self, proxy_file, proxy_bypass_percent, **kwargs):
        self.bypass_percent = int(proxy_bypass_percent)
        self.proxies = Proxies(proxy_file, **kwargs)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            crawler.settings.get(
                'PROXY_SERVER_LIST_CACHE_FILE',
                'proxies.txt'
            ),
            crawler.settings.get(
                'PROXY_BYPASS_PERCENT',
                0
            ),
            logger=lambda message: log.msg(message),
        )

    def process_request(self, request, spider):
        # values below the bypass percentage make the request go out without a proxy
        n = random.randint(0, 99)
        if n >= self.bypass_percent:
            proxy = self.proxies.get_proxy()
            log.msg('Using proxy ' + proxy, spider=spider)
            request.meta['proxy'] = 'http://' + proxy
        else:
            if 'proxy' in request.meta:
                del request.meta['proxy']
            log.msg('No proxy used', spider=spider)

--------------------------------------------------------------------------------
/scrapy_proxynova/proxies.py:
--------------------------------------------------------------------------------
import os
import random
import re
import requests
import sys
import StringIO

countries = {
    'br': 'Brazil',
    'cn': 'China',
    'id': 'Indonesia',
    'th': 'Thailand',
    've': 'Venezuela',
    'eg': 'Egypt',
    'us': 'United States',
    'pe': 'Peru',
    'ru': 'Russia',
    'tw': 'Taiwan',
    'ae': 'United Arab Emirates',
    'in': 'India',
    'ar': 'Argentina',
    'za': 'South Africa',
    'co': 'Colombia',
    'de': 'Germany',
    'ua': 'Ukraine',
    'hk': 'Hong Kong',
    'fr': 'France',
    'mx': 'Mexico',
    'pl': 'Poland',
    'bd': 'Bangladesh',
    'it': 'Italy',
    'ec': 'Ecuador',
    'gb': 'United Kingdom',
    'jp': 'Japan',
    'nl': 'Netherlands',
    'tr': 'Turkey',
    'cl': 'Chile',
    'pk': 'Pakistan',
    'ca': 'Canada',
    'mn': 'Mongolia',
    'cz': 'Czech Republic',
    'kr': 'South Korea',
    'my': 'Malaysia',
    'kh': 'Cambodia',
    'ma': 'Morocco',
    'rs': 'Serbia',
    'bn': 'Brunei Darussalam',
    'ir': 'Iran',
    'iq': 'Iraq',
    'hu': 'Hungary',
    'bg': 'Bulgaria',
    'es': 'Spain',
    'vn': 'Vietnam',
    'lb': 'Lebanon',
    'ng': 'Nigeria',
    'ro': 'Romania',
    'eu': 'European Union',
    'ph': 'Philippines',
}


def get_proxies(country=None, timeout=None, limit=None, logger=None):
    country = 'us' if country is None else country
    timeout = 1 if timeout is None else timeout
    limit = 10 if limit is None else limit
    if logger is None:
        def logger(msg):
            print msg

    if country not in countries:
        raise RuntimeError('Unsupported country code: %s' % country)

    base_url = ('http://www.proxynova.com/proxy_list.txt?country={country}')

    proxies = []

    url = base_url.format(**locals())
    response = requests.get(url)

    ip_port_pattern = re.compile(r"(\d+\.\d+\.\d+\.\d+:\d+)")
    buf = StringIO.StringIO(response.content)
    line = buf.readline()
    while line:
        if len(proxies) == limit:
            break

        match = ip_port_pattern.match(line)
        if match:
            server = match.group(0)
            try:
                # check the proxy is alive by fetching a known page through it
                response = requests.get(
                    'http://www.linkedin.com',
                    proxies=dict(http=server),
                    timeout=timeout,
                )
                if 'Company Directory' in response.content:
                    logger('Found alive proxy: ' + server)
                    proxies.append(server)
                else:
                    logger(
                        'Error while reading data from '
                        'proxy {0}. Skipping...'.format(server)
                    )
            except Exception, e:
                logger('An error occurred: {}. Skipping server {}'.format(
                    e,
                    server
                ))
        line = buf.readline()

    return proxies


class Proxies(object):
    def __init__(self, proxy_file, **kwargs):
        self.proxy_file = proxy_file
        if os.path.exists(self.proxy_file):
            with open(self.proxy_file) as f:
                self.proxies = []
                for line in iter(f):
                    server = line.strip()
                    if len(server) > 0:
                        self.proxies.append(server)
            if len(self.proxies) == 0:
                raise IOError("Proxy file '%s' is empty" % self.proxy_file)
        else:
            raise IOError("Cannot find proxy file '%s'" % self.proxy_file)

    def get_proxy(self):
        return random.choice(self.proxies)


if __name__ == '__main__':
    if len(sys.argv) < 5:
        sys.exit('Usage: %s <country> <timeout> <max_proxies> <proxy_file>' % sys.argv[0])

    country = sys.argv[1].lower()
    timeout = int(sys.argv[2])
    max_proxies = int(sys.argv[3])
    proxy_file = sys.argv[4]

    try:
        proxies = get_proxies(country, timeout, max_proxies)
    except Exception, e:
        sys.stderr.write("Error while fetching proxies from proxynova.com: %s\n" % e)
        sys.exit(1)

    if len(proxies) == 0:
        sys.stderr.write("Error: Cannot find any available proxies\n")
        sys.exit(1)
    else:
        with open(proxy_file, 'w+') as f:
            f.write('\n'.join(proxies))

--------------------------------------------------------------------------------
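proxies.py also works as a plain library, not only as the command-line tool that run_example.sh invokes. A small sketch of driving it from Python instead; it assumes Python 2 (the module itself uses Python 2 syntax), that the snippet runs from the scrapy_proxynova directory so the `proxies` module is importable, and the country code, timeout, limit and cache path are illustrative values:

    from proxies import get_proxies, Proxies

    # fetch up to 5 working UK proxies, allowing 5 seconds per liveness check
    servers = get_proxies(country='gb', timeout=5, limit=5)

    # cache them in the same file the example project points at
    with open('/tmp/__proxy_servers.txt', 'w') as f:
        f.write('\n'.join(servers))

    # later, draw a random proxy from the cache
    pool = Proxies('/tmp/__proxy_servers.txt')
    print(pool.get_proxy())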
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup

setup(
    name='scrapy-proxynova',
    version='0.1.0',
    description='Allows Scrapy to use a proxy list from proxynova.com',
    keywords='scrapy proxy',
    license='New BSD License',
    author="Alexander Artemenko, Francois Dang Ngoc",
    author_email='svetlyak.40wt@gmail.com, francois.dangngoc@gmail.com',
    url='http://github.com/darthbear/scrapy-proxynova/',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Programming Language :: Python',
    ],
    packages=[
        'scrapy_proxynova',
    ],
    install_requires=[
        'requests'
    ],
)
--------------------------------------------------------------------------------