├── README.md
├── __init__.py
├── proxy_provider.txt
├── proxypool
│   ├── __init__.py
│   ├── crawler.py
│   ├── db.py
│   ├── error.py
│   ├── getter.py
│   ├── importer.py
│   ├── scheduler.py
│   ├── setting.py
│   ├── tester.py
│   └── utils.py
├── requirements.txt
└── run.py

/README.md:
--------------------------------------------------------------------------------
# Proxy Pool - Module Guide

##### Most of our crawling work targets e-commerce, industry data, and news/information sites. To defend against scraping, these sites usually apply basic anti-crawling measures, the most common being IP blocking and rate limiting.
##### To make crawling such sites easier, this IP pool module was built for everyone to use.

* Project structure:
```
proxypool
│   __init__.py
│   proxy_provider.txt
│   requirements.txt
│   run.py
│   README.md
└───proxypool
    │   __init__.py
    │   crawler.py
    │   db.py
    │   error.py
    │   getter.py
    │   importer.py
    │   scheduler.py
    │   setting.py
    │   tester.py
    │   utils.py
```

* Calling the IP proxy pool

  * Call `process_request` of `Get_Proxy()` to get a random proxy with a high score
```python
import logging
import redis
from random import choice

class Get_Proxy():
    def __init__(self, host, port):
        self.logger = logging.getLogger(__name__)
        self.REDIS_KEY = 'proxies'  # sorted-set key that stores the proxies
        self.MAX_SCORE = 100
        # connect to Redis
        pool = redis.ConnectionPool(host=host, port=port)
        self.db = redis.StrictRedis(connection_pool=pool)

    def get_random_proxy(self):
        # fetch proxies from Redis, preferring those with the highest score
        result = self.db.zrangebyscore(self.REDIS_KEY, self.MAX_SCORE, self.MAX_SCORE)
        if len(result):
            return choice(result)
        else:
            result = self.db.zrevrange(self.REDIS_KEY, 0, 100)
            if len(result):
                return choice(result)
            else:
                raise EOFError  # the pool is empty

    def process_request(self):
        proxy = self.get_random_proxy()
        if proxy:
            proxy_uri = 'http://{proxy}'.format(proxy=proxy.decode('utf-8'))
            self.logger.debug("Using proxy: " + proxy_uri)
            return proxy_uri
```
* Module highlights
  * A metaclass registers the crawl methods automatically [crawler.py]
  * Because the metaclass picks up methods whose names contain `crawl_`, define any new fetcher as `crawl_XX` [crawler.py]

```python
class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)
```

```python
class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        for proxy in eval("self.{}()".format(callback)):
            print('Got proxy', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_XX(self):
        # implement the fetch logic here and yield proxies as 'ip:port' strings
        ...
```

* Add test sites (test against whichever site you plan to crawl) [setting.py]

```python
# Test URLs: test against whichever site you plan to crawl
TEST_URL = [
    'https://tech.china.com/',
]
```

* Proxy sources that are crawled [proxy_provider.txt]


```
Proxy sources:
https://proxy.mimvp.com/free.php?proxy=in_hp
http://www.coobobo.com/free-http-proxy
http://ip.zdaye.com/
http://www.mayidaili.com/free/anonymous/%E9%AB%98%E5%8C%BF
http://http.taiyangruanjian.com/
http://http.zhimaruanjian.com/
http://ip.jiangxianli.com

66代理
云代理
快代理
西刺代理
无忧代理
免费IP代理
```
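
* Example usage (a minimal sketch, not part of the module: it reuses the `Get_Proxy` class shown above and assumes a local Redis at `127.0.0.1:6379` that `run.py` has already populated; the `fetch_with_proxy` helper name is only illustrative)

```python
import requests

# Point the pool reader at the local Redis instance (assumed host/port)
proxy_pool = Get_Proxy(host='127.0.0.1', port=6379)

def fetch_with_proxy(url):
    # Ask the pool for a random high-score proxy, e.g. 'http://1.2.3.4:8080'
    proxy_uri = proxy_pool.process_request()
    # Route both http and https traffic through that proxy
    return requests.get(url, proxies={'http': proxy_uri, 'https': proxy_uri}, timeout=10)

response = fetch_with_proxy('https://tech.china.com/')
print(response.status_code)
```

  In a Scrapy project the same `process_request()` call can be dropped into a downloader middleware by assigning the returned URI to `request.meta['proxy']`.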
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdcrgb/proxypool/2ba6c1c577722868b457dd4c56facdf2ecf3af73/__init__.py
--------------------------------------------------------------------------------
/proxy_provider.txt:
--------------------------------------------------------------------------------
Proxy sources:
https://proxy.mimvp.com/free.php?proxy=in_hp
http://www.coobobo.com/free-http-proxy
http://ip.zdaye.com/
http://www.mayidaili.com/free/anonymous/%E9%AB%98%E5%8C%BF
http://http.taiyangruanjian.com/
http://http.zhimaruanjian.com/
http://ip.jiangxianli.com

66代理
云代理
快代理
西刺代理
无忧代理
免费IP代理
--------------------------------------------------------------------------------
/proxypool/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wdcrgb/proxypool/2ba6c1c577722868b457dd4c56facdf2ecf3af73/proxypool/__init__.py
--------------------------------------------------------------------------------
/proxypool/crawler.py:
--------------------------------------------------------------------------------
import json
import re
from .utils import get_page
from pyquery import PyQuery as pq


class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            # register every method whose name contains 'crawl_'
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        for proxy in eval("self.{}()".format(callback)):
            print('Got proxy', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=4):
        """
        Fetch proxies from daili66 (66ip.cn).
        :param page_count: number of pages to crawl
        :return: proxies as 'ip:port'
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_ip3366(self):
        for page in range(1, 4):
            start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
            html = get_page(start_url)
            if html:
                # \s* matches the whitespace between cells, so the pattern spans line breaks
                ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                for address, port in re_ip_address:
                    result = address + ':' + port
                    yield result.replace(' ', '')

    def crawl_kuaidaili(self):
        for i in range(1, 4):
            start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
            html = get_page(start_url)
            if html:
                ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                port = re.compile('<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(html)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

    def crawl_xicidaili(self):
        for i in range(1, 3):
            start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
                'Host': 'www.xicidaili.com',
                'Referer': 'http://www.xicidaili.com/nn/3',
                'Upgrade-Insecure-Requests': '1',
            }
            html = get_page(start_url, options=headers)
            if html:
                find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(html)
                for tr in trs:
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')

    # NOTE: this redefines crawl_ip3366 and therefore replaces the earlier method of the same name
    def crawl_ip3366(self):
        for i in range(1, 4):
            start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
            html = get_page(start_url)
            if html:
                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(html)
                for s in range(1, len(trs)):
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')

    def crawl_iphai(self):
        start_url = 'http://www.iphai.com/'
        html = get_page(start_url)
        if html:
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            for s in range(1, len(trs)):
                find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

    def crawl_data5u(self):
        start_url = 'http://www.data5u.com/free/gngn/index.shtml'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = get_page(start_url, options=headers)
        if html:
            # the port <li> carries extra class text, hence the non-greedy match after "port"
            ip_address = re.compile('<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')

    # Free IP proxy list (ip.jiangxianli.com)
    def crawl_jiangxianli(self):
        start_url = 'http://ip.jiangxianli.com'
        html = get_page(start_url)
        if html:
            ip_address = re.compile('\