├── .gitignore
├── HttpProxyMiddleware.py
├── HttpProxyMiddlewareTest
│   ├── HttpProxyMiddlewareTest
│   │   ├── HttpProxyMiddleware.py
│   │   ├── __init__.py
│   │   ├── fetch_free_proxyes.py
│   │   ├── items.py
│   │   ├── pipelines.py
│   │   ├── proxyes.data
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── test.py
│   ├── README.org
│   ├── proxyes.dat
│   └── scrapy.cfg
├── IPBanTest
│   ├── IPBanTest
│   │   ├── README.org
│   │   ├── __init__.py
│   │   ├── settings.py
│   │   ├── urls.py
│   │   ├── views.py
│   │   └── wsgi.py
│   ├── db.sqlite3
│   └── manage.py
├── LICENSE
├── README.org
└── fetch_free_proxyes.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 | cookie
59 | index.html
60 | bak.py
61 |
--------------------------------------------------------------------------------
/HttpProxyMiddleware.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | import logging
5 | from datetime import datetime, timedelta
6 | from twisted.web._newclient import ResponseNeverReceived
7 | from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError
8 | from crawler import fetch_free_proxyes
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 | class HttpProxyMiddleware(object):
13 | # treat these error types as the proxy being unusable and handle them here instead of passing them on to the RetryMiddleware
14 | DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, ResponseNeverReceived, ConnectError, ValueError)
15 |
16 | def __init__(self, use_https):
17 | # time of the last direct (no-proxy) connection
18 | self.last_no_proxy_time = datetime.now()
19 | # switch back to a direct connection after this many minutes, since going through proxies slows crawling down
20 | self.recover_interval = 20
21 | # a proxy that keeps timing out before its use count reaches this number is removed permanently. Set to 0 to never modify the proxy file.
22 | self.dump_count_threshold = 20
23 | # file holding the proxy list, one proxy per line in the form proto://ip:port. The file gets rewritten, so keep a backup.
24 | self.proxy_file = "proxyes.dat"
25 | # whether to disable a proxy when it times out
26 | self.invalid_proxy_flag = True
27 | # when fewer valid proxies than this remain (direct connection included), fetch new ones from the web. Set it to however many proxies each IP needs
28 | # to rest after being asked for a captcha: rotating among ten usable proxies, for example, each IP only comes up again after several minutes and can serve some requests captcha-free.
29 | # If it is too small (say two), each IP gets banned after only a few requests, the spider ends up busy-waiting, and efficiency suffers.
30 | self.extend_proxy_threshold = 10
31 | # initialize the proxy list
32 | self.proxyes = [{"proxy": None, "valid": True, "count": 0}]
33 | # start with proxy 0, i.e. no proxy
34 | self.proxy_index = 0
35 | # number of trusted proxies (e.g. an HTTP proxy you run yourself) + 1 for the direct connection
36 | self.fixed_proxy = len(self.proxyes)
37 | # time of the last proxy fetch
38 | self.last_fetch_proxy_time = datetime.now()
39 | # force a fetch of new proxies after this many minutes
40 | self.fetch_proxy_interval = 120
41 | # a proxy about to be marked invalid is spared if it has already crawled more pages than this
42 | self.invalid_proxy_threshold = 200
43 | # whether to use HTTP or HTTPS proxies
44 | self.use_https = use_https
45 | # load the initial proxies from the file
46 | if os.path.exists(self.proxy_file):
47 | with open(self.proxy_file, "r") as fd:
48 | lines = fd.readlines()
49 | for line in lines:
50 | line = line.strip()
51 | if not line or self.url_in_proxyes(line):
52 | continue
53 | self.proxyes.append({"proxy": line,
54 | "valid": True,
55 | "count": 0})
56 |
57 | @classmethod
58 | def from_crawler(cls, crawler):
59 | use_https = crawler.settings.getbool('HTTPS_PROXY')
60 | return cls(use_https)
61 |
62 | def url_in_proxyes(self, url):
63 | """
64 | return whether a proxy url is already in the proxy list
65 | """
66 | for p in self.proxyes:
67 | if url == p["proxy"]:
68 | return True
69 | return False
70 |
71 | def reset_proxyes(self):
72 | """
73 | reset every proxy whose count >= the dump threshold back to valid
74 | """
75 | logger.info("reset proxyes to valid")
76 | for p in self.proxyes:
77 | if p["count"] >= self.dump_count_threshold:
78 | p["valid"] = True
79 |
80 | def fetch_new_proxyes(self):
81 | """
82 | fetch new proxies from the web and add them to the proxy list
83 | """
84 | logger.info("extending proxyes using fetch_free_proxyes.py")
85 | new_proxyes = fetch_free_proxyes.fetch_all(https=self.use_https)
86 | logger.info("new proxyes: %s" % new_proxyes)
87 | self.last_fetch_proxy_time = datetime.now()
88 |
89 | for np in new_proxyes:
90 | if self.url_in_proxyes(np):
91 | continue
92 | else:
93 | self.proxyes.append({"proxy": np,
94 | "valid": True,
95 | "count": 0})
96 | if self.len_valid_proxy() < self.extend_proxy_threshold: # hardly any new proxies could be fetched, so lower the threshold to avoid wasted effort
97 | self.extend_proxy_threshold -= 1
98 |
99 | def len_valid_proxy(self):
100 | """
101 | return the number of valid proxies in the list
102 | """
103 | count = 0
104 | for p in self.proxyes:
105 | if p["valid"]:
106 | count += 1
107 | return count
108 |
109 | def inc_proxy_index(self, current=-1):
110 | """
111 | move the index of the proxy list to the next valid proxy
112 | if only the fixed_proxy entries are still valid, reset the proxy list
113 | if the configured interval since the last fetch has also passed, fetch new proxies
114 | """
115 | assert self.proxyes[0]["valid"]
116 | if current != -1 and self.proxy_index != current:
117 | return
118 | while True:
119 | self.proxy_index = (self.proxy_index + 1) % len(self.proxyes)
120 | if self.proxyes[self.proxy_index]["valid"]:
121 | break
122 |
123 | # two visits to proxy_index==0 within a very short interval means captcha thrashing, so extend the proxy list
124 | if self.proxy_index == 0 and datetime.now() < self.last_no_proxy_time + timedelta(minutes=2):
125 | logger.info("captcha thrashing")
126 | self.fetch_new_proxyes()
127 |
128 | if self.len_valid_proxy() <= self.fixed_proxy or self.len_valid_proxy() < self.extend_proxy_threshold: # not enough valid proxies left, reset them to valid
129 | self.reset_proxyes()
130 |
131 | if self.len_valid_proxy() < self.extend_proxy_threshold: # still not enough proxies, fetch new ones
132 | logger.info("valid proxy < threshold: %d/%d" % (self.len_valid_proxy(), self.extend_proxy_threshold))
133 | self.fetch_new_proxyes()
134 |
135 | logger.info("now using new proxy: %s" % self.proxyes[self.proxy_index]["proxy"])
136 |
137 | # after going a long time without new proxies we may be cycling through the current ones and hitting captchas over and over; force a fetch of new proxies
138 | #if datetime.now() > self.last_fetch_proxy_time + timedelta(minutes=self.fetch_proxy_interval):
139 | # logger.info("%d minutes since last fetch" % self.fetch_proxy_interval)
140 | # self.fetch_new_proxyes()
141 |
142 | def set_proxy(self, request):
143 | """
144 | point the request at the current proxy, or the next valid one
145 | """
146 | proxy = self.proxyes[self.proxy_index]
147 | if not proxy["valid"]:
148 | self.inc_proxy_index()
149 | proxy = self.proxyes[self.proxy_index]
150 |
151 | if self.proxy_index == 0: # refresh self.last_no_proxy_time every time we download directly without a proxy
152 | self.last_no_proxy_time = datetime.now()
153 |
154 | if proxy["proxy"]:
155 | request.meta["proxy"] = proxy["proxy"]
156 | elif "proxy" in request.meta.keys():
157 | del request.meta["proxy"]
158 | request.meta["proxy_index"] = self.proxy_index
159 | proxy["count"] += 1
160 |
161 | def invalid_proxy(self, index):
162 | """
163 | mark the proxy at index as invalid,
164 | and move the current proxy_index on to the next valid proxy
165 | """
166 | if index < self.fixed_proxy: # trusted proxies are never marked invalid
167 | logger.info("fixed proxy will not be invalid: %s" % self.proxyes[index])
168 | self.inc_proxy_index(index)
169 | return
170 |
171 | if self.proxyes[index]["valid"]:
172 | logger.info("invalidate %s" % self.proxyes[index])
173 | self.proxyes[index]["valid"] = False
174 | if index == self.proxy_index:
175 | self.inc_proxy_index()
176 |
177 | if self.proxyes[index]["count"] < self.dump_count_threshold:
178 | self.dump_valid_proxy()
179 |
180 | def dump_valid_proxy(self):
181 | """
182 | save the valid proxies in the list back to the file
183 | """
184 | if self.dump_count_threshold <= 0:
185 | return
186 | logger.info("dumping proxyes to file")
187 | with open(self.proxy_file, "w") as fd:
188 | for i in range(self.fixed_proxy, len(self.proxyes)):
189 | p = self.proxyes[i]
190 | if p["valid"] or p["count"] >= self.dump_count_threshold:
191 | fd.write(p["proxy"]+"\n") # only save the valid proxies
192 |
193 | def process_request(self, request, spider):
194 | """
195 | make the request go through a proxy
196 | """
197 | if self.proxy_index > 0 and datetime.now() > (self.last_no_proxy_time + timedelta(minutes=self.recover_interval)):
198 | logger.info("After %d minutes, recovering from proxy back to direct connection" % self.recover_interval)
199 | self.last_no_proxy_time = datetime.now()
200 | self.proxy_index = 0
201 | request.meta["dont_redirect"] = True # some proxies redirect requests to some baffling address
202 |
203 | # the spider hit a parse error and asks for a different proxy
204 | if "change_proxy" in request.meta.keys() and request.meta["change_proxy"]:
205 | logger.info("change proxy request get by spider: %s" % request)
206 | self.invalid_proxy(request.meta["proxy_index"])
207 | request.meta["change_proxy"] = False
208 | self.set_proxy(request)
209 |
210 | def process_response(self, request, response, spider):
211 | """
212 | check response.status; depending on whether it is in the allowed status codes, switch to the next proxy or disable the current one
213 | """
214 | if "proxy" in request.meta.keys():
215 | logger.debug("%s %s %s" % (request.meta["proxy"], response.status, request.url))
216 | else:
217 | logger.debug("None %s %s" % (response.status, request.url))
218 |
219 | # if the status is not a normal 200 and is not in the list of statuses the spider declared
220 | # as possible during a normal crawl, consider the proxy invalid and switch proxies
221 | if response.status != 200 \
222 | and (not hasattr(spider, "website_possible_httpstatus_list") \
223 | or response.status not in spider.website_possible_httpstatus_list):
224 | logger.info("response status[%d] not in spider.website_possible_httpstatus_list" % response.status)
225 | self.invalid_proxy(request.meta["proxy_index"])
226 | new_request = request.copy()
227 | new_request.dont_filter = True
228 | return new_request
229 | else:
230 | return response
231 |
232 | def process_exception(self, request, exception, spider):
233 | """
234 | handle connection exceptions caused by going through a proxy
235 | """
236 | logger.debug("%s exception: %s" % (self.proxyes[request.meta["proxy_index"]]["proxy"], exception))
237 | request_proxy_index = request.meta["proxy_index"]
238 |
239 | # only compare when proxy_index > fixed_proxy-1, which guarantees that at least the local direct connection survives.
240 | if isinstance(exception, self.DONT_RETRY_ERRORS):
241 | if request_proxy_index > self.fixed_proxy - 1 and self.invalid_proxy_flag: # WARNING on a direct-connection timeout, switch to a proxy or just retry? A matter of policy
242 | if self.proxyes[request_proxy_index]["count"] < self.invalid_proxy_threshold:
243 | self.invalid_proxy(request_proxy_index)
244 | elif request_proxy_index == self.proxy_index: # it timed out, but a proxy that has worked well so far is not marked invalid
245 | self.inc_proxy_index()
246 | else: # simply switch without disabling
247 | if request.meta["proxy_index"] == self.proxy_index:
248 | self.inc_proxy_index()
249 | new_request = request.copy()
250 | new_request.dont_filter = True
251 | return new_request
252 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/HttpProxyMiddleware.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | import os
4 | import logging
5 | from datetime import datetime, timedelta
6 | from twisted.web._newclient import ResponseNeverReceived
7 | from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError
8 | import fetch_free_proxyes
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 | class HttpProxyMiddleware(object):
13 | # treat these error types as the proxy being unusable and handle them here instead of passing them on to the RetryMiddleware
14 | DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, ResponseNeverReceived, ConnectError, ValueError)
15 |
16 | def __init__(self, settings):
17 | # time of the last direct (no-proxy) connection
18 | self.last_no_proxy_time = datetime.now()
19 | # switch back to a direct connection after this many minutes, since going through proxies slows crawling down
20 | self.recover_interval = 20
21 | # a proxy that keeps timing out before its use count reaches this number is removed permanently. Set to 0 to never modify the proxy file.
22 | self.dump_count_threshold = 20
23 | # file holding the proxy list, one proxy per line in the form ip:port (note: no http:// prefix). The file gets rewritten, so keep a backup.
24 | self.proxy_file = "proxyes.dat"
25 | # whether to disable a proxy when it times out
26 | self.invalid_proxy_flag = True
27 | # when fewer valid proxies than this remain (direct connection included), fetch new ones from the web. Set it to however many proxies each IP needs
28 | # to rest after being asked for a captcha: rotating among ten usable proxies, for example, each IP only comes up again after several minutes and can serve some requests captcha-free.
29 | # If it is too small (say two), each IP gets banned after only a few requests, the spider ends up busy-waiting, and efficiency suffers.
30 | self.extend_proxy_threshold = 10
31 | # initialize the proxy list
32 | self.proxyes = [{"proxy": None, "valid": True, "count": 0}]
33 | # start with proxy 0, i.e. no proxy
34 | self.proxy_index = 0
35 | # number of trusted proxies (e.g. an HTTP proxy you run yourself) + 1 for the direct connection
36 | self.fixed_proxy = len(self.proxyes)
37 | # time of the last proxy fetch
38 | self.last_fetch_proxy_time = datetime.now()
39 | # force a fetch of new proxies after this many minutes
40 | self.fetch_proxy_interval = 120
41 | # a proxy about to be marked invalid is spared if it has already crawled more pages than this
42 | self.invalid_proxy_threshold = 200
43 | # load the initial proxies from the file
44 | if os.path.exists(self.proxy_file):
45 | with open(self.proxy_file, "r") as fd:
46 | lines = fd.readlines()
47 | for line in lines:
48 | line = line.strip()
49 | if not line or self.url_in_proxyes("http://" + line):
50 | continue
51 | self.proxyes.append({"proxy": "http://" + line,
52 | "valid": True,
53 | "count": 0})
54 |
55 | @classmethod
56 | def from_crawler(cls, crawler):
57 | return cls(crawler.settings)
58 |
59 | def url_in_proxyes(self, url):
60 | """
61 | return whether a proxy url is already in the proxy list
62 | """
63 | for p in self.proxyes:
64 | if url == p["proxy"]:
65 | return True
66 | return False
67 |
68 | def reset_proxyes(self):
69 | """
70 | reset every proxy whose count >= the dump threshold back to valid
71 | """
72 | logger.info("reset proxyes to valid")
73 | for p in self.proxyes:
74 | if p["count"] >= self.dump_count_threshold:
75 | p["valid"] = True
76 |
77 | def fetch_new_proxyes(self):
78 | """
79 | fetch new proxies from the web and add them to the proxy list
80 | """
81 | logger.info("extending proxyes using fetch_free_proxyes.py")
82 | new_proxyes = fetch_free_proxyes.fetch_all()
83 | logger.info("new proxyes: %s" % new_proxyes)
84 | self.last_fetch_proxy_time = datetime.now()
85 |
86 | for np in new_proxyes:
87 | if self.url_in_proxyes("http://" + np):
88 | continue
89 | else:
90 | self.proxyes.append({"proxy": "http://" + np,
91 | "valid": True,
92 | "count": 0})
93 | if self.len_valid_proxy() < self.extend_proxy_threshold: # hardly any new proxies could be fetched, so lower the threshold to avoid wasted effort
94 | self.extend_proxy_threshold -= 1
95 |
96 | def len_valid_proxy(self):
97 | """
98 | return the number of valid proxies in the list
99 | """
100 | count = 0
101 | for p in self.proxyes:
102 | if p["valid"]:
103 | count += 1
104 | return count
105 |
106 | def inc_proxy_index(self):
107 | """
108 | move the index of the proxy list to the next valid proxy
109 | if only the fixed_proxy entries are still valid, reset the proxy list
110 | if the configured interval since the last fetch has also passed, fetch new proxies
111 | """
112 | assert self.proxyes[0]["valid"]
113 | while True:
114 | self.proxy_index = (self.proxy_index + 1) % len(self.proxyes)
115 | if self.proxyes[self.proxy_index]["valid"]:
116 | break
117 |
118 | # two visits to proxy_index==0 within a very short interval means captcha thrashing, so extend the proxy list
119 | if self.proxy_index == 0 and datetime.now() < self.last_no_proxy_time + timedelta(minutes=2):
120 | logger.info("captcha thrashing")
121 | self.fetch_new_proxyes()
122 |
123 | if self.len_valid_proxy() <= self.fixed_proxy or self.len_valid_proxy() < self.extend_proxy_threshold: # not enough valid proxies left, reset them to valid
124 | self.reset_proxyes()
125 |
126 | if self.len_valid_proxy() < self.extend_proxy_threshold: # still not enough proxies, fetch new ones
127 | logger.info("valid proxy < threshold: %d/%d" % (self.len_valid_proxy(), self.extend_proxy_threshold))
128 | self.fetch_new_proxyes()
129 |
130 | logger.info("now using new proxy: %s" % self.proxyes[self.proxy_index]["proxy"])
131 |
132 | # after going a long time without new proxies we may be cycling through the current ones and hitting captchas over and over; force a fetch of new proxies
133 | #if datetime.now() > self.last_fetch_proxy_time + timedelta(minutes=self.fetch_proxy_interval):
134 | # logger.info("%d minutes since last fetch" % self.fetch_proxy_interval)
135 | # self.fetch_new_proxyes()
136 |
137 | def set_proxy(self, request):
138 | """
139 | point the request at the current proxy, or the next valid one
140 | """
141 | proxy = self.proxyes[self.proxy_index]
142 | if not proxy["valid"]:
143 | self.inc_proxy_index()
144 | proxy = self.proxyes[self.proxy_index]
145 |
146 | if self.proxy_index == 0: # refresh self.last_no_proxy_time every time we download directly without a proxy
147 | self.last_no_proxy_time = datetime.now()
148 |
149 | if proxy["proxy"]:
150 | request.meta["proxy"] = proxy["proxy"]
151 | elif "proxy" in request.meta.keys():
152 | del request.meta["proxy"]
153 | request.meta["proxy_index"] = self.proxy_index
154 | proxy["count"] += 1
155 |
156 | def invalid_proxy(self, index):
157 | """
158 | mark the proxy at index as invalid,
159 | and move the current proxy_index on to the next valid proxy
160 | """
161 | if index < self.fixed_proxy: # trusted proxies are never marked invalid
162 | self.inc_proxy_index()
163 | return
164 |
165 | if self.proxyes[index]["valid"]:
166 | logger.info("invalidate %s" % self.proxyes[index])
167 | self.proxyes[index]["valid"] = False
168 | if index == self.proxy_index:
169 | self.inc_proxy_index()
170 |
171 | if self.proxyes[index]["count"] < self.dump_count_threshold:
172 | self.dump_valid_proxy()
173 |
174 | def dump_valid_proxy(self):
175 | """
176 | save the valid proxies in the list back to the file
177 | """
178 | if self.dump_count_threshold <= 0:
179 | return
180 | logger.info("dumping proxyes to file")
181 | with open(self.proxy_file, "w") as fd:
182 | for i in range(self.fixed_proxy, len(self.proxyes)):
183 | p = self.proxyes[i]
184 | if p["valid"] or p["count"] >= self.dump_count_threshold:
185 | fd.write(p["proxy"][7:]+"\n") # only save the valid proxies (strip the leading http://)
186 |
187 | def process_request(self, request, spider):
188 | """
189 | make the request go through a proxy
190 | """
191 | if self.proxy_index > 0 and datetime.now() > (self.last_no_proxy_time + timedelta(minutes=self.recover_interval)):
192 | logger.info("After %d minutes, recovering from proxy back to direct connection" % self.recover_interval)
193 | self.last_no_proxy_time = datetime.now()
194 | self.proxy_index = 0
195 | request.meta["dont_redirect"] = True # some proxies redirect requests to some baffling address
196 |
197 | # the spider hit a parse error and asks for a different proxy
198 | if "change_proxy" in request.meta.keys() and request.meta["change_proxy"]:
199 | logger.info("change proxy request get by spider: %s" % request)
200 | self.invalid_proxy(request.meta["proxy_index"])
201 | request.meta["change_proxy"] = False
202 | self.set_proxy(request)
203 |
204 | def process_response(self, request, response, spider):
205 | """
206 | check response.status; depending on whether it is in the allowed status codes, switch to the next proxy or disable the current one
207 | """
208 | if "proxy" in request.meta.keys():
209 | logger.debug("%s %s %s" % (request.meta["proxy"], response.status, request.url))
210 | else:
211 | logger.debug("None %s %s" % (response.status, request.url))
212 |
213 | # if the status is not a normal 200 and is not in the list of statuses the spider declared
214 | # as possible during a normal crawl, consider the proxy invalid and switch proxies
215 | if response.status != 200 \
216 | and (not hasattr(spider, "website_possible_httpstatus_list") \
217 | or response.status not in spider.website_possible_httpstatus_list):
218 | logger.info("response status not in spider.website_possible_httpstatus_list")
219 | self.invalid_proxy(request.meta["proxy_index"])
220 | new_request = request.copy()
221 | new_request.dont_filter = True
222 | return new_request
223 | else:
224 | return response
225 |
226 | def process_exception(self, request, exception, spider):
227 | """
228 | handle connection exceptions caused by going through a proxy
229 | """
230 | logger.debug("%s exception: %s" % (self.proxyes[request.meta["proxy_index"]]["proxy"], exception))
231 | request_proxy_index = request.meta["proxy_index"]
232 |
233 | # only compare when proxy_index > fixed_proxy-1, which guarantees that at least the local direct connection survives.
234 | if isinstance(exception, self.DONT_RETRY_ERRORS):
235 | if request_proxy_index > self.fixed_proxy - 1 and self.invalid_proxy_flag: # WARNING on a direct-connection timeout, switch to a proxy or just retry? A matter of policy
236 | if self.proxyes[request_proxy_index]["count"] < self.invalid_proxy_threshold:
237 | self.invalid_proxy(request_proxy_index)
238 | elif request_proxy_index == self.proxy_index: # it timed out, but a proxy that has worked well so far is not marked invalid
239 | self.inc_proxy_index()
240 | else: # simply switch without disabling
241 | if request.meta["proxy_index"] == self.proxy_index:
242 | self.inc_proxy_index()
243 | new_request = request.copy()
244 | new_request.dont_filter = True
245 | return new_request
246 |
247 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kohn/HttpProxyMiddleware/b087518ee7c4c7f561c62511f4f6c922a414abbd/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/__init__.py
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/fetch_free_proxyes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | from bs4 import BeautifulSoup
4 | import urllib2
5 | import logging
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 | def get_html(url):
10 | request = urllib2.Request(url)
11 | request.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36")
12 | html = urllib2.urlopen(request)
13 | return html.read()
14 |
15 | def get_soup(url):
16 | soup = BeautifulSoup(get_html(url), "lxml")
17 | return soup
18 |
19 | def fetch_kxdaili(page):
20 | """
21 | fetch free proxies from www.kxdaili.com
22 | """
23 | proxyes = []
24 | try:
25 | url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
26 | soup = get_soup(url)
27 | table_tag = soup.find("table", attrs={"class": "segment"})
28 | trs = table_tag.tbody.find_all("tr")
29 | for tr in trs:
30 | tds = tr.find_all("td")
31 | ip = tds[0].text
32 | port = tds[1].text
33 | latency = tds[4].text.split(" ")[0]
34 | if float(latency) < 0.5: # only output proxies with latency under 0.5 seconds
35 | proxy = "%s:%s" % (ip, port)
36 | proxyes.append(proxy)
37 | except:
38 | logger.warning("fail to fetch from kxdaili")
39 | return proxyes
40 |
41 | def img2port(img_url):
42 | """
43 | mimvp.com renders port numbers as images; this function maps the image url to a port. The current stopgap approach is not accurate.
44 | """
45 | code = img_url.split("=")[-1]
46 | if code.find("AO0OO0O")>0:
47 | return 80
48 | else:
49 | return None
50 |
51 | def fetch_mimvp():
52 | """
53 | fetch free proxies from http://proxy.mimvp.com/free.php
54 | """
55 | proxyes = []
56 | try:
57 | url = "http://proxy.mimvp.com/free.php?proxy=in_hp"
58 | soup = get_soup(url)
59 | table = soup.find("div", attrs={"id": "list"}).table
60 | tds = table.tbody.find_all("td")
61 | for i in range(0, len(tds), 10):
62 | id = tds[i].text
63 | ip = tds[i+1].text
64 | port = img2port(tds[i+2].img["src"])
65 | response_time = tds[i+7]["title"][:-1]
66 | transport_time = tds[i+8]["title"][:-1]
67 | if port is not None and float(response_time) < 1 :
68 | proxy = "%s:%s" % (ip, port)
69 | proxyes.append(proxy)
70 | except:
71 | logger.warning("fail to fetch from mimvp")
72 | return proxyes
73 |
74 | def fetch_xici():
75 | """
76 | http://www.xicidaili.com/nn/
77 | """
78 | proxyes = []
79 | try:
80 | url = "http://www.xicidaili.com/nn/"
81 | soup = get_soup(url)
82 | table = soup.find("table", attrs={"id": "ip_list"})
83 | trs = table.find_all("tr")
84 | for i in range(1, len(trs)):
85 | tr = trs[i]
86 | tds = tr.find_all("td")
87 | ip = tds[2].text
88 | port = tds[3].text
89 | speed = tds[7].div["title"][:-1]
90 | latency = tds[8].div["title"][:-1]
91 | if float(speed) < 3 and float(latency) < 1:
92 | proxyes.append("%s:%s" % (ip, port))
93 | except:
94 | logger.warning("fail to fetch from xici")
95 | return proxyes
96 |
97 | def fetch_ip181():
98 | """
99 | http://www.ip181.com/
100 | """
101 | proxyes = []
102 | try:
103 | url = "http://www.ip181.com/"
104 | soup = get_soup(url)
105 | table = soup.find("table")
106 | trs = table.find_all("tr")
107 | for i in range(1, len(trs)):
108 | tds = trs[i].find_all("td")
109 | ip = tds[0].text
110 | port = tds[1].text
111 | latency = tds[4].text[:-2]
112 | if float(latency) < 1:
113 | proxyes.append("%s:%s" % (ip, port))
114 | except Exception as e:
115 | logger.warning("fail to fetch from ip181: %s" % e)
116 | return proxyes
117 |
118 | def fetch_httpdaili():
119 | """
120 | http://www.httpdaili.com/mfdl/
121 | updated fairly frequently
122 | """
123 | proxyes = []
124 | try:
125 | url = "http://www.httpdaili.com/mfdl/"
126 | soup = get_soup(url)
127 | table = soup.find("div", attrs={"kb-item-wrap11"}).table
128 | trs = table.find_all("tr")
129 | for i in range(1, len(trs)):
130 | try:
131 | tds = trs[i].find_all("td")
132 | ip = tds[0].text
133 | port = tds[1].text
134 | type = tds[2].text
135 | if type == u"匿名":
136 | proxyes.append("%s:%s" % (ip, port))
137 | except:
138 | pass
139 | except Exception as e:
140 | logger.warning("fail to fetch from httpdaili: %s" % e)
141 | return proxyes
142 |
143 | def fetch_66ip():
144 | """
145 | http://www.66ip.cn/
146 | every request to this link returns a fresh batch of proxies; speed is not guaranteed
147 | """
148 | proxyes = []
149 | try:
150 | # change getnum to fetch a different number of proxies per request
151 | url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip"
152 | content = get_html(url)
153 | urls = content.split("<br />") # the original markup-based split was garbled in extraction; assuming the API separates proxies with <br /> tags
154 | for u in urls:
155 | if u.strip():
156 | proxyes.append(u.strip())
157 | except Exception as e:
158 | logger.warning("fail to fetch from 66ip: %s" % e)
159 | return proxyes
160 |
161 |
162 |
163 | def check(proxy):
164 | import urllib2
165 | url = "http://www.baidu.com/js/bdsug.js?v=1.0.3.0"
166 | proxy_handler = urllib2.ProxyHandler({'http': "http://" + proxy})
167 | opener = urllib2.build_opener(proxy_handler,urllib2.HTTPHandler)
168 | try:
169 | response = opener.open(url,timeout=3)
170 | return response.code == 200
171 | except Exception:
172 | return False
173 |
174 | def fetch_all(endpage=2):
175 | proxyes = []
176 | for i in range(1, endpage):
177 | proxyes += fetch_kxdaili(i)
178 | proxyes += fetch_mimvp()
179 | proxyes += fetch_xici()
180 | proxyes += fetch_ip181()
181 | proxyes += fetch_httpdaili()
182 | proxyes += fetch_66ip()
183 | valid_proxyes = []
184 | logger.info("checking proxyes validation")
185 | for p in proxyes:
186 | if check(p):
187 | valid_proxyes.append(p)
188 | return valid_proxyes
189 |
190 | if __name__ == '__main__':
191 | import sys
192 | root_logger = logging.getLogger("")
193 | stream_handler = logging.StreamHandler(sys.stdout)
194 | formatter = logging.Formatter('%(name)-8s %(asctime)s %(levelname)-8s %(message)s', '%a, %d %b %Y %H:%M:%S',)
195 | stream_handler.setFormatter(formatter)
196 | root_logger.addHandler(stream_handler)
197 | logger = logging.getLogger(__name__)
198 | logger.setLevel(logging.DEBUG)
199 | proxyes = fetch_all()
200 | #print check("202.29.238.242:3128")
201 | for p in proxyes:
202 | print p
203 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class HttpproxymiddlewaretestItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class HttpproxymiddlewaretestPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/proxyes.data:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kohn/HttpProxyMiddleware/b087518ee7c4c7f561c62511f4f6c922a414abbd/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/proxyes.data
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for HttpProxyMiddlewareTest project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'HttpProxyMiddlewareTest'
13 |
14 | SPIDER_MODULES = ['HttpProxyMiddlewareTest.spiders']
15 | NEWSPIDER_MODULE = 'HttpProxyMiddlewareTest.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'HttpProxyMiddlewareTest (+http://www.yourdomain.com)'
20 |
21 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
22 | #CONCURRENT_REQUESTS=32
23 |
24 | # Configure a delay for requests for the same website (default: 0)
25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
26 | # See also autothrottle settings and docs
27 | #DOWNLOAD_DELAY=3
28 | # The download delay setting will honor only one of:
29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16
30 | #CONCURRENT_REQUESTS_PER_IP=16
31 |
32 | # Disable cookies (enabled by default)
33 | #COOKIES_ENABLED=False
34 |
35 | # Disable Telnet Console (enabled by default)
36 | #TELNETCONSOLE_ENABLED=False
37 |
38 | # Override the default request headers:
39 | #DEFAULT_REQUEST_HEADERS = {
40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
41 | # 'Accept-Language': 'en',
42 | #}
43 |
44 | # Enable or disable spider middlewares
45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
46 | #SPIDER_MIDDLEWARES = {
47 | # 'HttpProxyMiddlewareTest.middlewares.MyCustomSpiderMiddleware': 543,
48 | #}
49 |
50 | # Enable or disable downloader middlewares
51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
52 | DOWNLOADER_MIDDLEWARES = {
53 | 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
54 | 'HttpProxyMiddlewareTest.HttpProxyMiddleware.HttpProxyMiddleware': 543,
55 | }
56 |
57 | # LOG_LEVEL = "INFO"
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'HttpProxyMiddlewareTest.pipelines.SomePipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
74 | #AUTOTHROTTLE_ENABLED=True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY=5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY=60
79 | # Enable showing throttling stats for every response received:
80 | #AUTOTHROTTLE_DEBUG=False
81 |
82 | # Enable and configure HTTP caching (disabled by default)
83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
84 | #HTTPCACHE_ENABLED=True
85 | #HTTPCACHE_EXPIRATION_SECS=0
86 | #HTTPCACHE_DIR='httpcache'
87 | #HTTPCACHE_IGNORE_HTTP_CODES=[]
88 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
89 |
90 | DOWNLOAD_TIMEOUT = 10
91 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/spiders/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.http import Request
4 | import logging
5 |
6 | logger = logging.getLogger("test spider")
7 |
8 | class TestSpider(scrapy.Spider):
9 | name = "test"
10 | allowed_domains = ["103.243.24.223"]
11 | website_possible_httpstatus_list = [403]
12 | handle_httpstatus_list = [403]
13 |
14 | start_urls = (
15 | 'http://103.243.24.223:8000',
16 | )
17 |
18 | def parse(self, response):
19 | if response.body == "banned":
20 | req = response.request
21 | req.meta["change_proxy"] = True
22 | yield req
23 | else:
24 | logger.info("got page: %s" % response.body)
25 | yield response.request
26 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/README.org:
--------------------------------------------------------------------------------
1 | * HttpProxyMiddlewareTest
2 |
3 | A scrapy project used to test [[https://github.com/kohn/HttpProxyMiddleware][HttpProxyMiddleware]].
4 |
5 | * Usage
6 |
7 | Update the HttpProxyMiddleware path in
8 | HttpProxyMiddlewareTest/settings.py, as in the snippet below.
9 |
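The test project already wires the middleware in like this (from
HttpProxyMiddlewareTest/settings.py):

#+BEGIN_SRC python
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'HttpProxyMiddlewareTest.HttpProxyMiddleware.HttpProxyMiddleware': 543,
}
#+END_SRC
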
10 | Run Command:
11 | : scrapy crawl test
12 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/proxyes.dat:
--------------------------------------------------------------------------------
1 | 111.206.190.155:80
2 | 182.140.132.107:8888
3 | 60.206.233.2:3128
4 | 61.174.10.22:8080
5 | 52.40.124.145:8083
6 | 59.173.187.179:8090
7 | 113.246.216.163:8888
8 | 117.64.150.51:8998
9 | 117.64.226.122:8998
10 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = HttpProxyMiddlewareTest.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = HttpProxyMiddlewareTest
12 |
--------------------------------------------------------------------------------
/IPBanTest/IPBanTest/README.org:
--------------------------------------------------------------------------------
1 | * IPBanTest
2 |
3 | A Django-powered website for testing [[https://github.com/kohn/HttpProxyMiddleware][HttpProxyMiddleware]].
4 |
--------------------------------------------------------------------------------
/IPBanTest/IPBanTest/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kohn/HttpProxyMiddleware/b087518ee7c4c7f561c62511f4f6c922a414abbd/IPBanTest/IPBanTest/__init__.py
--------------------------------------------------------------------------------
/IPBanTest/IPBanTest/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for IPBanTest project.
3 |
4 | Generated by 'django-admin startproject' using Django 1.9.5.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.9/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/1.9/ref/settings/
11 | """
12 |
13 | import os
14 |
15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 |
18 |
19 | # Quick-start development settings - unsuitable for production
20 | # See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/
21 |
22 | # SECURITY WARNING: keep the secret key used in production secret!
23 | SECRET_KEY = '-b-4g9o*37*@x_!uzc@ft4u6wxxegi-luwxfp)s4_f7@&s=mt%'
24 |
25 | # SECURITY WARNING: don't run with debug turned on in production!
26 | DEBUG = False
27 |
28 | ALLOWED_HOSTS = ["103.243.24.223"]
29 |
30 |
31 | # Application definition
32 |
33 | INSTALLED_APPS = [
34 | 'django.contrib.admin',
35 | 'django.contrib.auth',
36 | 'django.contrib.contenttypes',
37 | 'django.contrib.sessions',
38 | 'django.contrib.messages',
39 | 'django.contrib.staticfiles',
40 | ]
41 |
42 | MIDDLEWARE_CLASSES = [
43 | 'django.middleware.cache.UpdateCacheMiddleware',
44 | 'django.middleware.common.CommonMiddleware',
45 | 'django.middleware.cache.FetchFromCacheMiddleware',
46 | 'django.middleware.security.SecurityMiddleware',
47 | 'django.contrib.sessions.middleware.SessionMiddleware',
48 | 'django.middleware.csrf.CsrfViewMiddleware',
49 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
50 | 'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
51 | 'django.contrib.messages.middleware.MessageMiddleware',
52 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
53 | ]
54 |
55 | ROOT_URLCONF = 'IPBanTest.urls'
56 |
57 | TEMPLATES = [
58 | {
59 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
60 | 'DIRS': [],
61 | 'APP_DIRS': True,
62 | 'OPTIONS': {
63 | 'context_processors': [
64 | 'django.template.context_processors.debug',
65 | 'django.template.context_processors.request',
66 | 'django.contrib.auth.context_processors.auth',
67 | 'django.contrib.messages.context_processors.messages',
68 | ],
69 | },
70 | },
71 | ]
72 |
73 | WSGI_APPLICATION = 'IPBanTest.wsgi.application'
74 |
75 | CACHES = {
76 | 'default': {
77 | 'BACKEND': 'django.core.cache.backends.locmem.LocMemCache',
78 | 'LOCATION': 'unique-snowflake',
79 | }
80 | }
81 |
82 | # Database
83 | # https://docs.djangoproject.com/en/1.9/ref/settings/#databases
84 |
85 | # DATABASES = {
86 | # 'default': {
87 | # 'ENGINE': 'django.db.backends.sqlite3',
88 | # 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
89 | # }
90 | # }
91 |
92 |
93 | # Password validation
94 | # https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators
95 |
96 | AUTH_PASSWORD_VALIDATORS = [
97 | {
98 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
99 | },
100 | {
101 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
102 | },
103 | {
104 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
105 | },
106 | {
107 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
108 | },
109 | ]
110 |
111 |
112 | # Internationalization
113 | # https://docs.djangoproject.com/en/1.9/topics/i18n/
114 |
115 | LANGUAGE_CODE = 'en-us'
116 |
117 | TIME_ZONE = 'UTC'
118 |
119 | USE_I18N = True
120 |
121 | USE_L10N = True
122 |
123 | USE_TZ = True
124 |
125 |
126 | # Static files (CSS, JavaScript, Images)
127 | # https://docs.djangoproject.com/en/1.9/howto/static-files/
128 |
129 | STATIC_URL = '/static/'
130 |
131 | DOWNLOAD_TIMEOUT = 10
132 |
--------------------------------------------------------------------------------
/IPBanTest/IPBanTest/urls.py:
--------------------------------------------------------------------------------
1 | """IPBanTest URL Configuration
2 |
3 | The `urlpatterns` list routes URLs to views. For more information please see:
4 | https://docs.djangoproject.com/en/1.9/topics/http/urls/
5 | Examples:
6 | Function views
7 | 1. Add an import: from my_app import views
8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home')
9 | Class-based views
10 | 1. Add an import: from other_app.views import Home
11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home')
12 | Including another URLconf
13 | 1. Import the include() function: from django.conf.urls import url, include
14 | 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls'))
15 | """
16 | from django.conf.urls import url
17 | from IPBanTest import views
18 |
19 | urlpatterns = [
20 | url(r'^/?$', views.index),
21 | ]
22 |
--------------------------------------------------------------------------------
/IPBanTest/IPBanTest/views.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from django.http import HttpResponseForbidden
3 | from django.http import JsonResponse
4 | from django.views.decorators.cache import cache_page
5 | from threading import Timer
6 | import logging
7 |
8 | logger = logging.getLogger("IPBanTest")
9 |
10 | ip_count = {}
11 |
12 | def clear_ip():
13 | logger.info("clear ip_count")
14 | ip_count.clear()
15 | Timer(10*60, clear_ip, ()).start() # clear every 10 min
16 |
17 | Timer(10, clear_ip, ()).start()
18 |
19 | @cache_page(0)
20 | def index(request):
21 | ip = request.META.get("REMOTE_ADDR")
22 | if ip in ip_count.keys():
23 | ip_count[ip] += 1
24 | else:
25 | ip_count[ip] = 0
26 |
27 | message = {}
28 | if ip_count[ip] > 5:
29 | return HttpResponseForbidden("banned")
30 | else:
31 | message["ip"] = ip
32 | message["count"] = ip_count[ip]
33 | return JsonResponse(message)
34 |
--------------------------------------------------------------------------------
/IPBanTest/IPBanTest/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for IPBanTest project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "IPBanTest.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/IPBanTest/db.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kohn/HttpProxyMiddleware/b087518ee7c4c7f561c62511f4f6c922a414abbd/IPBanTest/db.sqlite3
--------------------------------------------------------------------------------
/IPBanTest/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "IPBanTest.settings")
7 |
8 | from django.core.management import execute_from_command_line
9 |
10 | execute_from_command_line(sys.argv)
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Kohn
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
1 | * HttpProxyMiddleware
2 |
3 | A middleware for Scrapy that switches the HTTP proxy from time to time.
4 |
5 | Initial proxies are stored in a file. At runtime, the middleware
6 | will fetch new proxies whenever it runs short of valid ones.
7 |
8 | Related blog: [[http://www.kohn.com.cn/wordpress/?p=208]]
9 |
10 |
11 | ** fetch_free_proxyes.py
12 | Used to fetch free proxies from the Internet. Modify it yourself
13 | as needed.
14 |
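The script can also be run on its own (it needs bs4 and lxml) to print
the proxies it currently finds, together with its log output:

: python fetch_free_proxyes.py
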
15 | ** Usage
16 |
17 | *** settings.py
18 |
19 | #+BEGIN_SRC python
20 | DOWNLOADER_MIDDLEWARES = {
21 | 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
22 | 'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 351,
23 | # put this middleware after RetryMiddleware
24 | 'crawler.middleware.HttpProxyMiddleware': 999,
25 | }
26 |
27 | DOWNLOAD_TIMEOUT = 10 # 10-15 seconds is a reasonable timeout, found experimentally
28 | #+END_SRC
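
The root HttpProxyMiddleware.py additionally reads a boolean
HTTPS_PROXY setting in its from_crawler method to decide whether HTTP
or HTTPS proxies are fetched, so a line like the following may be
added as well:

#+BEGIN_SRC python
HTTPS_PROXY = False  # set True to collect and use HTTPS proxies
#+END_SRC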
29 |
30 | *** change proxy
31 |
32 | Often we want to switch to a new proxy when our spider gets banned.
33 | Just detect that your IP has been banned and yield a new Request in
34 | your Spider.parse method with:
35 |
36 | #+BEGIN_SRC python
37 | request.meta["change_proxy"] = True
38 | #+END_SRC
39 |
40 | Some proxy may return invalid HTML code. So if you get any exception
41 | during parsing response, also yield a new request with:
42 |
43 | #+BEGIN_SRC python
44 | request.meta["change_proxy"] = True
45 | #+END_SRC
46 |
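A minimal parse callback along the lines of the bundled test spider
could look like this; how you detect a ban is of course site-specific:

#+BEGIN_SRC python
def parse(self, response):
    if response.body == "banned":        # site-specific ban detection
        req = response.request
        req.meta["change_proxy"] = True  # ask the middleware for another proxy
        yield req
    else:
        pass                             # parse the page as usual
#+END_SRC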
47 |
48 | *** spider.py
49 |
50 | Your spider should declare the list of status codes it may
51 | legitimately encounter during crawling. Any status code that is
52 | neither 200 nor in that list is treated as the result of an invalid
53 | proxy, and that proxy is discarded. For example:
54 |
55 | #+BEGIN_SRC python
56 | website_possible_httpstatus_list = [404]
57 | #+END_SRC
58 |
59 | This line tells the middleware that the website you are crawling may
60 | legitimately return a response with status code 404, and that the
61 | proxy used for such a request should not be discarded.
62 |
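The bundled test spider declares this list together with Scrapy's own
handle_httpstatus_list, so that such responses are not filtered out
before they reach your callbacks; a sketch (MySpider is a placeholder
name):

#+BEGIN_SRC python
class MySpider(scrapy.Spider):
    website_possible_httpstatus_list = [403]
    handle_httpstatus_list = [403]
#+END_SRC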
63 |
64 | ** Test
65 |
66 | Update HttpProxyMiddleware.py path in
67 | HttpProxyMiddlewareTest/settings.py.
68 |
69 |
70 | #+BEGIN_SRC sh
71 | cd HttpProxyMiddlewareTest
72 | scrapy crawl test
73 | #+END_SRC
74 |
75 |
76 | The testing server is hosted on my VPS, so take it easy... DO NOT
77 | waste too much of my data plan.
78 |
79 | You may start your own testing server using IPBanTest which is powered
80 | by Django.
81 |
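A local instance can be started with the usual Django development
server, after adjusting ALLOWED_HOSTS in IPBanTest/IPBanTest/settings.py
to your own host:

: cd IPBanTest
: python manage.py runserver 0.0.0.0:8000
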
--------------------------------------------------------------------------------
/fetch_free_proxyes.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | from bs4 import BeautifulSoup
4 | import urllib.request, urllib.error, urllib.parse
5 | import logging
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 | def get_html(url):
10 | request = urllib.request.Request(url)
11 | request.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36")
12 | html = urllib.request.urlopen(request)
13 | return html.read()
14 |
15 | def get_soup(url):
16 | soup = BeautifulSoup(get_html(url), "lxml")
17 | return soup
18 |
19 | def fetch_kxdaili(page, https):
20 | """
21 | fetch free proxies from www.kxdaili.com
22 | """
23 | proxyes = []
24 | try:
25 | url = "http://www.kxdaili.com/dailiip/1/%d.html" % page
26 | soup = get_soup(url)
27 | table_tag = soup.find("table", attrs={"class": "segment"})
28 | trs = table_tag.tbody.find_all("tr")
29 | for tr in trs:
30 | tds = tr.find_all("td")
31 | ip = tds[0].text
32 | port = tds[1].text
33 | types = tds[2].text
34 | if https and "HTTPS" not in types:
35 | continue
36 | latency = tds[4].text.split(" ")[0]
37 | if float(latency) < 0.5: # only output proxies with latency under 0.5 seconds
38 | if https:
39 | proxy = "https://%s:%s" % (ip, port)
40 | else:
41 | proxy = "http://%s:%s" % (ip, port)
42 | proxyes.append(proxy)
43 | except Exception as e:
44 | logger.warning(e)
45 | logger.warning("fail to fetch from kxdaili")
46 | return proxyes
47 |
48 | def img2port(img_url):
49 | """
50 | mimvp.com renders port numbers as images; this function maps the image url to a port. The current stopgap approach is not accurate.
51 | """
52 | code = img_url.split("=")[-1]
53 | if code.find("AO0OO0O")>0:
54 | return 80
55 | else:
56 | return None
57 |
58 | def fetch_mimvp(https):
59 | """
60 | fetch free proxies from http://proxy.mimvp.com/free.php
61 | """
62 | proxyes = []
63 | if https:
64 | return proxyes
65 | try:
66 | url = "http://proxy.mimvp.com/free.php?proxy=in_hp"
67 | soup = get_soup(url)
68 | table = soup.find("div", attrs={"id": "list"}).table
69 | tds = table.tbody.find_all("td")
70 | for i in range(0, len(tds), 10):
71 | id = tds[i].text
72 | ip = tds[i+1].text
73 | port = img2port(tds[i+2].img["src"])
74 | response_time = tds[i+7]["title"][:-1]
75 | transport_time = tds[i+8]["title"][:-1]
76 | if port is not None and float(response_time) < 1 :
77 | proxy = "%s:%s" % (ip, port)
78 | proxyes.append(proxy)
79 | except:
80 | logger.warning("fail to fetch from mimvp")
81 | return proxyes
82 |
83 | def fetch_xici(https):
84 | """
85 | http://www.xicidaili.com/nn/
86 | """
87 | proxyes = []
88 | try:
89 | url = "http://www.xicidaili.com/nn/"
90 | soup = get_soup(url)
91 | table = soup.find("table", attrs={"id": "ip_list"})
92 | trs = table.find_all("tr")
93 | for i in range(1, len(trs)):
94 | tr = trs[i]
95 | tds = tr.find_all("td")
96 | ip = tds[1].text
97 | port = tds[2].text
98 | if https and tds[5].text.strip()!="HTTPS":
99 | continue
100 | speed = tds[6].div["title"][:-1]
101 | latency = tds[7].div["title"][:-1]
102 | if float(speed) < 3 and float(latency) < 1:
103 | if https:
104 | proxyes.append("https://%s:%s" % (ip, port))
105 | else:
106 | proxyes.append("http://%s:%s" % (ip, port))
107 | except:
108 | logger.warning("fail to fetch from xici")
109 | return proxyes
110 |
111 | def fetch_ip181(https):
112 | """
113 | http://www.ip181.com/
114 | """
115 | proxyes = []
116 | try:
117 | url = "http://www.ip181.com/"
118 | soup = get_soup(url)
119 | table = soup.find("table")
120 | trs = table.find_all("tr")
121 | for i in range(1, len(trs)):
122 | tds = trs[i].find_all("td")
123 | ip = tds[0].text
124 | port = tds[1].text
125 | if https and "HTTPS" not in tds[3].text:
126 | continue
127 | latency = tds[4].text[:-2]
128 | if float(latency) < 1:
129 | if https:
130 | proxyes.append("https://%s:%s" % (ip, port))
131 | else:
132 | proxyes.append("http://%s:%s" % (ip, port))
133 | except Exception as e:
134 | logger.warning("fail to fetch from ip181: %s" % e)
135 | return proxyes
136 |
137 | def fetch_httpdaili(https):
138 | """
139 | http://www.httpdaili.com/mfdl/
140 | updated fairly frequently
141 | """
142 | proxyes = []
143 | if https:
144 | return proxyes
145 | try:
146 | url = "http://www.httpdaili.com/mfdl/"
147 | soup = get_soup(url)
148 | table = soup.find("div", attrs={"kb-item-wrap11"}).table
149 | trs = table.find_all("tr")
150 | for i in range(1, len(trs)):
151 | try:
152 | tds = trs[i].find_all("td")
153 | ip = tds[0].text
154 | port = tds[1].text
155 | type = tds[2].text
156 | if type == "匿名":
157 | proxyes.append("%s:%s" % (ip, port))
158 | except:
159 | pass
160 | except Exception as e:
161 | logger.warning("fail to fetch from httpdaili: %s" % e)
162 | return proxyes
163 |
164 | def fetch_66ip(https):
165 | """
166 | http://www.66ip.cn/
167 | every request to this link returns a fresh batch of proxies; speed is not guaranteed
168 | """
169 | proxyes = []
170 | try:
171 | # change getnum to fetch a different number of proxies per request
172 | if https:
173 | url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=1&api=66ip"
174 | else:
175 | url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip"
176 | content = get_html(url)
177 | content = str(content)
178 | urls = content.split("<br />") # the original wrapper-stripping splits were garbled in extraction; assuming the API separates proxies with <br /> tags
179 | for u in urls:
180 | u = u.split("\\t")[-1]
181 | if u.strip():
182 | if https:
183 | proxyes.append("https://" + u.strip())
184 | else:
185 | proxyes.append("http://" + u.strip())
186 |
187 | except Exception as e:
188 | logger.warning("fail to fetch from 66ip: %s" % e)
189 | return proxyes
190 |
191 |
192 |
193 | def check(proxy):
194 | import urllib.request, urllib.error, urllib.parse
195 |
196 | if proxy.startswith("https"):
197 | url = "https://www.baidu.com/js/bdsug.js?v=1.0.3.0"
198 | proxy_handler = urllib.request.ProxyHandler({'https': proxy})
199 | else:
200 | url = "http://www.baidu.com/js/bdsug.js?v=1.0.3.0"
201 | proxy_handler = urllib.request.ProxyHandler({'http': proxy})
202 | opener = urllib.request.build_opener(proxy_handler,urllib.request.HTTPHandler)
203 | try:
204 | response = opener.open(url, timeout=3)
205 | return response.code == 200 and response.url == url
206 | except Exception:
207 | return False
208 |
209 | def fetch_all(endpage=2, https=False):
210 | proxyes = []
211 | for i in range(1, endpage):
212 | proxyes += fetch_kxdaili(i, https=https)
213 | proxyes += fetch_mimvp(https)
214 | proxyes += fetch_xici(https)
215 | proxyes += fetch_ip181(https)
216 | proxyes += fetch_httpdaili(https)
217 | proxyes += fetch_66ip(https)
218 | valid_proxyes = []
219 | logger.info("checking proxyes validation")
220 | for p in proxyes:
221 | if check(p):
222 | valid_proxyes.append(p)
223 | return valid_proxyes
224 |
225 | if __name__ == '__main__':
226 | import sys
227 | root_logger = logging.getLogger("")
228 | stream_handler = logging.StreamHandler(sys.stdout)
229 | formatter = logging.Formatter('%(name)-8s %(asctime)s %(levelname)-8s %(message)s', '%a, %d %b %Y %H:%M:%S',)
230 | stream_handler.setFormatter(formatter)
231 | root_logger.addHandler(stream_handler)
232 | logger = logging.getLogger(__name__)
233 | logger.setLevel(logging.DEBUG)
234 | proxyes = fetch_66ip(https=True)
235 | #print check("202.29.238.242:3128")
236 | for p in proxyes:
237 | print(p)
238 |
--------------------------------------------------------------------------------