├── .gitignore ├── HttpProxyMiddleware.py ├── HttpProxyMiddlewareTest ├── HttpProxyMiddlewareTest │ ├── HttpProxyMiddleware.py │ ├── __init__.py │ ├── fetch_free_proxyes.py │ ├── items.py │ ├── pipelines.py │ ├── proxyes.data │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── test.py ├── README.org ├── proxyes.dat └── scrapy.cfg ├── IPBanTest ├── IPBanTest │ ├── README.org │ ├── __init__.py │ ├── settings.py │ ├── urls.py │ ├── views.py │ └── wsgi.py ├── db.sqlite3 └── manage.py ├── LICENSE ├── README.org └── fetch_free_proxyes.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | cookie 59 | index.html 60 | bak.py 61 | -------------------------------------------------------------------------------- /HttpProxyMiddleware.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import logging 5 | from datetime import datetime, timedelta 6 | from twisted.web._newclient import ResponseNeverReceived 7 | from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError 8 | from crawler import fetch_free_proxyes 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class HttpProxyMiddleware(object): 13 | # 遇到这些类型的错误直接当做代理不可用处理掉, 不再传给retrymiddleware 14 | DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, ResponseNeverReceived, ConnectError, ValueError) 15 | 16 | def __init__(self, use_https): 17 | # 保存上次不用代理直接连接的时间点 18 | self.last_no_proxy_time = datetime.now() 19 | # 一定分钟数后切换回不用代理, 因为用代理影响到速度 20 | self.recover_interval = 20 21 | # 一个proxy如果没用到这个数字就被发现老是超时, 则永久移除该proxy. 设为0则不会修改代理文件. 22 | self.dump_count_threshold = 20 23 | # 存放代理列表的文件, 每行一个代理, 格式为proto://ip:port, 这个文件会被修改, 注意备份 24 | self.proxy_file = "proxyes.dat" 25 | # 是否在超时的情况下禁用代理 26 | self.invalid_proxy_flag = True 27 | # 当有效代理小于这个数时(包括直连), 从网上抓取新的代理, 可以将这个数设为为了满足每个ip被要求输入验证码后得到足够休息时间所需要的代理数 28 | # 例如爬虫在十个可用代理之间切换时, 每个ip经过数分钟才再一次轮到自己, 这样就能get一些请求而不用输入验证码. 
29 | # 如果这个数过小, 例如两个, 爬虫用A ip爬了没几个就被ban, 换了一个又爬了没几次就被ban, 这样整个爬虫就会处于一种忙等待的状态, 影响效率 30 | self.extend_proxy_threshold = 10 31 | # 初始化代理列表 32 | self.proxyes = [{"proxy": None, "valid": True, "count": 0}] 33 | # 初始时使用0号代理(即无代理) 34 | self.proxy_index = 0 35 | # 表示可信代理的数量(如自己搭建的HTTP代理)+1(不用代理直接连接) 36 | self.fixed_proxy = len(self.proxyes) 37 | # 上一次抓新代理的时间 38 | self.last_fetch_proxy_time = datetime.now() 39 | # 每隔固定时间强制抓取新代理(min) 40 | self.fetch_proxy_interval = 120 41 | # 一个将被设为invalid的代理如果已经成功爬取大于这个参数的页面, 将不会被invalid 42 | self.invalid_proxy_threshold = 200 43 | # 使用http代理还是https代理 44 | self.use_https = use_https 45 | # 从文件读取初始代理 46 | if os.path.exists(self.proxy_file): 47 | with open(self.proxy_file, "r") as fd: 48 | lines = fd.readlines() 49 | for line in lines: 50 | line = line.strip() 51 | if not line or self.url_in_proxyes(line): 52 | continue 53 | self.proxyes.append({"proxy": line, 54 | "valid": True, 55 | "count": 0}) 56 | 57 | @classmethod 58 | def from_crawler(cls, crawler): 59 | use_https = crawler.settings.getbool('HTTPS_PROXY') 60 | return cls(use_https) 61 | 62 | def url_in_proxyes(self, url): 63 | """ 64 | 返回一个代理url是否在代理列表中 65 | """ 66 | for p in self.proxyes: 67 | if url == p["proxy"]: 68 | return True 69 | return False 70 | 71 | def reset_proxyes(self): 72 | """ 73 | 将所有count>=指定阈值的代理重置为valid, 74 | """ 75 | logger.info("reset proxyes to valid") 76 | for p in self.proxyes: 77 | if p["count"] >= self.dump_count_threshold: 78 | p["valid"] = True 79 | 80 | def fetch_new_proxyes(self): 81 | """ 82 | 从网上抓取新的代理添加到代理列表中 83 | """ 84 | logger.info("extending proxyes using fetch_free_proxyes.py") 85 | new_proxyes = fetch_free_proxyes.fetch_all(https=self.use_https) 86 | logger.info("new proxyes: %s" % new_proxyes) 87 | self.last_fetch_proxy_time = datetime.now() 88 | 89 | for np in new_proxyes: 90 | if self.url_in_proxyes(np): 91 | continue 92 | else: 93 | self.proxyes.append({"proxy": np, 94 | "valid": True, 95 | "count": 0}) 96 | if self.len_valid_proxy() < self.extend_proxy_threshold: # 如果发现抓不到什么新的代理了, 缩小threshold以避免白费功夫 97 | self.extend_proxy_threshold -= 1 98 | 99 | def len_valid_proxy(self): 100 | """ 101 | 返回proxy列表中有效的代理数量 102 | """ 103 | count = 0 104 | for p in self.proxyes: 105 | if p["valid"]: 106 | count += 1 107 | return count 108 | 109 | def inc_proxy_index(self, current=-1): 110 | """ 111 | 将代理列表的索引移到下一个有效代理的位置 112 | 如果发现代理列表只有fixed_proxy项有效, 重置代理列表 113 | 如果还发现已经距离上次抓代理过了指定时间, 则抓取新的代理 114 | """ 115 | assert self.proxyes[0]["valid"] 116 | if current != -1 and self.proxy_index != current: 117 | return 118 | while True: 119 | self.proxy_index = (self.proxy_index + 1) % len(self.proxyes) 120 | if self.proxyes[self.proxy_index]["valid"]: 121 | break 122 | 123 | # 两轮proxy_index==0的时间间隔过短, 说明出现了验证码抖动,扩展代理列表 124 | if self.proxy_index == 0 and datetime.now() < self.last_no_proxy_time + timedelta(minutes=2): 125 | logger.info("captcha thrashing") 126 | self.fetch_new_proxyes() 127 | 128 | if self.len_valid_proxy() <= self.fixed_proxy or self.len_valid_proxy() < self.extend_proxy_threshold: # 如果代理列表中有效的代理不足的话重置为valid 129 | self.reset_proxyes() 130 | 131 | if self.len_valid_proxy() < self.extend_proxy_threshold: # 代理数量仍然不足, 抓取新的代理 132 | logger.info("valid proxy < threshold: %d/%d" % (self.len_valid_proxy(), self.extend_proxy_threshold)) 133 | self.fetch_new_proxyes() 134 | 135 | logger.info("now using new proxy: %s" % self.proxyes[self.proxy_index]["proxy"]) 136 | 137 | # 一定时间没更新后可能出现了在目前的代理不断循环不断验证码错误的情况, 强制抓取新代理 138 | #if datetime.now() > self.last_fetch_proxy_time + 
timedelta(minutes=self.fetch_proxy_interval): 139 | # logger.info("%d munites since last fetch" % self.fetch_proxy_interval) 140 | # self.fetch_new_proxyes() 141 | 142 | def set_proxy(self, request): 143 | """ 144 | 将request设置使用为当前的或下一个有效代理 145 | """ 146 | proxy = self.proxyes[self.proxy_index] 147 | if not proxy["valid"]: 148 | self.inc_proxy_index() 149 | proxy = self.proxyes[self.proxy_index] 150 | 151 | if self.proxy_index == 0: # 每次不用代理直接下载时更新self.last_no_proxy_time 152 | self.last_no_proxy_time = datetime.now() 153 | 154 | if proxy["proxy"]: 155 | request.meta["proxy"] = proxy["proxy"] 156 | elif "proxy" in request.meta.keys(): 157 | del request.meta["proxy"] 158 | request.meta["proxy_index"] = self.proxy_index 159 | proxy["count"] += 1 160 | 161 | def invalid_proxy(self, index): 162 | """ 163 | 将index指向的proxy设置为invalid, 164 | 并调整当前proxy_index到下一个有效代理的位置 165 | """ 166 | if index < self.fixed_proxy: # 可信代理永远不会设为invalid 167 | logger.info("fixed proxy will not be invalid: %s" % self.proxyes[index]) 168 | self.inc_proxy_index(index) 169 | return 170 | 171 | if self.proxyes[index]["valid"]: 172 | logger.info("invalidate %s" % self.proxyes[index]) 173 | self.proxyes[index]["valid"] = False 174 | if index == self.proxy_index: 175 | self.inc_proxy_index() 176 | 177 | if self.proxyes[index]["count"] < self.dump_count_threshold: 178 | self.dump_valid_proxy() 179 | 180 | def dump_valid_proxy(self): 181 | """ 182 | 保存代理列表中有效的代理到文件 183 | """ 184 | if self.dump_count_threshold <= 0: 185 | return 186 | logger.info("dumping proxyes to file") 187 | with open(self.proxy_file, "w") as fd: 188 | for i in range(self.fixed_proxy, len(self.proxyes)): 189 | p = self.proxyes[i] 190 | if p["valid"] or p["count"] >= self.dump_count_threshold: 191 | fd.write(p["proxy"]+"\n") # 只保存有效的代理 192 | 193 | def process_request(self, request, spider): 194 | """ 195 | 将request设置为使用代理 196 | """ 197 | if self.proxy_index > 0 and datetime.now() > (self.last_no_proxy_time + timedelta(minutes=self.recover_interval)): 198 | logger.info("After %d minutes later, recover from using proxy" % self.recover_interval) 199 | self.last_no_proxy_time = datetime.now() 200 | self.proxy_index = 0 201 | request.meta["dont_redirect"] = True # 有些代理会把请求重定向到一个莫名其妙的地址 202 | 203 | # spider发现parse error, 要求更换代理 204 | if "change_proxy" in request.meta.keys() and request.meta["change_proxy"]: 205 | logger.info("change proxy request get by spider: %s" % request) 206 | self.invalid_proxy(request.meta["proxy_index"]) 207 | request.meta["change_proxy"] = False 208 | self.set_proxy(request) 209 | 210 | def process_response(self, request, response, spider): 211 | """ 212 | 检查response.status, 根据status是否在允许的状态码中决定是否切换到下一个proxy, 或者禁用proxy 213 | """ 214 | if "proxy" in request.meta.keys(): 215 | logger.debug("%s %s %s" % (request.meta["proxy"], response.status, request.url)) 216 | else: 217 | logger.debug("None %s %s" % (response.status, request.url)) 218 | 219 | # status不是正常的200而且不在spider声明的正常爬取过程中可能出现的 220 | # status列表中, 则认为代理无效, 切换代理 221 | if response.status != 200 \ 222 | and (not hasattr(spider, "website_possible_httpstatus_list") \ 223 | or response.status not in spider.website_possible_httpstatus_list): 224 | logger.info("response status[%d] not in spider.website_possible_httpstatus_list" % response.status) 225 | self.invalid_proxy(request.meta["proxy_index"]) 226 | new_request = request.copy() 227 | new_request.dont_filter = True 228 | return new_request 229 | else: 230 | return response 231 | 232 | def process_exception(self, request, exception, 
spider): 233 | """ 234 | 处理由于使用代理导致的连接异常 235 | """ 236 | logger.debug("%s exception: %s" % (self.proxyes[request.meta["proxy_index"]]["proxy"], exception)) 237 | request_proxy_index = request.meta["proxy_index"] 238 | 239 | # 只有当proxy_index>fixed_proxy-1时才进行比较, 这样能保证至少本地直连是存在的. 240 | if isinstance(exception, self.DONT_RETRY_ERRORS): 241 | if request_proxy_index > self.fixed_proxy - 1 and self.invalid_proxy_flag: # WARNING 直连时超时的话换个代理还是重试? 这是策略问题 242 | if self.proxyes[request_proxy_index]["count"] < self.invalid_proxy_threshold: 243 | self.invalid_proxy(request_proxy_index) 244 | elif request_proxy_index == self.proxy_index: # 虽然超时,但是如果之前一直很好用,也不设为invalid 245 | self.inc_proxy_index() 246 | else: # 简单的切换而不禁用 247 | if request.meta["proxy_index"] == self.proxy_index: 248 | self.inc_proxy_index() 249 | new_request = request.copy() 250 | new_request.dont_filter = True 251 | return new_request 252 | -------------------------------------------------------------------------------- /HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/HttpProxyMiddleware.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import logging 5 | from datetime import datetime, timedelta 6 | from twisted.web._newclient import ResponseNeverReceived 7 | from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError 8 | import fetch_free_proxyes 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class HttpProxyMiddleware(object): 13 | # 遇到这些类型的错误直接当做代理不可用处理掉, 不再传给retrymiddleware 14 | DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError, ResponseNeverReceived, ConnectError, ValueError) 15 | 16 | def __init__(self, settings): 17 | # 保存上次不用代理直接连接的时间点 18 | self.last_no_proxy_time = datetime.now() 19 | # 一定分钟数后切换回不用代理, 因为用代理影响到速度 20 | self.recover_interval = 20 21 | # 一个proxy如果没用到这个数字就被发现老是超时, 则永久移除该proxy. 设为0则不会修改代理文件. 22 | self.dump_count_threshold = 20 23 | # 存放代理列表的文件, 每行一个代理, 格式为ip:port, 注意没有http://, 而且这个文件会被修改, 注意备份 24 | self.proxy_file = "proxyes.dat" 25 | # 是否在超时的情况下禁用代理 26 | self.invalid_proxy_flag = True 27 | # 当有效代理小于这个数时(包括直连), 从网上抓取新的代理, 可以将这个数设为为了满足每个ip被要求输入验证码后得到足够休息时间所需要的代理数 28 | # 例如爬虫在十个可用代理之间切换时, 每个ip经过数分钟才再一次轮到自己, 这样就能get一些请求而不用输入验证码. 
29 | # 如果这个数过小, 例如两个, 爬虫用A ip爬了没几个就被ban, 换了一个又爬了没几次就被ban, 这样整个爬虫就会处于一种忙等待的状态, 影响效率 30 | self.extend_proxy_threshold = 10 31 | # 初始化代理列表 32 | self.proxyes = [{"proxy": None, "valid": True, "count": 0}] 33 | # 初始时使用0号代理(即无代理) 34 | self.proxy_index = 0 35 | # 表示可信代理的数量(如自己搭建的HTTP代理)+1(不用代理直接连接) 36 | self.fixed_proxy = len(self.proxyes) 37 | # 上一次抓新代理的时间 38 | self.last_fetch_proxy_time = datetime.now() 39 | # 每隔固定时间强制抓取新代理(min) 40 | self.fetch_proxy_interval = 120 41 | # 一个将被设为invalid的代理如果已经成功爬取大于这个参数的页面, 将不会被invalid 42 | self.invalid_proxy_threshold = 200 43 | # 从文件读取初始代理 44 | if os.path.exists(self.proxy_file): 45 | with open(self.proxy_file, "r") as fd: 46 | lines = fd.readlines() 47 | for line in lines: 48 | line = line.strip() 49 | if not line or self.url_in_proxyes("http://" + line): 50 | continue 51 | self.proxyes.append({"proxy": "http://" + line, 52 | "valid": True, 53 | "count": 0}) 54 | 55 | @classmethod 56 | def from_crawler(cls, crawler): 57 | return cls(crawler.settings) 58 | 59 | def url_in_proxyes(self, url): 60 | """ 61 | 返回一个代理url是否在代理列表中 62 | """ 63 | for p in self.proxyes: 64 | if url == p["proxy"]: 65 | return True 66 | return False 67 | 68 | def reset_proxyes(self): 69 | """ 70 | 将所有count>=指定阈值的代理重置为valid, 71 | """ 72 | logger.info("reset proxyes to valid") 73 | for p in self.proxyes: 74 | if p["count"] >= self.dump_count_threshold: 75 | p["valid"] = True 76 | 77 | def fetch_new_proxyes(self): 78 | """ 79 | 从网上抓取新的代理添加到代理列表中 80 | """ 81 | logger.info("extending proxyes using fetch_free_proxyes.py") 82 | new_proxyes = fetch_free_proxyes.fetch_all() 83 | logger.info("new proxyes: %s" % new_proxyes) 84 | self.last_fetch_proxy_time = datetime.now() 85 | 86 | for np in new_proxyes: 87 | if self.url_in_proxyes("http://" + np): 88 | continue 89 | else: 90 | self.proxyes.append({"proxy": "http://" + np, 91 | "valid": True, 92 | "count": 0}) 93 | if self.len_valid_proxy() < self.extend_proxy_threshold: # 如果发现抓不到什么新的代理了, 缩小threshold以避免白费功夫 94 | self.extend_proxy_threshold -= 1 95 | 96 | def len_valid_proxy(self): 97 | """ 98 | 返回proxy列表中有效的代理数量 99 | """ 100 | count = 0 101 | for p in self.proxyes: 102 | if p["valid"]: 103 | count += 1 104 | return count 105 | 106 | def inc_proxy_index(self): 107 | """ 108 | 将代理列表的索引移到下一个有效代理的位置 109 | 如果发现代理列表只有fixed_proxy项有效, 重置代理列表 110 | 如果还发现已经距离上次抓代理过了指定时间, 则抓取新的代理 111 | """ 112 | assert self.proxyes[0]["valid"] 113 | while True: 114 | self.proxy_index = (self.proxy_index + 1) % len(self.proxyes) 115 | if self.proxyes[self.proxy_index]["valid"]: 116 | break 117 | 118 | # 两轮proxy_index==0的时间间隔过短, 说明出现了验证码抖动,扩展代理列表 119 | if self.proxy_index == 0 and datetime.now() < self.last_no_proxy_time + timedelta(minutes=2): 120 | logger.info("captcha thrashing") 121 | self.fetch_new_proxyes() 122 | 123 | if self.len_valid_proxy() <= self.fixed_proxy or self.len_valid_proxy() < self.extend_proxy_threshold: # 如果代理列表中有效的代理不足的话重置为valid 124 | self.reset_proxyes() 125 | 126 | if self.len_valid_proxy() < self.extend_proxy_threshold: # 代理数量仍然不足, 抓取新的代理 127 | logger.info("valid proxy < threshold: %d/%d" % (self.len_valid_proxy(), self.extend_proxy_threshold)) 128 | self.fetch_new_proxyes() 129 | 130 | logger.info("now using new proxy: %s" % self.proxyes[self.proxy_index]["proxy"]) 131 | 132 | # 一定时间没更新后可能出现了在目前的代理不断循环不断验证码错误的情况, 强制抓取新代理 133 | #if datetime.now() > self.last_fetch_proxy_time + timedelta(minutes=self.fetch_proxy_interval): 134 | # logger.info("%d munites since last fetch" % self.fetch_proxy_interval) 135 | # self.fetch_new_proxyes() 136 | 137 | def 
set_proxy(self, request): 138 | """ 139 | 将request设置使用为当前的或下一个有效代理 140 | """ 141 | proxy = self.proxyes[self.proxy_index] 142 | if not proxy["valid"]: 143 | self.inc_proxy_index() 144 | proxy = self.proxyes[self.proxy_index] 145 | 146 | if self.proxy_index == 0: # 每次不用代理直接下载时更新self.last_no_proxy_time 147 | self.last_no_proxy_time = datetime.now() 148 | 149 | if proxy["proxy"]: 150 | request.meta["proxy"] = proxy["proxy"] 151 | elif "proxy" in request.meta.keys(): 152 | del request.meta["proxy"] 153 | request.meta["proxy_index"] = self.proxy_index 154 | proxy["count"] += 1 155 | 156 | def invalid_proxy(self, index): 157 | """ 158 | 将index指向的proxy设置为invalid, 159 | 并调整当前proxy_index到下一个有效代理的位置 160 | """ 161 | if index < self.fixed_proxy: # 可信代理永远不会设为invalid 162 | self.inc_proxy_index() 163 | return 164 | 165 | if self.proxyes[index]["valid"]: 166 | logger.info("invalidate %s" % self.proxyes[index]) 167 | self.proxyes[index]["valid"] = False 168 | if index == self.proxy_index: 169 | self.inc_proxy_index() 170 | 171 | if self.proxyes[index]["count"] < self.dump_count_threshold: 172 | self.dump_valid_proxy() 173 | 174 | def dump_valid_proxy(self): 175 | """ 176 | 保存代理列表中有效的代理到文件 177 | """ 178 | if self.dump_count_threshold <= 0: 179 | return 180 | logger.info("dumping proxyes to file") 181 | with open(self.proxy_file, "w") as fd: 182 | for i in range(self.fixed_proxy, len(self.proxyes)): 183 | p = self.proxyes[i] 184 | if p["valid"] or p["count"] >= self.dump_count_threshold: 185 | fd.write(p["proxy"][7:]+"\n") # 只保存有效的代理 186 | 187 | def process_request(self, request, spider): 188 | """ 189 | 将request设置为使用代理 190 | """ 191 | if self.proxy_index > 0 and datetime.now() > (self.last_no_proxy_time + timedelta(minutes=self.recover_interval)): 192 | logger.info("After %d minutes later, recover from using proxy" % self.recover_interval) 193 | self.last_no_proxy_time = datetime.now() 194 | self.proxy_index = 0 195 | request.meta["dont_redirect"] = True # 有些代理会把请求重定向到一个莫名其妙的地址 196 | 197 | # spider发现parse error, 要求更换代理 198 | if "change_proxy" in request.meta.keys() and request.meta["change_proxy"]: 199 | logger.info("change proxy request get by spider: %s" % request) 200 | self.invalid_proxy(request.meta["proxy_index"]) 201 | request.meta["change_proxy"] = False 202 | self.set_proxy(request) 203 | 204 | def process_response(self, request, response, spider): 205 | """ 206 | 检查response.status, 根据status是否在允许的状态码中决定是否切换到下一个proxy, 或者禁用proxy 207 | """ 208 | if "proxy" in request.meta.keys(): 209 | logger.debug("%s %s %s" % (request.meta["proxy"], response.status, request.url)) 210 | else: 211 | logger.debug("None %s %s" % (response.status, request.url)) 212 | 213 | # status不是正常的200而且不在spider声明的正常爬取过程中可能出现的 214 | # status列表中, 则认为代理无效, 切换代理 215 | if response.status != 200 \ 216 | and (not hasattr(spider, "website_possible_httpstatus_list") \ 217 | or response.status not in spider.website_possible_httpstatus_list): 218 | logger.info("response status not in spider.website_possible_httpstatus_list") 219 | self.invalid_proxy(request.meta["proxy_index"]) 220 | new_request = request.copy() 221 | new_request.dont_filter = True 222 | return new_request 223 | else: 224 | return response 225 | 226 | def process_exception(self, request, exception, spider): 227 | """ 228 | 处理由于使用代理导致的连接异常 229 | """ 230 | logger.debug("%s exception: %s" % (self.proxyes[request.meta["proxy_index"]]["proxy"], exception)) 231 | request_proxy_index = request.meta["proxy_index"] 232 | 233 | # 只有当proxy_index>fixed_proxy-1时才进行比较, 这样能保证至少本地直连是存在的. 
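# Note on the exception-handling policy implemented below: for connection
# failures listed in DONT_RETRY_ERRORS, a non-fixed proxy is invalidated
# (provided invalid_proxy_flag is set and the proxy has served fewer than
# invalid_proxy_threshold pages); fixed proxies and proxies with a good track
# record are merely rotated away from. The failed request is then re-queued
# with dont_filter=True. Exceptions outside DONT_RETRY_ERRORS fall through to
# Scrapy's RetryMiddleware.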
234 | if isinstance(exception, self.DONT_RETRY_ERRORS): 235 | if request_proxy_index > self.fixed_proxy - 1 and self.invalid_proxy_flag: # WARNING 直连时超时的话换个代理还是重试? 这是策略问题 236 | if self.proxyes[request_proxy_index]["count"] < self.invalid_proxy_threshold: 237 | self.invalid_proxy(request_proxy_index) 238 | elif request_proxy_index == self.proxy_index: # 虽然超时,但是如果之前一直很好用,也不设为invalid 239 | self.inc_proxy_index() 240 | else: # 简单的切换而不禁用 241 | if request.meta["proxy_index"] == self.proxy_index: 242 | self.inc_proxy_index() 243 | new_request = request.copy() 244 | new_request.dont_filter = True 245 | return new_request 246 | 247 | -------------------------------------------------------------------------------- /HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kohn/HttpProxyMiddleware/b087518ee7c4c7f561c62511f4f6c922a414abbd/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/__init__.py -------------------------------------------------------------------------------- /HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/fetch_free_proxyes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | from bs4 import BeautifulSoup 4 | import urllib2 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def get_html(url): 10 | request = urllib2.Request(url) 11 | request.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36") 12 | html = urllib2.urlopen(request) 13 | return html.read() 14 | 15 | def get_soup(url): 16 | soup = BeautifulSoup(get_html(url), "lxml") 17 | return soup 18 | 19 | def fetch_kxdaili(page): 20 | """ 21 | 从www.kxdaili.com抓取免费代理 22 | """ 23 | proxyes = [] 24 | try: 25 | url = "http://www.kxdaili.com/dailiip/1/%d.html" % page 26 | soup = get_soup(url) 27 | table_tag = soup.find("table", attrs={"class": "segment"}) 28 | trs = table_tag.tbody.find_all("tr") 29 | for tr in trs: 30 | tds = tr.find_all("td") 31 | ip = tds[0].text 32 | port = tds[1].text 33 | latency = tds[4].text.split(" ")[0] 34 | if float(latency) < 0.5: # 输出延迟小于0.5秒的代理 35 | proxy = "%s:%s" % (ip, port) 36 | proxyes.append(proxy) 37 | except: 38 | logger.warning("fail to fetch from kxdaili") 39 | return proxyes 40 | 41 | def img2port(img_url): 42 | """ 43 | mimvp.com的端口号用图片来显示, 本函数将图片url转为端口, 目前的临时性方法并不准确 44 | """ 45 | code = img_url.split("=")[-1] 46 | if code.find("AO0OO0O")>0: 47 | return 80 48 | else: 49 | return None 50 | 51 | def fetch_mimvp(): 52 | """ 53 | 从http://proxy.mimvp.com/free.php抓免费代理 54 | """ 55 | proxyes = [] 56 | try: 57 | url = "http://proxy.mimvp.com/free.php?proxy=in_hp" 58 | soup = get_soup(url) 59 | table = soup.find("div", attrs={"id": "list"}).table 60 | tds = table.tbody.find_all("td") 61 | for i in range(0, len(tds), 10): 62 | id = tds[i].text 63 | ip = tds[i+1].text 64 | port = img2port(tds[i+2].img["src"]) 65 | response_time = tds[i+7]["title"][:-1] 66 | transport_time = tds[i+8]["title"][:-1] 67 | if port is not None and float(response_time) < 1 : 68 | proxy = "%s:%s" % (ip, port) 69 | proxyes.append(proxy) 70 | except: 71 | logger.warning("fail to fetch from mimvp") 72 | return proxyes 73 | 74 | def fetch_xici(): 75 | """ 76 | http://www.xicidaili.com/nn/ 77 | """ 78 | proxyes = [] 79 | try: 80 | url = "http://www.xicidaili.com/nn/" 81 | soup = get_soup(url) 82 | table = 
soup.find("table", attrs={"id": "ip_list"}) 83 | trs = table.find_all("tr") 84 | for i in range(1, len(trs)): 85 | tr = trs[i] 86 | tds = tr.find_all("td") 87 | ip = tds[2].text 88 | port = tds[3].text 89 | speed = tds[7].div["title"][:-1] 90 | latency = tds[8].div["title"][:-1] 91 | if float(speed) < 3 and float(latency) < 1: 92 | proxyes.append("%s:%s" % (ip, port)) 93 | except: 94 | logger.warning("fail to fetch from xici") 95 | return proxyes 96 | 97 | def fetch_ip181(): 98 | """ 99 | http://www.ip181.com/ 100 | """ 101 | proxyes = [] 102 | try: 103 | url = "http://www.ip181.com/" 104 | soup = get_soup(url) 105 | table = soup.find("table") 106 | trs = table.find_all("tr") 107 | for i in range(1, len(trs)): 108 | tds = trs[i].find_all("td") 109 | ip = tds[0].text 110 | port = tds[1].text 111 | latency = tds[4].text[:-2] 112 | if float(latency) < 1: 113 | proxyes.append("%s:%s" % (ip, port)) 114 | except Exception as e: 115 | logger.warning("fail to fetch from ip181: %s" % e) 116 | return proxyes 117 | 118 | def fetch_httpdaili(): 119 | """ 120 | http://www.httpdaili.com/mfdl/ 121 | 更新比较频繁 122 | """ 123 | proxyes = [] 124 | try: 125 | url = "http://www.httpdaili.com/mfdl/" 126 | soup = get_soup(url) 127 | table = soup.find("div", attrs={"kb-item-wrap11"}).table 128 | trs = table.find_all("tr") 129 | for i in range(1, len(trs)): 130 | try: 131 | tds = trs[i].find_all("td") 132 | ip = tds[0].text 133 | port = tds[1].text 134 | type = tds[2].text 135 | if type == u"匿名": 136 | proxyes.append("%s:%s" % (ip, port)) 137 | except: 138 | pass 139 | except Exception as e: 140 | logger.warning("fail to fetch from httpdaili: %s" % e) 141 | return proxyes 142 | 143 | def fetch_66ip(): 144 | """ 145 | http://www.66ip.cn/ 146 | 每次打开此链接都能得到一批代理, 速度不保证 147 | """ 148 | proxyes = [] 149 | try: 150 | # 修改getnum大小可以一次获取不同数量的代理 151 | url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip" 152 | content = get_html(url) 153 | urls = content.split("")[-1].split("
") 154 | for u in urls: 155 | if u.strip(): 156 | proxyes.append(u.strip()) 157 | except Exception as e: 158 | logger.warning("fail to fetch from httpdaili: %s" % e) 159 | return proxyes 160 | 161 | 162 | 163 | def check(proxy): 164 | import urllib2 165 | url = "http://www.baidu.com/js/bdsug.js?v=1.0.3.0" 166 | proxy_handler = urllib2.ProxyHandler({'http': "http://" + proxy}) 167 | opener = urllib2.build_opener(proxy_handler,urllib2.HTTPHandler) 168 | try: 169 | response = opener.open(url,timeout=3) 170 | return response.code == 200 171 | except Exception: 172 | return False 173 | 174 | def fetch_all(endpage=2): 175 | proxyes = [] 176 | for i in range(1, endpage): 177 | proxyes += fetch_kxdaili(i) 178 | proxyes += fetch_mimvp() 179 | proxyes += fetch_xici() 180 | proxyes += fetch_ip181() 181 | proxyes += fetch_httpdaili() 182 | proxyes += fetch_66ip() 183 | valid_proxyes = [] 184 | logger.info("checking proxyes validation") 185 | for p in proxyes: 186 | if check(p): 187 | valid_proxyes.append(p) 188 | return valid_proxyes 189 | 190 | if __name__ == '__main__': 191 | import sys 192 | root_logger = logging.getLogger("") 193 | stream_handler = logging.StreamHandler(sys.stdout) 194 | formatter = logging.Formatter('%(name)-8s %(asctime)s %(levelname)-8s %(message)s', '%a, %d %b %Y %H:%M:%S',) 195 | stream_handler.setFormatter(formatter) 196 | root_logger.addHandler(stream_handler) 197 | logger = logging.getLogger(__name__) 198 | logger.setLevel(logging.DEBUG) 199 | proxyes = fetch_all() 200 | #print check("202.29.238.242:3128") 201 | for p in proxyes: 202 | print p 203 | -------------------------------------------------------------------------------- /HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class HttpproxymiddlewaretestItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class HttpproxymiddlewaretestPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/proxyes.data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kohn/HttpProxyMiddleware/b087518ee7c4c7f561c62511f4f6c922a414abbd/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/proxyes.data -------------------------------------------------------------------------------- /HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for HttpProxyMiddlewareTest project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'HttpProxyMiddlewareTest' 13 | 14 | SPIDER_MODULES = ['HttpProxyMiddlewareTest.spiders'] 15 | NEWSPIDER_MODULE = 'HttpProxyMiddlewareTest.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'HttpProxyMiddlewareTest (+http://www.yourdomain.com)' 20 | 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | #CONCURRENT_REQUESTS=32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | #DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 41 | # 'Accept-Language': 'en', 42 | #} 43 | 44 | # Enable or disable spider middlewares 45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 46 | #SPIDER_MIDDLEWARES = { 47 | # 'HttpProxyMiddlewareTest.middlewares.MyCustomSpiderMiddleware': 543, 48 | #} 49 | 50 | # Enable or disable downloader middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 52 | DOWNLOADER_MIDDLEWARES = { 53 | 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350, 54 | 'HttpProxyMiddlewareTest.HttpProxyMiddleware.HttpProxyMiddleware': 543, 55 | } 56 | 57 | # LOG_LEVEL = "INFO" 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'HttpProxyMiddlewareTest.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 74 | #AUTOTHROTTLE_ENABLED=True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY=5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY=60 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG=False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED=True 85 | #HTTPCACHE_EXPIRATION_SECS=0 86 | #HTTPCACHE_DIR='httpcache' 87 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 88 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 89 | 90 | DOWNLOAD_TIMEOUT = 10 91 | -------------------------------------------------------------------------------- 
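The settings above register the proxy middleware at priority 543, behind DownloadTimeoutMiddleware (350), and cap DOWNLOAD_TIMEOUT at 10 seconds so that dead proxies fail quickly. The top-level README additionally recommends keeping the proxy middleware after RetryMiddleware, so that its process_exception handles proxy failures before RetryMiddleware ever sees them. Below is a sketch of the combined configuration for this test project; the scrapy.downloadermiddlewares.retry path is the current location of RetryMiddleware and is an addition here, not part of the shipped settings:

#+BEGIN_SRC python
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': 351,
    # keep the proxy middleware after RetryMiddleware (a higher number runs
    # its process_exception earlier, before RetryMiddleware)
    'HttpProxyMiddlewareTest.HttpProxyMiddleware.HttpProxyMiddleware': 543,
}

DOWNLOAD_TIMEOUT = 10  # 10-15 seconds keeps bad proxies from stalling the crawl
#+END_SRC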
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/HttpProxyMiddlewareTest/spiders/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.http import Request
4 | import logging
5 |
6 | logger = logging.getLogger("test spider")
7 |
8 | class TestSpider(scrapy.Spider):
9 |     name = "test"
10 |     allowed_domains = ["103.243.24.223"]
11 |     website_possible_httpstatus_list = [403]
12 |     handle_httpstatus_list = [403]
13 |
14 |     start_urls = (
15 |         'http://103.243.24.223:8000',
16 |     )
17 |
18 |     def parse(self, response):
19 |         if response.body == "banned":
20 |             req = response.request
21 |             req.meta["change_proxy"] = True
22 |             yield req
23 |         else:
24 |             logger.info("got page: %s" % response.body)
25 |             yield response.request
26 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/README.org:
--------------------------------------------------------------------------------
1 | * HttpProxyMiddlewareTest
2 |
3 | A Scrapy project used to test [[https://github.com/kohn/HttpProxyMiddleware][HttpProxyMiddleware]].
4 |
5 | * Usage
6 |
7 | Update the HttpProxyMiddleware path in
8 | HttpProxyMiddlewareTest/settings.py.
9 |
10 | Run the command:
11 | : scrapy crawl test
12 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/proxyes.dat:
--------------------------------------------------------------------------------
1 | 111.206.190.155:80
2 | 182.140.132.107:8888
3 | 60.206.233.2:3128
4 | 61.174.10.22:8080
5 | 52.40.124.145:8083
6 | 59.173.187.179:8090
7 | 113.246.216.163:8888
8 | 117.64.150.51:8998
9 | 117.64.226.122:8998
10 |
--------------------------------------------------------------------------------
/HttpProxyMiddlewareTest/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = HttpProxyMiddlewareTest.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = HttpProxyMiddlewareTest
12 |
--------------------------------------------------------------------------------
/IPBanTest/IPBanTest/README.org:
--------------------------------------------------------------------------------
1 | * IPBanTest
2 |
3 | A Django-powered website for testing [[https://github.com/kohn/HttpProxyMiddleware][HttpProxyMiddleware]].
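For context, the view serving this site (IPBanTest/views.py, listed further below) keeps a per-IP request counter, answers with the caller's IP and count as JSON, returns HTTP 403 with the body "banned" once the counter exceeds 5, and clears all counters every 10 minutes. A minimal probe sketch follows, assuming the Django dev server is running locally (the host and port are assumptions, and you may need to add your host to ALLOWED_HOSTS in settings.py, which only lists 103.243.24.223):

#+BEGIN_SRC python
import urllib.request
import urllib.error

URL = "http://127.0.0.1:8000/"  # assumption: started via `python manage.py runserver`

for i in range(8):
    try:
        with urllib.request.urlopen(URL, timeout=3) as resp:
            # JSON body with this client's IP and its running request count
            print(i, resp.status, resp.read().decode("utf-8"))
    except urllib.error.HTTPError as e:
        # the view answers 403 "banned" once this IP exceeds its quota
        print(i, e.code, e.read().decode("utf-8"))
#+END_SRC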
4 | -------------------------------------------------------------------------------- /IPBanTest/IPBanTest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kohn/HttpProxyMiddleware/b087518ee7c4c7f561c62511f4f6c922a414abbd/IPBanTest/IPBanTest/__init__.py -------------------------------------------------------------------------------- /IPBanTest/IPBanTest/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for IPBanTest project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.9.5. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.9/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.9/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '-b-4g9o*37*@x_!uzc@ft4u6wxxegi-luwxfp)s4_f7@&s=mt%' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 26 | DEBUG = False 27 | 28 | ALLOWED_HOSTS = ["103.243.24.223"] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | ] 41 | 42 | MIDDLEWARE_CLASSES = [ 43 | 'django.middleware.cache.UpdateCacheMiddleware', 44 | 'django.middleware.common.CommonMiddleware', 45 | 'django.middleware.cache.FetchFromCacheMiddleware', 46 | 'django.middleware.security.SecurityMiddleware', 47 | 'django.contrib.sessions.middleware.SessionMiddleware', 48 | 'django.middleware.csrf.CsrfViewMiddleware', 49 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 50 | 'django.contrib.auth.middleware.SessionAuthenticationMiddleware', 51 | 'django.contrib.messages.middleware.MessageMiddleware', 52 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 53 | ] 54 | 55 | ROOT_URLCONF = 'IPBanTest.urls' 56 | 57 | TEMPLATES = [ 58 | { 59 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 60 | 'DIRS': [], 61 | 'APP_DIRS': True, 62 | 'OPTIONS': { 63 | 'context_processors': [ 64 | 'django.template.context_processors.debug', 65 | 'django.template.context_processors.request', 66 | 'django.contrib.auth.context_processors.auth', 67 | 'django.contrib.messages.context_processors.messages', 68 | ], 69 | }, 70 | }, 71 | ] 72 | 73 | WSGI_APPLICATION = 'IPBanTest.wsgi.application' 74 | 75 | CACHES = { 76 | 'default': { 77 | 'BACKEND': 'django.core.cache.backends.locmem.LocMemCache', 78 | 'LOCATION': 'unique-snowflake', 79 | } 80 | } 81 | 82 | # Database 83 | # https://docs.djangoproject.com/en/1.9/ref/settings/#databases 84 | 85 | # DATABASES = { 86 | # 'default': { 87 | # 'ENGINE': 'django.db.backends.sqlite3', 88 | # 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 89 | # } 90 | # } 91 | 92 | 93 | # Password validation 94 | # https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators 95 | 96 | AUTH_PASSWORD_VALIDATORS = [ 97 | { 98 | 'NAME': 
'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 99 | }, 100 | { 101 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 102 | }, 103 | { 104 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 105 | }, 106 | { 107 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 108 | }, 109 | ] 110 | 111 | 112 | # Internationalization 113 | # https://docs.djangoproject.com/en/1.9/topics/i18n/ 114 | 115 | LANGUAGE_CODE = 'en-us' 116 | 117 | TIME_ZONE = 'UTC' 118 | 119 | USE_I18N = True 120 | 121 | USE_L10N = True 122 | 123 | USE_TZ = True 124 | 125 | 126 | # Static files (CSS, JavaScript, Images) 127 | # https://docs.djangoproject.com/en/1.9/howto/static-files/ 128 | 129 | STATIC_URL = '/static/' 130 | 131 | DOWNLOAD_TIMEOUT = 10 132 | -------------------------------------------------------------------------------- /IPBanTest/IPBanTest/urls.py: -------------------------------------------------------------------------------- 1 | """IPBanTest URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.9/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.conf.urls import url, include 14 | 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url 17 | from IPBanTest import views 18 | 19 | urlpatterns = [ 20 | url(r'^/?$', views.index), 21 | ] 22 | -------------------------------------------------------------------------------- /IPBanTest/IPBanTest/views.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from django.http import HttpResponseForbidden 3 | from django.http import JsonResponse 4 | from django.views.decorators.cache import cache_page 5 | from threading import Timer 6 | import logging 7 | 8 | logger = logging.getLogger("IPBanTest") 9 | 10 | ip_count = {} 11 | 12 | def clear_ip(): 13 | logger.info("clear ip_count") 14 | ip_count.clear() 15 | Timer(10*60, clear_ip, ()).start() # clear evert 10min 16 | 17 | Timer(10, clear_ip, ()).start() 18 | 19 | @cache_page(0) 20 | def index(request): 21 | ip = request.META.get("REMOTE_ADDR") 22 | if ip in ip_count.keys(): 23 | ip_count[ip] += 1 24 | else: 25 | ip_count[ip] = 0 26 | 27 | message = {} 28 | if ip_count[ip] > 5: 29 | return HttpResponseForbidden("banned") 30 | else: 31 | message["ip"] = ip 32 | message["count"] = ip_count[ip] 33 | return JsonResponse(message) 34 | -------------------------------------------------------------------------------- /IPBanTest/IPBanTest/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for IPBanTest project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "IPBanTest.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/IPBanTest/db.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kohn/HttpProxyMiddleware/b087518ee7c4c7f561c62511f4f6c922a414abbd/IPBanTest/db.sqlite3
--------------------------------------------------------------------------------
/IPBanTest/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
5 | if __name__ == "__main__":
6 |     os.environ.setdefault("DJANGO_SETTINGS_MODULE", "IPBanTest.settings")
7 |
8 |     from django.core.management import execute_from_command_line
9 |
10 |     execute_from_command_line(sys.argv)
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Kohn
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
1 | * HttpProxyMiddleware
2 |
3 | A middleware for Scrapy that changes the HTTP proxy from time to time.
4 |
5 | Initial proxies are stored in a file. At runtime, the middleware
6 | fetches new proxies whenever it runs low on valid ones.
7 |
8 | Related blog: [[http://www.kohn.com.cn/wordpress/?p=208]]
9 |
10 |
11 | ** fetch_free_proxyes.py
12 | Used to fetch free proxies from the Internet. It can be modified to
13 | suit your own needs.
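It can also be run on its own to pre-seed the proxy file before a crawl. A minimal sketch, assuming the script sits next to proxyes.dat (the file name HttpProxyMiddleware.py reads by default):

#+BEGIN_SRC python
import fetch_free_proxyes

# fetch_all() already checks each candidate against a known URL before returning it
proxies = fetch_free_proxyes.fetch_all(https=False)
with open("proxyes.dat", "w") as fd:
    for p in proxies:              # entries such as http://ip:port
        fd.write(p + "\n")
print("saved %d proxies" % len(proxies))
#+END_SRC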
14 |
15 | ** Usage
16 |
17 | *** settings.py
18 |
19 | #+BEGIN_SRC python
20 | DOWNLOADER_MIDDLEWARES = {
21 |     'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
22 |     'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 351,
23 |     # put this middleware after RetryMiddleware
24 |     'crawler.middleware.HttpProxyMiddleware': 999,
25 | }
26 |
27 | DOWNLOAD_TIMEOUT = 10  # 10-15 seconds is a reasonable timeout in practice
28 | #+END_SRC
29 |
30 | *** change proxy
31 |
32 | Often we want to switch to a new proxy when our spider gets banned.
33 | Detect that your IP has been banned and yield a new Request from your
34 | Spider.parse method with:
35 |
36 | #+BEGIN_SRC python
37 | request.meta["change_proxy"] = True
38 | #+END_SRC
39 |
40 | Some proxies return invalid HTML. If you hit any exception while
41 | parsing a response, also yield a new request with:
42 |
43 | #+BEGIN_SRC python
44 | request.meta["change_proxy"] = True
45 | #+END_SRC
46 |
47 |
48 | *** spider.py
49 |
50 | Your spider should declare a list of status codes that it may
51 | legitimately encounter during crawling. Any status code that is
52 | neither 200 nor in this list is treated as a sign of an invalid proxy,
53 | and that proxy is discarded. For example:
54 |
55 | #+BEGIN_SRC python
56 | website_possible_httpstatus_list = [404]
57 | #+END_SRC
58 |
59 | This line tells the middleware that the website you are crawling may
60 | legitimately return a response with status code 404, so the proxy
61 | used for that request should not be discarded.
62 |
63 |
64 | ** Test
65 |
66 | Update the HttpProxyMiddleware.py path in
67 | HttpProxyMiddlewareTest/settings.py.
68 |
69 |
70 | #+BEGIN_SRC sh
71 | cd HttpProxyMiddlewareTest
72 | scrapy crawl test
73 | #+END_SRC
74 |
75 |
76 | The testing server is hosted on my VPS, so take it easy... DO NOT
77 | waste too much of my data plan.
78 |
79 | You may start your own testing server using IPBanTest, which is
80 | powered by Django.
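For reference, here is a minimal spider sketch combining the change_proxy flag and website_possible_httpstatus_list described above; the domain, start URL and the "banned" body marker are placeholders for whatever your target site actually returns:

#+BEGIN_SRC python
import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"
    # status codes the site may legitimately return; any other non-200
    # status makes the middleware drop the current proxy
    website_possible_httpstatus_list = [404]
    handle_httpstatus_list = [404]
    start_urls = ["http://example.com/"]

    def parse(self, response):
        if b"banned" in response.body:       # site-specific ban marker
            req = response.request
            req.meta["change_proxy"] = True  # ask the middleware for a new proxy
            yield req
            return
        # ... normal parsing / item extraction goes here ...
#+END_SRC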
81 | -------------------------------------------------------------------------------- /fetch_free_proxyes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | from bs4 import BeautifulSoup 4 | import urllib.request, urllib.error, urllib.parse 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | def get_html(url): 10 | request = urllib.request.Request(url) 11 | request.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36") 12 | html = urllib.request.urlopen(request) 13 | return html.read() 14 | 15 | def get_soup(url): 16 | soup = BeautifulSoup(get_html(url), "lxml") 17 | return soup 18 | 19 | def fetch_kxdaili(page, https): 20 | """ 21 | 从www.kxdaili.com抓取免费代理 22 | """ 23 | proxyes = [] 24 | try: 25 | url = "http://www.kxdaili.com/dailiip/1/%d.html" % page 26 | soup = get_soup(url) 27 | table_tag = soup.find("table", attrs={"class": "segment"}) 28 | trs = table_tag.tbody.find_all("tr") 29 | for tr in trs: 30 | tds = tr.find_all("td") 31 | ip = tds[0].text 32 | port = tds[1].text 33 | types = tds[2].text 34 | if https and "HTTPS" not in types: 35 | continue 36 | latency = tds[4].text.split(" ")[0] 37 | if float(latency) < 0.5: # 输出延迟小于0.5秒的代理 38 | if https: 39 | proxy = "https://%s:%s" % (ip, port) 40 | else: 41 | proxy = "http://%s:%s" % (ip, port) 42 | proxyes.append(proxy) 43 | except Exception as e: 44 | logger.warning(e) 45 | logger.warning("fail to fetch from kxdaili") 46 | return proxyes 47 | 48 | def img2port(img_url): 49 | """ 50 | mimvp.com的端口号用图片来显示, 本函数将图片url转为端口, 目前的临时性方法并不准确 51 | """ 52 | code = img_url.split("=")[-1] 53 | if code.find("AO0OO0O")>0: 54 | return 80 55 | else: 56 | return None 57 | 58 | def fetch_mimvp(https): 59 | """ 60 | 从http://proxy.mimvp.com/free.php抓免费代理 61 | """ 62 | proxyes = [] 63 | if https: 64 | return proxyes 65 | try: 66 | url = "http://proxy.mimvp.com/free.php?proxy=in_hp" 67 | soup = get_soup(url) 68 | table = soup.find("div", attrs={"id": "list"}).table 69 | tds = table.tbody.find_all("td") 70 | for i in range(0, len(tds), 10): 71 | id = tds[i].text 72 | ip = tds[i+1].text 73 | port = img2port(tds[i+2].img["src"]) 74 | response_time = tds[i+7]["title"][:-1] 75 | transport_time = tds[i+8]["title"][:-1] 76 | if port is not None and float(response_time) < 1 : 77 | proxy = "%s:%s" % (ip, port) 78 | proxyes.append(proxy) 79 | except: 80 | logger.warning("fail to fetch from mimvp") 81 | return proxyes 82 | 83 | def fetch_xici(https): 84 | """ 85 | http://www.xicidaili.com/nn/ 86 | """ 87 | proxyes = [] 88 | try: 89 | url = "http://www.xicidaili.com/nn/" 90 | soup = get_soup(url) 91 | table = soup.find("table", attrs={"id": "ip_list"}) 92 | trs = table.find_all("tr") 93 | for i in range(1, len(trs)): 94 | tr = trs[i] 95 | tds = tr.find_all("td") 96 | ip = tds[1].text 97 | port = tds[2].text 98 | if https and tds[5].text.strip()!="HTTPS": 99 | continue 100 | speed = tds[6].div["title"][:-1] 101 | latency = tds[7].div["title"][:-1] 102 | if float(speed) < 3 and float(latency) < 1: 103 | if https: 104 | proxyes.append("https://%s:%s" % (ip, port)) 105 | else: 106 | proxyes.append("http://%s:%s" % (ip, port)) 107 | except: 108 | logger.warning("fail to fetch from xici") 109 | return proxyes 110 | 111 | def fetch_ip181(https): 112 | """ 113 | http://www.ip181.com/ 114 | """ 115 | proxyes = [] 116 | try: 117 | url = "http://www.ip181.com/" 118 | soup = 
get_soup(url) 119 | table = soup.find("table") 120 | trs = table.find_all("tr") 121 | for i in range(1, len(trs)): 122 | tds = trs[i].find_all("td") 123 | ip = tds[0].text 124 | port = tds[1].text 125 | if https and "HTTPS" not in tds[3].text: 126 | continue 127 | latency = tds[4].text[:-2] 128 | if float(latency) < 1: 129 | if https: 130 | proxyes.append("https://%s:%s" % (ip, port)) 131 | else: 132 | proxyes.append("http://%s:%s" % (ip, port)) 133 | except Exception as e: 134 | logger.warning("fail to fetch from ip181: %s" % e) 135 | return proxyes 136 | 137 | def fetch_httpdaili(https): 138 | """ 139 | http://www.httpdaili.com/mfdl/ 140 | 更新比较频繁 141 | """ 142 | proxyes = [] 143 | if https: 144 | return proxyes 145 | try: 146 | url = "http://www.httpdaili.com/mfdl/" 147 | soup = get_soup(url) 148 | table = soup.find("div", attrs={"kb-item-wrap11"}).table 149 | trs = table.find_all("tr") 150 | for i in range(1, len(trs)): 151 | try: 152 | tds = trs[i].find_all("td") 153 | ip = tds[0].text 154 | port = tds[1].text 155 | type = tds[2].text 156 | if type == "匿名": 157 | proxyes.append("%s:%s" % (ip, port)) 158 | except: 159 | pass 160 | except Exception as e: 161 | logger.warning("fail to fetch from httpdaili: %s" % e) 162 | return proxyes 163 | 164 | def fetch_66ip(https): 165 | """ 166 | http://www.66ip.cn/ 167 | 每次打开此链接都能得到一批代理, 速度不保证 168 | """ 169 | proxyes = [] 170 | try: 171 | # 修改getnum大小可以一次获取不同数量的代理 172 | if https: 173 | url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=1&api=66ip" 174 | else: 175 | url = "http://www.66ip.cn/nmtq.php?getnum=10&isp=0&anonymoustype=3&start=&ports=&export=&ipaddress=&area=1&proxytype=0&api=66ip" 176 | content = get_html(url) 177 | content = str(content) 178 | urls = content.split("")[1].split("")[0].split("
") 179 | for u in urls: 180 | u = u.split("\\t")[-1] 181 | if u.strip(): 182 | if https: 183 | proxyes.append("https://" + u.strip()) 184 | else: 185 | proxyes.append("http://" + u.strip()) 186 | 187 | except Exception as e: 188 | logger.warning("fail to fetch from 66ip: %s" % e) 189 | return proxyes 190 | 191 | 192 | 193 | def check(proxy): 194 | import urllib.request, urllib.error, urllib.parse 195 | 196 | if proxy.startswith("https"): 197 | url = "https://www.baidu.com/js/bdsug.js?v=1.0.3.0" 198 | proxy_handler = urllib.request.ProxyHandler({'https': proxy}) 199 | else: 200 | url = "http://www.baidu.com/js/bdsug.js?v=1.0.3.0" 201 | proxy_handler = urllib.request.ProxyHandler({'http': proxy}) 202 | opener = urllib.request.build_opener(proxy_handler,urllib.request.HTTPHandler) 203 | try: 204 | response = opener.open(url, timeout=3) 205 | return response.code == 200 and response.url == url 206 | except Exception: 207 | return False 208 | 209 | def fetch_all(endpage=2, https=False): 210 | proxyes = [] 211 | for i in range(1, endpage): 212 | proxyes += fetch_kxdaili(i, https=https) 213 | proxyes += fetch_mimvp(https) 214 | proxyes += fetch_xici(https) 215 | proxyes += fetch_ip181(https) 216 | proxyes += fetch_httpdaili(https) 217 | proxyes += fetch_66ip(https) 218 | valid_proxyes = [] 219 | logger.info("checking proxyes validation") 220 | for p in proxyes: 221 | if check(p): 222 | valid_proxyes.append(p) 223 | return valid_proxyes 224 | 225 | if __name__ == '__main__': 226 | import sys 227 | root_logger = logging.getLogger("") 228 | stream_handler = logging.StreamHandler(sys.stdout) 229 | formatter = logging.Formatter('%(name)-8s %(asctime)s %(levelname)-8s %(message)s', '%a, %d %b %Y %H:%M:%S',) 230 | stream_handler.setFormatter(formatter) 231 | root_logger.addHandler(stream_handler) 232 | logger = logging.getLogger(__name__) 233 | logger.setLevel(logging.DEBUG) 234 | proxyes = fetch_66ip(https=True) 235 | #print check("202.29.238.242:3128") 236 | for p in proxyes: 237 | print(p) 238 | --------------------------------------------------------------------------------