├── README.md └── cyrequest ├── __init__.py ├── _pycurlToRe.py └── cyRequest.py /README.md: -------------------------------------------------------------------------------- 1 | # cyrequest文档 2 | cyrequest可以实现自动试错,代理池管理的同步和异步请求的包,可以让我们能够更加简单的实现大批量请求 3 | 4 | 5 | 6 | 7 | ## 发送同步GET请求(requests发送) 8 | ```python 9 | from cyrequest import cyRequest 10 | req = cyRequest.get("http://www.baidu.com") 11 | req.response.encoding = 'utf-8' 12 | print(req.response.text) 13 | ``` 14 | 15 | ## 通过requests的session发送get请求 16 | ```python 17 | from cyrequest import cyRequest 18 | import requests 19 | s = requests.session() 20 | req = cyRequest.get("http://www.baidu.com", session=s) 21 | req.response.encoding = 'utf-8' 22 | print(req.response.text) 23 | ``` 24 | 25 | 26 | ## 发送同步GET请求并设置失败的callback 27 | ```python 28 | from cyrequest import cyRequest 29 | 30 | 31 | def errFun(a): 32 | # 输出报错 33 | print(a.error) 34 | # 输出请求的详细信息 35 | print(a.requestData) 36 | 37 | 38 | req = cyRequest.get("http://www.baidu.com", errback=errFun) 39 | req.response.encoding = 'utf-8' 40 | print(req.response.text) 41 | ``` 42 | 43 | ## 发送同步GET请求并设置重试次数 44 | ```python 45 | from cyrequest import cyRequest 46 | 47 | def errFun(a): 48 | # 输出报错 49 | print(a.error) 50 | # 输出请求的详细信息 51 | print(a.requestData) 52 | 53 | req = cyRequest.get("http://www.baidu.com", 54 | # 重试3次后失败的回调 55 | errback=errFun, 56 | # 设置重试3次, 3秒超时 57 | errNum=3, timeout=3) 58 | req.response.encoding = 'utf-8' 59 | print(req.response.text) 60 | 61 | ## 发送同步GET请求并设置自定义报错 62 | from cyrequest import cyRequest 63 | 64 | def errFun(a): 65 | print(a.error) 66 | print(a.requestData) 67 | 68 | def err(a): 69 | a.response.encoding = 'utf-8' 70 | if a.response.text.find("IP访问频繁稍后再试") != -1: 71 | # 判断返回的数据在是否包含该字符串然后返回自定义报错 72 | return "IP被封" 73 | # 返回True表示不报错,这样子就不会重试请求 74 | return True 75 | 76 | req = cyRequest.get("http://www.baidu.com", 77 | # 设置代理 78 | proxies={"http":"http://127.0.0.1:8888"}, 79 | timeout=3, errback=errFun, errNum=3, errfun=err) 80 | 
req.response.encoding = 'utf-8' 81 | print(req.response.text) 82 | ``` 83 | 84 | ## 发送同步POST请求 85 | ```python 86 | from cyrequest import cyRequest 87 | 88 | req = cyRequest.post("http://www.baidu.com", data={"cbb":"yhh"}) 89 | req.response.encoding = 'utf-8' 90 | print(req.response.text) 91 | 92 | ``` 93 | 94 | ## 通过requests的session发送POST请求 95 | ```python 96 | from cyrequest import cyRequest 97 | import requests 98 | s = requests.session() 99 | req = cyRequest.post("http://www.baidu.com", data={"cbb":"yhh"}, session=s) 100 | req.response.encoding = 'utf-8' 101 | print(req.response.text) 102 | ``` 103 | 104 | ## 发送同步POST请求并设置失败的callback 105 | ```python 106 | from cyrequest import cyRequest 107 | 108 | def errFun(a): 109 | print(a.error) 110 | print(a.requestData) 111 | 112 | req = cyRequest.post("http://www.baidu.com", data={"cbb":"yhh"}, errback=errFun) 113 | req.response.encoding = 'utf-8' 114 | print(req.response.text) 115 | ``` 116 | 117 | 118 | ## 发送同步POST请求并设置重试次数 119 | ```python 120 | from cyrequest import cyRequest 121 | 122 | def errFun(a): 123 | print(a.error) 124 | print(a.requestData) 125 | 126 | req = cyRequest.post("http://www.baidu.com", data={"cbb":"yhh"}, errback=errFun, errNum=3) 127 | req.response.encoding = 'utf-8' 128 | print(req.response.text) 129 | ``` 130 | 131 | ## 发送同步POST请求并设置自定义报错 132 | ```python 133 | from cyrequest import cyRequest 134 | 135 | def err(a): 136 | a.response.encoding = 'utf-8' 137 | if a.response.text.find("IP访问频繁稍后再试") != -1: 138 | # 判断返回的数据在是否包含该字符串然后返回自定义报错 139 | return "IP被封" 140 | # 返回True表示不报错,这样子就不会重试请求 141 | return True 142 | 143 | def errFun(a): 144 | print(a.error) 145 | print(a.requestData) 146 | 147 | req = cyRequest.post("http://www.baidu.com", data={"cbb":"yhh"}, errback=errFun,errfun=err, errNum=3) 148 | req.response.encoding = 'utf-8' 149 | print(req.response.text) 150 | ``` 151 | 152 | ## 发送异步get请求(无返回值) 153 | ```python 154 | from cyrequest import cyRequest 155 | 156 | def sucess(a): 157 | 
print(a.response.text) 158 | print(a.requestData) 159 | print(a.id) 160 | 161 | s = cyRequest.cyRequest() 162 | 163 | # 发送请求 164 | s.get("http://www.baidu.com", 165 | # 设置请求成功后的回调 166 | callback=sucess, 167 | # 可以携带请求的一些信息,可传递到callback 168 | id="我是百度首页的请求" 169 | ) 170 | # 发送请求 171 | s.get("https://cn.bing.com/", 172 | # 设置请求成功后的回调 173 | callback=sucess, 174 | # 可以携带请求的一些信息,可传递到callback 175 | id="我是必应首页的请求") 176 | # 异步发送上面2个请求 177 | s.advance() 178 | ``` 179 | 180 | ## 发送异步post请求(无返回值) 181 | ```python 182 | from cyrequest import cyRequest 183 | 184 | 185 | def sucess(a): 186 | print(a.response.text) 187 | print(a.requestData) 188 | print(a.id) 189 | 190 | 191 | s = cyRequest.cyRequest() 192 | 193 | 194 | # 发送请求 195 | s.post("http://www.baidu.com", 196 | # 设置请求成功后的回调 197 | callback=sucess,data={"cbb":"yhh"}, 198 | # 可以携带请求的一些信息,可传递到callback 199 | id="我是百度首页的请求" 200 | ) 201 | # 发送请求 202 | s.post("https://cn.bing.com/",data={"cbb":"yhh"}, 203 | # 设置请求成功后的回调 204 | callback=sucess, 205 | # 可以携带请求的一些信息,可传递到callback 206 | id="我是必应首页的请求") 207 | # 异步发送上面2个请求 208 | s.advance() 209 | ``` 210 | 211 | ## 发送异步get请求(有返回值) 212 | ```python 213 | from cyrequest import cyRequest 214 | 215 | 216 | def sucess(a): 217 | print(a.response.text) 218 | print(a.requestData) 219 | print(a.id) 220 | 221 | 222 | s = cyRequest.cyRequest() 223 | 224 | 225 | # 发送请求 226 | s.get("http://www.baidu.com", 227 | # 设置请求成功后的回调 228 | callback=sucess, 229 | # 可以携带请求的一些信息,可传递到callback 230 | id="我是百度首页的请求" 231 | ) 232 | # 发送请求 233 | s.get("https://cn.bing.com/", 234 | # 设置请求成功后的回调,可以不设置 235 | callback=sucess, 236 | # 可以携带请求的一些信息,可传递到callback,可以不设置 237 | id="我是必应首页的请求") 238 | # 异步发送上面2个请求 239 | # 并且有返回值 240 | for i in s.adyield(): 241 | print(i.id) 242 | print(i.requestData) 243 | ``` 244 | 245 | ## 发送异步post请求(有返回值) 246 | ```python 247 | from cyrequest import cyRequest 248 | 249 | def sucess(a): 250 | print(a.response.text) 251 | print(a.requestData) 252 | print(a.id) 253 | 254 | s = cyRequest.cyRequest() 255 | 256 
| # 发送请求 257 | s.post("http://www.baidu.com", 258 | # 设置请求成功后的回调 259 | callback=sucess,data={"cbb":"yhh"}, 260 | # 可以携带请求的一些信息,可传递到callback 261 | id="我是百度首页的请求" 262 | ) 263 | # 发送请求 264 | s.post("https://cn.bing.com/",data={"cbb":"yhh"}, 265 | # 设置请求成功后的回调,可以不设置 266 | callback=sucess, 267 | # 可以携带请求的一些信息,可传递到callback,可以不设置 268 | id="我是必应首页的请求" 269 | ) 270 | # 异步发送上面2个请求 271 | # 并且有返回值 272 | for i in s.adyield(): 273 | print(i.id) 274 | print(i.requestData) 275 | ``` 276 | 277 | ## 发送异步请求设置重试次数 278 | ```python 279 | from cyrequest import cyRequest 280 | 281 | def sucess(a): 282 | print(a.response.text) 283 | print(a.requestData) 284 | print(a.id) 285 | # 重试10次 286 | s = cyRequest.cyRequest(errNum=10) 287 | 288 | # 发送get请求 289 | s.get("http://www.baidu.com", 290 | # 设置请求成功后的回调 291 | callback=sucess, 292 | # 可以携带请求的一些信息,可传递到callback 293 | id="我是百度首页的请求" 294 | ) 295 | # 发送post请求 296 | s.post("https://cn.bing.com/",data={"cbb":"yhh"}, 297 | # 设置请求成功后的回调,可以不设置 298 | callback=sucess, 299 | # 可以携带请求的一些信息,可传递到callback,可以不设置 300 | id="我是必应首页的请求") 301 | # 异步发送上面2个请求 302 | # 并且有返回值 303 | for i in s.adyield(): 304 | print(i.id) 305 | print(i.requestData) 306 | ``` 307 | 308 | ## 发送异步请求设置最大异步数量 309 | ```python 310 | from cyrequest import cyRequest 311 | 312 | 313 | def sucess(a): 314 | print(a.response.text) 315 | print(a.requestData) 316 | print(a.id) 317 | # 重试10次, 最大异步数量为10 318 | s = cyRequest.cyRequest(errNum=10,max_workers=10) 319 | 320 | 321 | # 发送get请求 322 | s.get("http://www.baidu.com", 323 | # 设置请求成功后的回调 324 | callback=sucess, 325 | # 可以携带请求的一些信息,可传递到callback 326 | id="我是百度首页的请求" 327 | ) 328 | # 发送post请求 329 | s.post("https://cn.bing.com/",data={"cbb":"yhh"}, 330 | # 设置请求成功后的回调,可以不设置 331 | callback=sucess, 332 | # 可以携带请求的一些信息,可传递到callback,可以不设置 333 | id="我是必应首页的请求") 334 | # 异步发送上面2个请求 335 | # 并且有返回值 336 | for i in s.adyield(): 337 | print(i.id) 338 | print(i.requestData) 339 | ``` 340 | 341 | ## 通过requests的session发送异步请求 342 | ```python 343 | from cyrequest import 
cyRequest 344 | import requests 345 | session = requests.session() 346 | def sucess(a): 347 | print(a.response.text) 348 | print(a.requestData) 349 | print(a.id) 350 | # 重试10次, 最大异步数量为10 351 | s = cyRequest.cyRequest(errNum=10,max_workers=10,session=session) 352 | 353 | # 发送get请求 354 | s.get("http://www.baidu.com", 355 | # 设置请求成功后的回调 356 | callback=sucess, 357 | # 可以携带请求的一些信息,可传递到callback 358 | id="我是百度首页的请求" 359 | ) 360 | # 发送post请求 361 | s.post("https://cn.bing.com/",data={"cbb":"yhh"}, 362 | # 设置请求成功后的回调,可以不设置 363 | callback=sucess, 364 | # 可以携带请求的一些信息,可传递到callback,可以不设置 365 | id="我是必应首页的请求") 366 | # 异步发送上面2个请求 367 | # 并且有返回值 368 | for i in s.adyield(): 369 | print(i.id) 370 | print(i.requestData) 371 | ``` 372 | 373 | ## 代理池管理 374 | 同步代理设置和异步代理设置和requests的设置类似 375 | ```python 376 | # 可以为这个实例下所有的请求设置代理,也可以单独设置 377 | s = cyRequest.cyRequest(errNum=10,max_workers=10,session=session, 378 | proxies={"http":"http://127.0.0.1:8888"}) 379 | cyrequest有代理池管理模块,该模块可以自动管理每个代理可同时被多少个request请求使用,错误多少次后删除并记录该代理 380 | ## 异步使用代理池管理 381 | from cyrequest import cyRequest 382 | 383 | def sucess(a): 384 | print(a.response.text) 385 | print(a.requestData) 386 | print(a.id) 387 | # 创建一个代理池管理模块 388 | proxiesList = cyRequest.cyProxy( 389 | # 初始化代理池 390 | proxyList=[ 391 | "https://192.168.32.22:8090", 392 | # http代理 393 | "http://127.0.0.1:888", 394 | # https代理 395 | 396 | ], 397 | # 最大有3个请求共同使用这个代理(默认3个) 398 | max=3, 399 | # 这个代理最大错误几次后会被舍弃 400 | maxerr=3 401 | ) 402 | 403 | s = cyRequest.cyRequest(max_workers=10, errNum=20) 404 | 405 | # 发送请求 406 | s.get("http://www.baidu.com", 407 | # 设置请求成功后的回调 408 | callback=sucess, 409 | # 可以携带请求的一些信息,可传递到callback 410 | id="我是百度首页的请求", 411 | # 设置代理 412 | proxies=proxiesList, 413 | 414 | timeout=4 415 | ) 416 | 417 | for i in s.adyield(): 418 | print(i.id) 419 | print(i.requestData) 420 | 421 | 422 | ``` 423 | ## 异步使用代理池管理,代理池自动获取代理(当代理数量不足会自动调用函数获取代理) 424 | ```python 425 | from cyrequest import cyRequest 426 | 427 | def sucess(a): 428 | 
print(a.response.text) 429 | print(a.requestData) 430 | print(a.id) 431 | 432 | def getProxies(): 433 | req = cyRequest.get("http://101.35.218.236:5010/get") 434 | # 下面返回 101.36.33.33:8080 435 | text = req.response.text 436 | # 设置https代理 437 | proxy = "https://"+text 438 | 439 | # 设置http代理 440 | # proxy = "http://" + text 441 | 442 | # 返回一个代理给线程池(可以返回多个) 443 | return [proxy] 444 | 445 | proxiesList = cyRequest.cyProxy( 446 | # 设置获取代理的函数 447 | getProxy=getProxies, 448 | # 最大有3个请求共同使用这个代理(默认3个) 449 | max=3, 450 | # 这个代理最大错误几次后会被舍弃 451 | maxerr=3 452 | ) 453 | 454 | s = cyRequest.cyRequest(max_workers=10, errNum=20) 455 | 456 | # 发送请求 457 | s.get("http://www.baidu.com", 458 | # 设置请求成功后的回调 459 | callback=sucess, 460 | # 可以携带请求的一些信息,可传递到callback 461 | id="我是百度首页的请求", 462 | # 设置代理 463 | proxies=proxiesList, 464 | 465 | timeout=4 466 | ) 467 | 468 | for i in s.adyield(): 469 | print(i.id) 470 | print(i.requestData) 471 | ``` 472 | 473 | ## 还有更多模块正在开发哦,有什么意见可加q 2833844911 或者邮箱发给我哦(2833844911@qq.com) 474 | 作者:陈不不 475 | b站:[https://space.bilibili.com/227452348](https://space.bilibili.com/227452348?spm_id_from=333.1007.0.0) 476 | -------------------------------------------------------------------------------- /cyrequest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/2833844911/cyrequest/78a248c1bf9b1e30433018ea4d4ca23d1889f8b4/cyrequest/__init__.py -------------------------------------------------------------------------------- /cyrequest/_pycurlToRe.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /cyrequest/cyRequest.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # !/usr/bin/python3 3 | ''' 4 | 该库用于异步与同步请求 5 | 能自动试错 6 | 7 | ''' 8 | import random 9 | 10 | from requests_futures.sessions import 
FuturesSession
import requests
import re
import time
from concurrent.futures import FIRST_COMPLETED, wait
from queue import Queue

# Pause used by the asynchronous loop when requests are queued but none could
# be started (for example because a shared proxy pool has no free slot).
_IDLE_DELAY = 0.01


def _is_pool(proxies):
    """Return True when *proxies* is a proxy pool rather than a requests-style mapping."""
    return proxies is not None and not isinstance(proxies, dict)


def _proxy_mapping(proxy):
    """Turn a pool entry such as ``"https://host:port"`` into a requests ``proxies`` dict.

    An entry whose text starts with ``https`` is registered for both http and
    https traffic, with its own scheme rewritten to ``http``; any other entry
    is registered for http traffic only.
    """
    if proxy[:5] != "https":
        return {"http": proxy}
    address = "http" + proxy[5:]
    return {"http": address, "https": address}


def _pool_key(mapping):
    """Recover the pool entry that ``_proxy_mapping`` built *mapping* from."""
    if "https" in mapping:
        return "https" + mapping["https"][4:]
    if "http" in mapping:
        return mapping["http"]
    return None


class _request:
    """Successful result handed to callbacks and returned to callers."""

    def __init__(self, id, response, request, data, errNum):
        self.id = id
        self.response = response
        # The object that sent the request (a cyRequest or a requests session).
        self.request = request
        # Dict describing the request ("AT", "url", "headers", ...).
        self.requestData = data
        self.errNum = errNum


class _error:
    """Failure description handed to ``errback`` handlers."""

    def __init__(self, id, error, request, data, errNum):
        self.id = id
        self.error = error
        # The object that sent the request (a cyRequest or a requests session).
        self.request = request
        # Dict describing the request ("AT", "url", "headers", ...).
        self.requestData = data
        self.errNum = errNum


class cyRequest:
    """Queue of GET/POST requests executed concurrently with automatic retries.

    Requests are queued with ``get``/``post`` and executed by ``advance``
    (callbacks only) or ``adyield`` (callbacks plus a generator of results).
    """

    def __init__(self, headers=None, cookie=None, verify=True, proxies=None, errback=None, errNum=1, max_workers=8,
                 session=None):
        '''
        :param headers: headers assigned to the internal FuturesSession
        :param cookie: cookies (dict or cookie jar) merged into the internal FuturesSession
        :param verify: whether TLS certificates are verified
        :param proxies: default proxies (requests-style dict or a proxy pool such as cyProxy)
        :param errback: handler called with an ``_error`` once a request has used up its attempts
        :param errNum: maximum number of attempts per request
        :param max_workers: maximum number of requests in flight at the same time
        :param session: optional ``requests`` session handed to FuturesSession
        '''
        # In-flight entries: [future, callback, id, errback, requestData, errNum, errfun, proxies].
        self.responeList = []
        # Queued entries: ["GET" | "POST", *positional arguments of _get / _post].
        self.requestList = []
        self.errNum = errNum
        self.asnum = max_workers
        self.session = FuturesSession(max_workers=max_workers, session=session)
        if verify is not None:
            self.session.verify = verify
        self.proxies = proxies if proxies is not None else {}
        if headers is not None:
            self.session.headers = headers
        if cookie is not None:
            # requests reads the ``cookies`` attribute; the original assigned
            # ``cookie``, which requests never looks at.
            self.session.cookies.update(cookie)
        # The original default was ``_errback`` (four arguments), which failed
        # with a TypeError when invoked as a one-argument final handler.
        self.errback = errback if errback is not None else self._raise_error

    def _raise_error(self, e):
        """Default final-failure handler: surface the last error to the caller."""
        raise Exception(e.error)

    def _errback(self, e, callback, err, errfun):
        """Queue the failed request described by *e* again, keeping its attempt count."""
        described = e.requestData
        if described['AT'] == "GET":
            self.get(described['url'], headers=described['headers'], verify=described['verify'],
                     proxies=described['proxies'], params=described['params'], id=e.id, callback=callback,
                     timeout=described['timeout'], allow_redirects=described['allow_redirects'],
                     errback=err, _errNum=e.errNum, errfun=errfun)
        elif described['AT'] == "POST":
            self.post(described['url'], data=described['data'], json=described['json'],
                      headers=described['headers'], verify=described['verify'], proxies=described['proxies'],
                      id=e.id, callback=callback, timeout=described['timeout'],
                      allow_redirects=described['allow_redirects'], errback=err, _errNum=e.errNum,
                      errfun=errfun)

    def _resolve_proxies(self, proxies):
        """Fall back to the instance-wide proxies when none (or an empty dict) was given."""
        if proxies is None or (isinstance(proxies, dict) and not proxies):
            return self.proxies
        return proxies

    def get(self, url=None, headers=None, verify=True, proxies=None, params=None, id=None, callback=None, timeout=20,
            allow_redirects=True, errback=None, _errNum=0, errfun=None):
        """Queue a GET request; it is sent by ``advance`` or ``adyield``.

        :param proxies: requests-style dict or a proxy pool; ``None``/``{}`` uses the instance default
        :param id: arbitrary value exposed as ``.id`` on the result / error object
        :param callback: called with the ``_request`` result on success
        :param errback: called with an ``_error`` once all attempts failed
        :param _errNum: attempts already used (internal, set when a request is re-queued)
        :param errfun: validator called with the result; any return value other than ``True``
            is treated as an error and triggers a retry
        """
        self.requestList.append(
            ["GET", url, headers, params, verify, self._resolve_proxies(proxies), id, callback, timeout,
             allow_redirects, errback, _errNum, errfun])

    def _submit(self, queued, send, url, proxies, id, callback, errback, errNum, errfun, options):
        """Start one request, or put *queued* back when the proxy pool has no free slot."""
        if _is_pool(proxies):
            proxy = proxies.get()
            if proxy is None:
                self.requestList.append(queued)
                return
            proxies_ = _proxy_mapping(proxy)
        else:
            proxies_ = {} if proxies is None else proxies
        req = send(url, proxies=proxies_, **options)
        requestData = {"AT": queued[0], "url": url}
        requestData.update(options)
        requestData["proxies"] = proxies_
        self.responeList.append([req, callback, id, errback, requestData, errNum, errfun, proxies])

    def _get(self, url=None, headers=None, params=None, verify=True, proxies=None, id=None, callback=None, timeout=20,
             allow_redirects=True, errback=None, errNum=None, errfun=None):
        """Send a queued GET request through the futures session."""
        queued = ["GET", url, headers, params, verify, proxies, id, callback, timeout, allow_redirects, errback,
                  errNum, errfun]
        # ``params`` is forwarded here; the original always sent ``params=None``.
        options = {"headers": headers, "params": params, "verify": verify, "timeout": timeout,
                   "allow_redirects": allow_redirects}
        self._submit(queued, self.session.get, url, proxies, id, callback, errback, errNum, errfun, options)

    def post(self, url=None, data=None, json=None, headers=None, verify=True, proxies=None, id=None, callback=None,
             timeout=20, allow_redirects=True, errback=None, _errNum=0, errfun=None):
        """Queue a POST request; parameters mirror ``get`` plus ``data`` and ``json``."""
        self.requestList.append(
            ["POST", url, data, json, headers, verify, self._resolve_proxies(proxies), id, callback, timeout,
             allow_redirects, errback, _errNum, errfun])

    def _post(self, url=None, data=None, json=None, headers=None, verify=None, proxies=None, id=None, callback=None,
              timeout=20, allow_redirects=True, errback=None, errNum=None, errfun=None):
        """Send a queued POST request through the futures session."""
        queued = ["POST", url, data, json, headers, verify, proxies, id, callback, timeout, allow_redirects,
                  errback, errNum, errfun]
        options = {"data": data, "json": json, "headers": headers, "verify": verify, "timeout": timeout,
                   "allow_redirects": allow_redirects}
        self._submit(queued, self.session.post, url, proxies, id, callback, errback, errNum, errfun, options)

    def _pushRequest(self):
        """Start queued requests until ``max_workers`` requests are in flight."""
        for _ in range(self.asnum - len(self.responeList)):
            if not self.requestList:
                break
            queued = self.requestList.pop(0)
            if queued[0] == "GET":
                self._get(*queued[1:])
            elif queued[0] == "POST":
                self._post(*queued[1:])

    def _settle(self, req, callback, id, err, requestData, errNum, errfun, proxies):
        """Handle one finished future.

        Returns the ``_request`` result on success. On failure the request is
        queued again while attempts remain, otherwise the error handler is
        called; ``None`` is returned in both failure cases.
        """
        pool = proxies if _is_pool(proxies) else None
        proxy = _pool_key(requestData['proxies']) if pool is not None else None
        try:
            data = _request(id, req.result(), self, requestData, errNum)
            if errfun is not None:
                verdict = errfun(data)
                # Only an explicit True means "response is fine".
                if verdict != True:
                    raise Exception(verdict)
            if callback is not None:
                callback(data)
            if pool is not None:
                # Released after the callback so that a failing callback is
                # accounted for exactly once (by ``puterr`` below).
                pool.updata(proxy)
            return data
        except Exception as e:
            if pool is not None:
                pool.puterr(proxy)
                # A retry must draw a fresh proxy from the pool.
                requestData['proxies'] = proxies
            errNum += 1
            failure = _error(id, e, self, requestData, errNum)
            if self.errNum > errNum:
                self._errback(failure, callback, err, errfun)
            elif err is None:
                self.errback(failure)
            else:
                err(failure)
            return None

    def _drain(self):
        """Run every queued request, yielding each successful ``_request``."""
        while self.responeList or self.requestList:
            self._pushRequest()
            if not self.responeList:
                # Nothing could be started; avoid spinning at full speed.
                time.sleep(_IDLE_DELAY)
                continue
            # Block until at least one request finished instead of busy-polling.
            wait([entry[0] for entry in self.responeList], return_when=FIRST_COMPLETED)
            index = 0
            while index < len(self.responeList):
                entry = self.responeList[index]
                if not entry[0].done():
                    index += 1
                    continue
                # Remove before handling so an exception raised by a handler
                # cannot leave an already-processed entry behind.
                del self.responeList[index]
                data = self._settle(*entry)
                if data is not None:
                    yield data

    def advance(self):
        """Send all queued requests and block until every one has been handled."""
        for _ in self._drain():
            pass

    def adyield(self):
        """Send all queued requests, yielding each successful ``_request`` as it completes."""
        yield from self._drain()


class cyProxy:
    """Proxy pool limiting concurrent use per proxy and discarding failing proxies."""

    def __init__(self, getProxy=None, proxyList=None, max=3, maxerr=2):
        '''
        :param getProxy: function returning a list of proxy strings, called whenever the pool refills
        :param proxyList: initial list of proxy strings (copied, the caller's list is left untouched)
        :param max: maximum number of requests using one proxy at the same time
        :param maxerr: number of errors after which a proxy is discarded
        '''
        # proxy -> [free slots, error count]
        self._proxies = {}
        self._badproxy = set()
        self.maxerr = maxerr
        self._max = max
        self._getproxy = getProxy
        self._proxyList = list(proxyList) if proxyList else []
        # Holds one token per slot that is ready to be handed out.
        self._Qproxy = Queue(maxsize=max * 5)
        self._putProxyDict()
        self._putProxy()

    def _putProxyDict(self):
        """Register pending proxies; raise when the pool is completely exhausted."""
        if self._getproxy is not None:
            self._proxyList.extend(self._getproxy() or [])
        if not self._proxyList:
            if self._Qproxy.qsize() == 0 and not self._proxies:
                raise Exception("代理不够了")
            return
        while self._proxyList:
            proxy = self._proxyList.pop()
            # Skip discarded proxies, and do not reset the counters of a proxy
            # that is already active.
            if proxy in self._badproxy or proxy in self._proxies:
                continue
            self._proxies[proxy] = [self._max, 0]

    def _putProxy(self):
        """Move free slots into the token queue, one per proxy per round."""
        while not self._Qproxy.full():
            progressed = False
            for proxy, state in self._proxies.items():
                if self._Qproxy.full():
                    break
                if state[0] > 0:
                    self._Qproxy.put(proxy)
                    state[0] -= 1
                    progressed = True
            if not progressed:
                break

    def get(self):
        """Return a proxy string, or ``None`` when every slot is currently in use."""
        while True:
            if self._Qproxy.empty():
                self._putProxyDict()
                self._putProxy()
                if self._Qproxy.empty():
                    return None
            proxy = self._Qproxy.get()
            # Tokens of proxies discarded in the meantime are dropped.
            if proxy in self._proxies:
                break
        if self._Qproxy.qsize() < 4:
            self._putProxyDict()
            self._putProxy()
        return proxy

    def updata(self, proxy):
        """Release the slot of *proxy* after a successful request."""
        if proxy not in self._proxies:
            return
        self._proxies[proxy][0] += 1

    def puterr(self, proxies):
        """Record a failure of the proxy string *proxies*; discard it after ``maxerr`` errors."""
        state = self._proxies.get(proxies)
        if state is None:
            return
        state[1] += 1
        if state[1] >= self.maxerr:
            del self._proxies[proxies]
            self._badproxy.add(proxies)
        else:
            # The failed request no longer occupies its slot; without this the
            # pool runs dry whenever ``maxerr`` exceeds ``max``.
            state[0] += 1


def _sync_request(method, url, proxies, errback, errNum, session, errfun, **options):
    """Send one blocking request with up to *errNum* attempts.

    Returns a ``_request`` on success. After the last failed attempt *errback*
    is called with an ``_error`` (and ``None`` is returned), or an
    ``Exception`` wrapping the last error is raised when no errback was given.
    """
    pool = proxies if _is_pool(proxies) else None
    proxy = None
    if pool is None:
        proxies_ = {} if proxies is None else proxies
    else:
        proxy = pool.get()
        if proxy is None:
            raise Exception("代理不够了")
        proxies_ = _proxy_mapping(proxy)
    requestData = {"AT": method, "url": url}
    requestData.update(options)
    requestData["proxies"] = proxies_
    if session is None:
        session = requests.session()
    send = session.get if method == "GET" else session.post
    last_error = None
    for attempt in range(errNum):
        try:
            response = send(url=url, proxies=proxies_, **options)
            result = _request(url, response, session, requestData, errNum)
            if errfun is not None:
                verdict = errfun(result)
                # Only an explicit True means "response is fine".
                if verdict != True:
                    raise Exception(verdict)
            if pool is not None:
                pool.updata(proxy)
            return result
        except Exception as exc:
            last_error = exc
            if pool is None:
                continue
            pool.puterr(proxy)
            if attempt + 1 >= errNum:
                # No attempt left: do not take another proxy out of the pool.
                break
            proxy = pool.get()
            if proxy is None:
                break
            proxies_ = _proxy_mapping(proxy)
            requestData["proxies"] = proxies_

    if errback is not None:
        errback(_error(url, last_error, session, requestData, errNum))
    else:
        raise Exception(last_error) from last_error


def get(url=None, headers=None, verify=True, params=None, proxies=None, timeout=20, allow_redirects=True,
        errback=None, errNum=1, session=None, errfun=None):
    """Send a blocking GET request with automatic retries.

    :param proxies: requests-style dict or a proxy pool such as cyProxy
    :param errback: called with an ``_error`` after the last failed attempt
    :param errNum: maximum number of attempts
    :param session: optional ``requests`` session to send through
    :param errfun: validator called with the result; any return value other than ``True``
        is treated as an error and triggers a retry
    :return: ``_request`` on success, ``None`` when *errback* handled the failure
    """
    return _sync_request("GET", url, proxies, errback, errNum, session, errfun,
                         headers=headers, verify=verify, params=params, timeout=timeout,
                         allow_redirects=allow_redirects)


def post(url=None, data=None, json=None, headers=None, verify=True, proxies=None, timeout=20, allow_redirects=True,
         errback=None, errNum=1, session=None, errfun=None):
    """Send a blocking POST request with automatic retries; see ``get`` for the shared parameters."""
    return _sync_request("POST", url, proxies, errback, errNum, session, errfun,
                         data=data, json=json, headers=headers, verify=verify, timeout=timeout,
                         allow_redirects=allow_redirects)


def cleanString(s):
    '''Collapse every run of whitespace in *s* into a single space.'''
    return re.sub(r'\s+', ' ', s)
def getListdata_(data, key, listData):
    """Search every dict or list contained in the list *data* for *key*.

    Matches are appended to *listData*. Lists nested directly inside lists are
    searched as well (the original skipped them).
    """
    for item in data:
        if isinstance(item, (dict, list)):
            getDictdata_(item, key, listData)


def getDictdata_(data, key, listData):
    """Recursively append to *listData* every value stored under *key* in *data*.

    Values found deeper inside a value are appended before the value itself.
    """
    if isinstance(data, list):
        getListdata_(data, key, listData)
    elif isinstance(data, dict):
        for k, v in data.items():
            if isinstance(v, (dict, list)):
                getDictdata_(v, key, listData)
            if k == key:
                listData.append(v)


def getDictdata(data, key):
    '''
    Collect every value stored under a key in a nested dict/list structure.

    :param data: dict (or list) to search
    :param key: key whose values are collected
    :return: list of all matching values, possibly empty
    '''
    myList = []
    getDictdata_(data, key, myList)
    return myList


def timeTogs(datetime):
    """Convert ``"Y-M-D[ h:m:s]"`` or ``"Y/M/D[ h:m:s]"`` into a digit string.

    Every component shorter than two characters is left-padded with one
    ``"0"``, e.g. ``"2021-1-5 3:4:5"`` becomes ``"20210105030405"``.
    """
    def pad(part):
        return part if len(part) >= 2 else '0' + part

    sp = '-' if datetime.find('-') != -1 else '/'
    date = datetime.split(' ')
    s = ''.join(pad(part) for part in date[0].split(sp))
    if len(date) == 1:
        return s
    return s + ''.join(pad(part) for part in date[1].split(':'))