├── .dockerignore
├── .gitignore
├── .travis.yml
├── CICD
│   ├── Scripts
│   │   └── deploy.sh
│   ├── id_rsa.enc
│   └── ssh_config
├── Config
│   └── Config.ini.default
├── Data
│   └── 17monipdb.datx
├── Docker
│   ├── Dockerfile
│   └── docker-compose.yml
├── Docs
│   ├── Design.md
│   └── images
│       ├── 2019-06-12-22-11-21.png
│       ├── 2019-06-12-22-13-10.png
│       ├── 2019-06-12-22-17-34.png
│       ├── 2019-06-12-22-22-46.png
│       ├── 2019-06-15-08-18-36.png
│       ├── 2019-06-15-10-35-29.png
│       └── 2019-06-15-13-18-47.png
├── LICENSE
├── README.md
├── Src
│   ├── Config
│   │   ├── ConfigManager.py
│   │   └── __init__.py
│   ├── DB
│   │   ├── DbClient.py
│   │   ├── MongodbClient.py
│   │   └── __init__.py
│   ├── Fetcher
│   │   ├── FetcherManager.py
│   │   ├── __init__.py
│   │   └── fetchers
│   │       ├── 66ip.py
│   │       ├── __init__.py
│   │       ├── cn-proxy.py
│   │       ├── coderbusy.py
│   │       ├── data5u.py
│   │       ├── goubanjia.py
│   │       ├── ip181.py
│   │       ├── ip3366.py
│   │       ├── iphai.py
│   │       ├── jiangxianli.py
│   │       ├── kuaidaili.py
│   │       ├── mimiip.py
│   │       ├── proxy-list.py
│   │       ├── proxylistplus.py
│   │       ├── xdaili.py
│   │       └── xicidaili.py
│   ├── Forward
│   │   ├── ForwardManager.py
│   │   ├── __init__.py
│   │   └── base.py
│   ├── Log
│   │   ├── LogHandler.py
│   │   ├── LogManager.py
│   │   └── __init__.py
│   ├── Manager
│   │   ├── ProxyClean.py
│   │   ├── ProxyFetch.py
│   │   ├── ProxyManager.py
│   │   ├── ProxyVerify.py
│   │   └── __init__.py
│   ├── Notify
│   │   ├── NotifyManager.py
│   │   └── __init__.py
│   ├── ProxyGetter
│   │   ├── CheckProxy.py
│   │   ├── __init__.py
│   │   └── getFreeProxy.py
│   ├── Run
│   │   ├── __init__.py
│   │   └── main.py
│   ├── Schedule
│   │   ├── ProxyCleanSchedule.py
│   │   ├── ProxyFetchSchedule.py
│   │   ├── ProxySchedule.py
│   │   ├── ProxyVerifySchedule.py
│   │   └── __init__.py
│   ├── Util
│   │   ├── EnvUtil.py
│   │   ├── GetConfig.py
│   │   ├── WebRequest.py
│   │   ├── __init__.py
│   │   ├── utilClass.py
│   │   └── utilFunction.py
│   ├── Version
│   │   ├── VersionManger.py
│   │   ├── __init__.py
│   │   └── version
│   │       ├── __init__.py
│   │       └── version_1_0_0.py
│   └── Web
│       ├── WebManager.py
│       ├── __init__.py
│       ├── admin
│       │   ├── __init__.py
│       │   ├── admin.py
│       │   ├── forms.py
│       │   ├── model.py
│       │   └── views.py
│       ├── api
│       │   ├── __init__.py
│       │   └── api.py
│       ├── config.py
│       └── templates
│           ├── admin
│           │   ├── index.html
│           │   └── master_base.html
│           ├── index.html
│           └── security
│               ├── _macros.html
│               ├── _menu.html
│               ├── _messages.html
│               ├── login_user.html
│               └── register_user.html
├── Test
│   ├── .pytest_cache
│   │   └── v
│   │       └── cache
│   │           ├── lastfailed
│   │           └── nodeids
│   ├── __init__.py
│   ├── testGetConfig.py
│   ├── testGetFreeProxy.py
│   ├── testLogHandler.py
│   └── testWebRequest.py
├── _config.yml
├── requirements.txt
├── test.py
└── version

/.dockerignore:
--------------------------------------------------------------------------------
1 | Config/Config.ini
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.pyc
3 | *.log
4 | .vscode/
5 | site-packages/
6 | 
7 | Config\.ini
8 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | os:
2 |   - linux
3 | 
4 | services:
5 |   - docker
6 | 
7 | before_install:
8 |   - openssl aes-256-cbc -K $encrypted_42099b4af021_key -iv $encrypted_42099b4af021_iv -in CICD/id_rsa.enc -out ~/.ssh/id_rsa -d
9 |   - chmod 600 ~/.ssh/id_rsa
10 |   - cp CICD/ssh_config ~/.ssh/
11 | 
12 | script:
13 |   - docker build -t 1again/proxy_pool -f Docker/Dockerfile .
14 | 
15 | after_success:
16 |   - echo $DOCKER_1AGAIN_PASSWORD | docker login -u 1again --password-stdin
17 |   - docker push 1again/proxy_pool
18 | 
19 | deploy:
20 |   provider: script
21 |   script: bash CICD/Scripts/deploy.sh
22 |   on:
23 |     branch: develop
--------------------------------------------------------------------------------
/CICD/Scripts/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | 
3 | echo "======== Start Pull Image ========"
4 | echo
5 | ssh -o StrictHostKeyChecking=no root@$DEPLOY_SSH_HOST -p$DEPLOY_SSH_PORT "docker pull 1again/proxy_pool"
6 | echo
7 | 
8 | echo "======== Start Update Code ========"
9 | echo
10 | ssh -o StrictHostKeyChecking=no root@$DEPLOY_SSH_HOST -p$DEPLOY_SSH_PORT "cd $WORKDIR && git pull"
11 | echo
12 | 
13 | echo "======== Start Update Container ========"
14 | echo
15 | ssh -o StrictHostKeyChecking=no root@$DEPLOY_SSH_HOST -p$DEPLOY_SSH_PORT "cd $WORKDIR && docker-compose -f Docker/docker-compose.yml up -d"
16 | echo
--------------------------------------------------------------------------------
/CICD/id_rsa.enc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/CICD/id_rsa.enc
--------------------------------------------------------------------------------
/CICD/ssh_config:
--------------------------------------------------------------------------------
1 | Host *
2 |     User root
3 |     StrictHostKeyChecking no
4 |     IdentityFile ~/.ssh/id_rsa
5 |     IdentitiesOnly yes
--------------------------------------------------------------------------------
/Config/Config.ini.default:
--------------------------------------------------------------------------------
1 | [DB]
2 | ; Configure the database information
3 | db_type = MONGODB
4 | db_host = proxy_pool_db
5 | db_port = 27017
6 | db_name = proxy
7 | ; user = your_username (Only Mongodb)
8 | ; pass = your_password
9 | 
10 | [LOG]
11 | log_level = INFO
12 | 
13 | [BIND]
14 | web_bind_host = 0.0.0.0
15 | web_bind_port = 35050
16 | 
17 | forward_bind_host = 0.0.0.0
18 | forward_bind_port = 36050
19 | 
--------------------------------------------------------------------------------
/Data/17monipdb.datx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Data/17monipdb.datx
--------------------------------------------------------------------------------
/Docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 | WORKDIR /usr/src/app
3 | COPY . .
4 | 
5 | ENV DEBIAN_FRONTEND noninteractive
6 | ENV TZ Asia/Shanghai
7 | 
8 | RUN pip install -r requirements.txt
9 | 
10 | EXPOSE 35050
11 | EXPOSE 36050
12 | 
13 | CMD [ "python", "Src/Run/main.py" ]
14 | 
--------------------------------------------------------------------------------
/Docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 |   proxy_pool:
4 |     ports:
5 |       - "35050:35050"
6 |       - "36050:36050"
7 |     image: "1again/proxy_pool"
8 |   proxy_pool_db:
9 |     volumes:
10 |       - /data/mongodb/:/data/
11 |     ports:
12 |       - "27017:27017"
13 |     image: "mongo"
--------------------------------------------------------------------------------
/Docs/Design.md:
--------------------------------------------------------------------------------
1 | # Design Goals
2 | 
3 | High quality, high flexibility
4 | 
5 | Our HTTP/HTTPS proxies are collected from the public internet,
6 | 
7 | so by nature they are unstable.
8 | 
9 | We therefore need continuous verification to judge whether a proxy is reliable.
10 | 
11 | # High Quality
12 | 
13 | The question is: how do we define "high quality"?
14 | 
15 | At first we defined high quality as `proxy availability = successful verifications / total verifications`.
16 | 
17 | But this reasoning is not rigorous. An example:
18 | 
19 | Proxy A was verified 500 times over the past month, 486 of them successfully, so its availability = 486/500 = 97.2%.
20 | 
21 | That looks decent. But there is a problem: suppose the proxy suddenly dies.
22 | 
23 | Its availability then declines only slowly: 486/501 = 97%, 486/502 = 96.81%, 486/503 = 96.62%...
24 | 
25 | By the numbers it still looks like a highly available proxy, yet it no longer works at all.
26 | 
27 | So we need an extra piece of information to decide whether a "high quality" proxy is actually usable.
28 | 
29 | That extra information is the proxy's `status of its most recent verification`.
30 | 
31 | So we filter for high-quality proxies using both `availability` and `whether the last verification succeeded`.
32 | 
33 | # High Flexibility
34 | 
35 | How do we define "high flexibility"? It is harder to pin down.
36 | 
37 | Look at it from another angle: as a proxy pool, how can we serve clients better?
38 | 
39 | The initial design exposed the service through a RESTful API.
40 | 
41 | But that approach is invasive to client code, and the experience is poor.
42 | 
43 | So we can additionally build a dynamic proxy:
44 | 
45 | client -> dynamic proxy -> ordinary proxy -> web server
46 | 
47 | Inside this dynamic proxy we can filter proxies based on the collected statistics, and feed the results back into the pool.
48 | 
49 | # Verifying Proxies
50 | 
51 | Implementation is not the problem; the crux is scale.
52 | 
53 | When the pool holds a very large number of proxies, it triggers a huge amount of verification, most of which is wasted.
54 | 
55 | Borrowing the idea behind the `least recently used` algorithm:
56 | 
57 | if a proxy was unavailable most of the time before, the probability that it stays unavailable is very high.
58 | 
59 | But if a proxy was available most of the time before, the probability that it stays available is actually `unknown`.
60 | 
61 | So we need not keep verifying proxies that `never work`, while we should keep verifying proxies that `keep working`.
62 | 
63 | The key question is: how often should we verify?
64 | 
65 | "Unavailable most of the time" means, put differently, that failed verifications make up a large share.
66 | 
67 | "Available most of the time" means that successful verifications make up a large share.
68 | 
69 | So we can tie the `verification interval` to `successful verifications / failed verifications`.
70 | 
71 | The idea is sound, but implementing it that way is a bit cumbersome.
72 | 
73 | Instead, we can use one extra value to represent the proxy's quality. That extra value is a score:
74 | 
75 | after each verification, the proxy gets +1 point on success and -1 point on failure.
76 | 
77 | Finally, multiplying the `proxy's score` by a `constant base interval` reduces how often the `never working` proxies get verified.
78 | 
79 | Proxies that `keep working` are simply verified at the `constant base interval`.
80 | 
81 | This eliminates a large share of useless verifications.
82 | 
83 | In one sentence: write off the hopeless proxies, and keep a close eye on the ones that perform well.
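84 | 
85 | Below is a minimal sketch of this scheduling rule (illustrative only, not the
86 | project's actual code; the real logic lives in `Src/Manager/ProxyVerify.py` and
87 | the `Src/Schedule/` modules, and `BASE_INTERVAL` here is an assumed constant):
88 | 
89 | ```python
90 | import time
91 | 
92 | BASE_INTERVAL = 30 * 60  # assumption: verify well-behaved proxies every 30 minutes
93 | 
94 | def next_verify_time(score, now=None):
95 |     """Schedule a proxy's next verification from its +1/-1 score.
96 | 
97 |     Proxies with a non-negative score (mostly succeeding) are re-verified
98 |     at the base interval; the more negative the score, the longer the
99 |     proxy waits, so hopeless proxies are checked less and less often.
100 |     """
101 |     now = now if now is not None else int(time.time())
102 |     if score >= 0:
103 |         delay = BASE_INTERVAL
104 |     else:
105 |         # multiply the score's magnitude by the constant base interval
106 |         delay = abs(score) * BASE_INTERVAL
107 |     return now + delay
108 | 
109 | # a proxy that keeps succeeding vs. one that failed its last 5 checks
110 | assert next_verify_time(12, now=0) == 1800
111 | assert next_verify_time(-5, now=0) == 9000
112 | ```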
--------------------------------------------------------------------------------
/Docs/images/2019-06-12-22-11-21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-12-22-11-21.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-12-22-13-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-12-22-13-10.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-12-22-17-34.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-12-22-17-34.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-12-22-22-46.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-12-22-22-46.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-15-08-18-36.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-15-08-18-36.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-15-10-35-29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-15-10-35-29.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-15-13-18-47.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-15-13-18-47.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 J_hao104
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # Current Status
3 | 
4 | No longer maintained (no new features), for the following reasons:
5 | 
6 | 1. I have no personal use case, so what I build may not fit real-world scenarios.
7 | 2. The code structure is messy. Because this was forked from another project, the codebase is loosely organized, which is torture for anyone with a code-cleanliness obsession.
8 | 3. The use cases are unsavory: whether proxied crawling or anything else, none of it does society much good. Technology itself is innocent; human intentions are hard to judge.
9 | 
10 | For these reasons, no new features will be developed. Bugs will still be fixed. For learning and exchange only.
11 | 
12 | # Introduction
13 | 
14 | A high-quality, highly flexible open proxy pool service.
15 | 
16 | Possibly `the world's first` proxy pool service with a built-in `smart dynamic proxy`.
17 | 
18 | Now that the boast is out there, there is no walking it back.
19 | 
20 | [ProxyPool Demo](http://proxy.1again.cc:35050/api/v1/proxy/) (It is just a demo, don't expect it to be particularly stable!)
21 | 
22 | ---
23 | 
24 | # Features
25 | 
26 | Our goals are `high quality` and `high flexibility`.
27 | 
28 | Every feature is built around those two points:
29 | 
30 | 1. Every proxy carries a verification `count` and `score`; successful verifications / total verifications == proxy availability (database view)
31 | 
32 | ![](Docs/images/2019-06-12-22-11-21.png)
33 | 
34 | 2. Dynamic proxy support (manually bolded for emphasis)
35 | 
36 | ```
37 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
38 | {
39 |   "origin": "183.82.32.56"
40 | }
41 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
42 | {
43 |   "origin": "200.149.19.170"
44 | }
45 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
46 | {
47 |   "origin": "125.21.43.82"
48 | }
49 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
50 | {
51 |   "origin": "110.52.235.124"
52 | }
53 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
54 | {
55 |   "origin": "176.74.134.6"
56 | }
57 | ```
58 | 
59 | 3. When requesting a proxy you can filter by `https` support, by transparent vs. (ordinary) anonymous (`type`), and by the proxy's `region`. Examples:
60 | 
61 | ```
62 | # Get a proxy that supports https
63 | http://proxy.1again.cc:35050/api/v1/proxy/?https=1
64 | 
65 | # Get an anonymous proxy
66 | http://proxy.1again.cc:35050/api/v1/proxy/?type=2
67 | 
68 | # Get a proxy located in China (中国)
69 | http://proxy.1again.cc:35050/api/v1/proxy/?region=中国
70 | 
71 | # Get a proxy located outside China
72 | http://proxy.1again.cc:35050/api/v1/proxy/?region=!中国
73 | 
74 | # Get an https-capable, anonymous proxy located in China
75 | http://proxy.1again.cc:35050/api/v1/proxy/?https=1&type=2&region=中国
76 | ```
77 | 
78 | 4. [Web admin interface](http://proxy.1again.cc:35050/admin) username: admin password: admin (look, but don't touch!)
79 | 
80 | ![](Docs/images/2019-06-15-08-18-36.png)
81 | 
82 | 5. Parameters can be configured through the web interface.
83 | 
84 | ![](Docs/images/2019-06-15-13-18-47.png)
85 | 
86 | 6. Web management of the `fetcher sites` that proxies are scraped from
87 | 
88 | ![](Docs/images/2019-06-12-22-22-46.png)
89 | 
90 | 7. `gevent` concurrency support; it really delivers. Don't take the ad's word for it, check the results:
91 | 
92 | ```
93 | 2019-06-13 10:00:26,656 ProxyFetch.py[line:103] INFO fetch [ xicidaili ] proxy finish, total:400, succ:65, fail:0, skip:335, elapsed_time:1s
94 | 2019-06-13 10:00:26,662 ProxyFetch.py[line:103] INFO fetch [ proxylistplus ] proxy finish, total:0, succ:0, fail:0, skip:0, elapsed_time:1s
95 | 2019-06-13 10:00:27,179 ProxyFetch.py[line:103] INFO fetch [ iphai ] proxy finish, total:83, succ:17, fail:0, skip:66, elapsed_time:2s
96 | 2019-06-13 10:00:27,374 ProxyFetch.py[line:103] INFO fetch [ 66ip ] proxy finish, total:0, succ:0, fail:0, skip:0, elapsed_time:2s
97 | 2019-06-13 10:00:32,276 ProxyFetch.py[line:103] INFO fetch [ ip3366 ] proxy finish, total:15, succ:0, fail:0, skip:15, elapsed_time:7s
98 | 2019-06-13 10:00:33,888 ProxyFetch.py[line:103] INFO fetch [ ip181 ] proxy finish, total:0, succ:0, fail:0, skip:0, elapsed_time:8s
99 | 2019-06-13 10:00:34,978 ProxyFetch.py[line:103] INFO fetch [ mimiip ] proxy finish, total:0, succ:0, fail:0, skip:0, elapsed_time:9s
100 | 2019-06-13 10:00:38,182 ProxyFetch.py[line:103] INFO fetch [ proxy-list ] proxy finish, total:28, succ:28, fail:0, skip:0, elapsed_time:13s
101 | 2019-06-13 10:01:36,432 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:636, succ:327, fail:309, elapsed_time:58s
102 | 2019-06-13 10:31:15,800 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:481, succ:299, fail:182, elapsed_time:37s
103 | 2019-06-13 11:01:37,569 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:639, succ:315, fail:324, elapsed_time:59s
104 | 2019-06-13 11:31:54,798 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:977, succ:342, fail:635, elapsed_time:76s
105 | 2019-06-13 12:01:21,659 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:608, succ:314, fail:294, elapsed_time:43s
106 | ```
107 | 
108 | 8. I have run out of things to hype; if you can do better, send a PR!
109 | 
110 | # Documentation
111 | 
112 | [Design document](Docs/Design.md)
113 | 
114 | # Current State
115 | 
116 | The original design goals are basically met; documentation and code optimization are next.
117 | 
118 | # Use Cases
119 | 
120 | 1. Mainly crawlers.
121 | 
122 | 2. A company that needs an internal proxy pool service for some thoroughly unconscionable deeds.
123 | 
124 | 3. An individual who needs it for things best kept out of sight.
125 | 
126 | # Installation / Deployment
127 | 
128 | ## Production
129 | 
130 | ```shell
131 | # Install Docker
132 | curl -sSL https://get.docker.com | sh
133 | 
134 | # start mongo database
135 | docker run -d --name mongo -v /data/mongodb:/data -p 27017:27017 mongo
136 | 
137 | # Start proxy_pool container
138 | docker run -d --name proxy_pool --link mongo:proxy_pool_db -p 35050:35050 -p 36050:36050 1again/proxy_pool
139 | ```
140 | 
141 | ## Development
142 | 
143 | ```shell
144 | # Clone Repo
145 | git clone https://github.com/1again/ProxyPool
146 | 
147 | # Enter the directory
148 | cd ProxyPool
149 | 
150 | # Install Docker
151 | curl -sSL https://get.docker.com | sh
152 | 
153 | # start mongo database
154 | docker run -d --name mongo -v /data/mongodb:/data -p 27017:27017 mongo
155 | 
156 | # Start proxy_pool container
157 | docker run -it --rm --link mongo:proxy_pool_db -v $(pwd):/usr/src/app -p 35050:35050 -p 36050:36050 1again/proxy_pool
158 | ```
159 | 
160 | # Usage
161 | 
162 | A few minutes after startup you will see the fetched proxy IPs; you can view them directly in the web admin interface.
163 | 
164 | ## DYNAMIC PROXY
165 | 
166 | ```shell
167 | curl -x 'your_server_ip:36050' your_access_url
168 | 
169 | like this:
170 | curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
171 | ```
172 | 
173 | ## RESTFUL API
174 | 
175 | ```python
176 | 
177 | API_LIST = {
178 |     "/api/v1/proxy/": {
179 |         "args": {
180 |             "https": {
181 |                 "value": [1],
182 |                 "desc": "need https proxy? 1 == true",
1 == true", 183 | "required": False, 184 | }, 185 | "region": { 186 | "value": "region name like 中国 or 广州 or 江苏", 187 | "desc": "Get Region Proxy", 188 | "required": False, 189 | }, 190 | "type": { 191 | "value": [1,2], 192 | "desc": "clear proxy 1 or (common) anonymous 2", 193 | "required": False, 194 | } 195 | }, 196 | "desc": "Get A Random Proxy" 197 | }, 198 | "/api/v1/proxies/": { 199 | "args": { 200 | "https": { 201 | "value": [1], 202 | "desc": "need https proxy? 1 == true", 203 | "required": False, 204 | }, 205 | "region": { 206 | "value": "region name like 中国 or 广州 or 江苏", 207 | "desc": "Get Region Proxy", 208 | "required": False, 209 | }, 210 | "type": { 211 | "value": [1,2], 212 | "desc": "clear proxy 1 or (common) anonymous 2", 213 | "required": False, 214 | } 215 | }, 216 | "desc": "Get All Proxy", 217 | }, 218 | } 219 | 220 | ``` 221 | 222 | ## 扩展代理 223 | 224 | 项目默认包含几个免费的代理获取方法 225 | 226 | 如果遇到好的免费代理渠道, 可以自行添加其他代理获取的方法. 227 | 228 | 添加一个新的代理获取方法如下: 229 | 230 | 首先在`Src/Fetcher/fetchers/`目录中添加你的代理类. 231 | 232 | 该类需要有一个`run`方法, 以生成器(yield)形式返回`host:ip`格式的代理,例如: 233 | 234 | ```python 235 | 236 | # 文件名任意, 一般建议与`fetcher_host`的中间部分保持一致方便识别 237 | # Class名, 固定为`CustomFetcher` 238 | class CustomFetcher(): 239 | # 只用来识别的, 会映射到数据库里面 240 | fetcher_host = "www.66ip.cn" 241 | 242 | def run(self): 243 | url_list = [ 244 | 'http://www.xxx.com/', 245 | ] 246 | for url in url_list: 247 | html_tree = getHtmlTree(url) 248 | ul_list = html_tree.xpath('//ul[@class="l2"]') 249 | for ul in ul_list: 250 | try: 251 | yield ':'.join(ul.xpath('.//li/text()')[0:2]) 252 | except Exception as e: 253 | print(e) 254 | ``` 255 | 256 | `ProxyFetchSchedule` 会每隔一段时间抓取一次代理,下次抓取时会自动识别调用你定义的方法。 257 | 258 | # Contributing 259 | 260 | 感谢你的支持, 让我们变得更好! 261 | 262 | 为了规范和清晰, 我们需要一起做些简单约定. 263 | 264 | 两个主要的分支 265 | develop 为下个版本的内容 266 | master 为当前稳定版本的内容 267 | 268 | 1. 小修小改, 不影响原版本的修改, 可以在develop上进行, 然后pull requests 269 | 2. 大动干戈, 影响之前版本的修改, 需要新建一个分支eg: feature_random_proxy, 然后进行pull requests. 270 | 271 | 我会将新分支合并到develop上, 并在演示的机器上运行一段时间后合并至master. 272 | 273 | 以上, 感谢! 274 | 275 | # 问题反馈 276 | 277 | 任何问题欢迎在[Issues](https://github.com/1again/ProxyPool/issues)中反馈. 278 | 279 | 我们的目标是, 没有蛀牙! 
280 | -------------------------------------------------------------------------------- /Src/Config/ConfigManager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | import sys, os 5 | sys.path.append("Src") 6 | 7 | from Util.utilClass import ConfigParse 8 | from pymongo import MongoClient 9 | from Notify.NotifyManager import register_event, NOTIFY_EVENT 10 | 11 | def is_number(s): 12 | result = False 13 | try: 14 | float(s) 15 | result = True 16 | except ValueError: 17 | pass 18 | 19 | if not result: 20 | try: 21 | import unicodedata 22 | unicodedata.numeric(s) 23 | result = True 24 | except (TypeError, ValueError): 25 | pass 26 | 27 | return result 28 | 29 | class BaseConfig(object): 30 | config_name = "Config.ini" 31 | config_dir = "../../Config/" 32 | 33 | def __init__(self): 34 | pwd = os.path.dirname(os.path.realpath(__file__)) 35 | relative_path = "{pwd}/{config_dir}".format(pwd=pwd, config_dir=self.config_dir) 36 | self.config_dir = os.path.realpath(relative_path) 37 | self.setting = {} 38 | 39 | self.load_config() 40 | 41 | def load_config(self): 42 | self.config_path = os.path.join(self.config_dir, self.config_name) 43 | self.default_config_path = os.path.join(self.config_dir, "Config.ini.default") 44 | 45 | self.config = self.load_config_from_path() 46 | 47 | self.load_setting() 48 | 49 | def load_config_from_path(self): 50 | config = ConfigParse() 51 | if os.path.exists(self.config_path): 52 | config.read(self.config_path) 53 | else: 54 | config.read(self.default_config_path) 55 | 56 | return config 57 | 58 | def load_setting(self): 59 | for section in self.config.sections(): 60 | for item in self.config.items(section): 61 | field = item[0] 62 | value = int(item[1]) if is_number(item[1]) else item[1] 63 | self.setting[field] = value 64 | 65 | class DBConfig(object): 66 | db_name = "proxy" 67 | docs_name = "default" 68 | 69 | def __init__(self): 70 | client = MongoClient(host=base_config.setting.get("db_host"), port=base_config.setting.get("db_port"), username=base_config.setting.get("db_user"), password=base_config.setting.get("db_pass")) 71 | 72 | self.db = client[self.db_name] 73 | 74 | class SettingConfig(DBConfig): 75 | db_name = "proxy" 76 | docs_name = "setting" 77 | default_config = dict( 78 | verify_useful_proxy_concurrency = 100, 79 | verify_useful_proxy_interval = 30, 80 | 81 | fetch_new_proxy_concurrency = 100, 82 | fetch_new_proxy_interval = 30, 83 | 84 | # clean proxy when number is positive 85 | # disable clean proxy when number is -1 86 | hold_useful_proxy_number = -1, 87 | ) 88 | 89 | def __init__(self): 90 | super(SettingConfig, self).__init__() 91 | 92 | self.setting = {} 93 | self.load_data_to_db() 94 | self.load_setting_from_db() 95 | 96 | register_event(NOTIFY_EVENT["AFTER_SETTING_CHANGE"], self.dispatch_event) 97 | 98 | def dispatch_event(self, **kwargs): 99 | self.reload_setting_from_db(**kwargs) 100 | 101 | def load_data_to_db(self): 102 | for field, value in self.default_config.items(): 103 | query = { "setting_name": field } 104 | if self.db[self.docs_name].find_one(query): 105 | pass 106 | else: 107 | data = dict( 108 | setting_name = field, 109 | setting_value = value, 110 | setting_state = True, 111 | ) 112 | 113 | self.db[self.docs_name].insert_one(data) 114 | 115 | def load_setting_from_db(self): 116 | self.reload_setting_from_db() 117 | 118 | def reload_setting_from_db(self, **kwargs): 119 | cursor = self.db.setting.find() 120 | for item in cursor: 121 
| if item["setting_state"]: 122 | field = item["setting_name"] 123 | value = item["setting_value"] 124 | value = int(value) if is_number(value) else value 125 | self.setting[field] = value 126 | else: 127 | field = item["setting_name"] 128 | self.setting[field] = None 129 | 130 | class FetcherConfig(DBConfig): 131 | db_name = "proxy" 132 | docs_name = "fetchers" 133 | 134 | def __init__(self): 135 | super(FetcherConfig, self).__init__() 136 | 137 | self.fetcher_list = [] 138 | cursor = self.db[self.docs_name].find() 139 | for item in cursor: 140 | if item["status"]: 141 | self.fetcher_list.append(item["name"]) 142 | 143 | def update_fetcher_list(self, items): 144 | for item in items: 145 | query = { "name": item } 146 | if self.db[self.docs_name].find_one(query): 147 | pass 148 | else: 149 | data = dict( 150 | name = item, 151 | status = True, 152 | succ=0, 153 | fail=0, 154 | skip=0, 155 | total=0, 156 | ) 157 | self.db[self.docs_name].insert_one(data) 158 | self.fetcher_list.append(item) 159 | 160 | def get_fetcher_list(self): 161 | result = self.fetcher_list 162 | return result 163 | 164 | def update_stat(self, name, stat): 165 | query = { 166 | "name": name, 167 | } 168 | 169 | data = { 170 | "$inc": { 171 | "succ": stat["succ"], 172 | "fail": stat["fail"], 173 | "skip": stat["skip"], 174 | "total": stat["total"], 175 | } 176 | } 177 | 178 | self.db[self.docs_name].update(query, data) 179 | 180 | base_config = BaseConfig() 181 | setting_config = SettingConfig() 182 | # fetcher_config = FetcherConfig() 183 | 184 | if __name__ == '__main__': 185 | pass -------------------------------------------------------------------------------- /Src/Config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Config/__init__.py -------------------------------------------------------------------------------- /Src/DB/DbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | import os 5 | import sys 6 | import time 7 | 8 | from Config import ConfigManager 9 | from Util.utilClass import Singleton 10 | from DB.MongodbClient import MongodbClient 11 | from Log.LogManager import log 12 | from Manager import ProxyManager 13 | 14 | class DocsModel(object): 15 | docs_name = "test" 16 | 17 | def __init__(self): 18 | db_name = ConfigManager.base_config.setting.get("db_name") 19 | db_host = ConfigManager.base_config.setting.get("db_host") 20 | db_port = ConfigManager.base_config.setting.get("db_port") 21 | db_username = ConfigManager.base_config.setting.get("db_user") 22 | db_password = ConfigManager.base_config.setting.get("db_pass") 23 | 24 | self.mc = MongodbClient( 25 | host=db_host, 26 | port=db_port, 27 | db_name=db_name, 28 | docs_name=self.docs_name, 29 | username=db_username, 30 | password=db_password, 31 | ) 32 | 33 | def parse_regin_to_mongo(region_str): 34 | if region_str.startswith('!'): 35 | return { "$nin": [region_str[1:]] } 36 | else: 37 | return { "$in": [region_str] } 38 | 39 | class UsefulProxyDocsModel(DocsModel): 40 | docs_name = "useful_proxy" 41 | 42 | def cleanUsefulProxy(self, **kwargs): 43 | result = 0 44 | hold_number = kwargs.get("hold_number") 45 | 46 | query = {"total": {"$ne": 0}} 47 | total_number = self.mc.count(query) 48 | clean_number = total_number - hold_number 49 | 50 | if clean_number > 0 and hold_number != -1: 51 | operation_list = [ 52 
| { 53 | "$match": query, 54 | }, 55 | { 56 | "$project": { "total": 1, "disable_rate": { "$divide": ["$fail", "$total"] } }, 57 | }, 58 | { 59 | "$sort": { "disable_rate": -1, "total": -1 }, 60 | }, 61 | { 62 | "$limit": clean_number, 63 | }, 64 | ] 65 | 66 | 67 | items = self.mc.aggregate(operation_list) 68 | result = len(items) 69 | for item in items: 70 | query = { 71 | "_id": item["_id"] 72 | } 73 | self.mc.delete(query) 74 | 75 | return result 76 | 77 | def cleanRawProxy(self, **kwargs): 78 | 79 | query = { 80 | "health": { 81 | "$lt": 1 82 | } 83 | } 84 | 85 | data = self.mc.delete(query) 86 | result = data['n'] 87 | 88 | return result 89 | 90 | def getAllValidUsefulProxy(self, **kwargs): 91 | https = kwargs.get("https", None) 92 | region = kwargs.get("region", None) 93 | type_ = kwargs.get("type", None) 94 | 95 | result = [] 96 | operation_list = [ 97 | { 98 | "$match": { "total": { "$ne": 0 } } 99 | } 100 | ] 101 | 102 | if https: 103 | operation_list[0]["$match"]["https"] = { "$eq": https } 104 | 105 | if type_: 106 | operation_list[0]["$match"]["type"] = { "$eq": type_ } 107 | 108 | if region: 109 | operation_list[0]["$match"]["region_list"] = parse_regin_to_mongo(region) 110 | 111 | log.debug("getAllValidUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(operation_list))) 112 | result = self.mc.aggregate(operation_list) 113 | 114 | return result 115 | 116 | def getHighQualityUsefulProxy(self, **kwargs): 117 | query = { "quality": { "$gt": -1 } } 118 | result= self.mc.find(query) 119 | return result 120 | 121 | def getLowQualityUsefulProxy(self, **kwargs): 122 | query = { "quality": { "$lt": 0 } } 123 | result= self.mc.find(query) 124 | return result 125 | 126 | def getAllUsefulProxy(self, **kwargs): 127 | query = {} 128 | result = self.mc.find(query) 129 | return result 130 | 131 | def checkProxyExists(self, proxy): 132 | query = {"proxy": proxy} 133 | result = self.mc.exists(query) 134 | return result 135 | 136 | def checkUsefulProxyExists(self, proxy): 137 | result = self.checkProxyExists(proxy) 138 | return result 139 | 140 | # TODO: refine function 141 | def getSampleUsefulProxy(self, **kwargs): 142 | https = kwargs.get("https", None) 143 | region = kwargs.get("region", None) 144 | type_ = kwargs.get("type", None) 145 | 146 | result = None 147 | operation_list = [ 148 | { 149 | "$match": { 150 | "total": { "$ne": 0}, 151 | "last_status": { "$eq": ProxyManager.PROXY_LAST_STATUS["SUCC"] }, 152 | } 153 | }, 154 | { 155 | "$sample": { "size": 1} 156 | } 157 | ] 158 | 159 | if https: 160 | operation_list[0]["$match"]["https"] = { "$eq": https } 161 | 162 | if type_: 163 | operation_list[0]["$match"]["type"] = { "$eq": type_ } 164 | 165 | if region: 166 | operation_list[0]["$match"]["region_list"] = parse_regin_to_mongo(region) 167 | 168 | log.debug("getSampleUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(operation_list))) 169 | data = self.mc.aggregate(operation_list) 170 | if data: 171 | result = data[0] 172 | 173 | return result 174 | 175 | def getVerifyUsefulProxy(self, now): 176 | query = { 177 | "next_verify_time": { 178 | "$lt": now 179 | } 180 | } 181 | result = self.mc.find(query) 182 | return result 183 | 184 | def getQualityUsefulProxy(self, **kwargs): 185 | https = kwargs.get("https", None) 186 | region = kwargs.get("region", None) 187 | type_ = kwargs.get("type", None) 188 | 189 | result = None 190 | operation_list = [ 191 | { 192 | "$match": { 193 | "total": { "$ne": 0 }, 194 | } 195 | }, 196 | { 197 | "$sort": { 
"quality": -1, "total": -1 }, 198 | }, 199 | ] 200 | 201 | if https: 202 | operation_list[0]["$match"]["https"] = { "$eq": https } 203 | 204 | if type_: 205 | operation_list[0]["$match"]["type"] = { "$eq": type_ } 206 | 207 | if region: 208 | operation_list[0]["$match"]["region_list"] = parse_regin_to_mongo(region) 209 | 210 | log.debug("getSampleUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(operation_list))) 211 | result = self.mc.aggregate(operation_list) 212 | 213 | return result 214 | 215 | def getProxyNum(self): 216 | result = self.mc.count() 217 | return result 218 | 219 | def saveUsefulProxy(self, data): 220 | self.mc.insert(data) 221 | 222 | def updateUsefulProxy(self, proxy, data): 223 | query = {"proxy": proxy} 224 | self.updateProxy(query, data) 225 | 226 | def deleteUsefulProxy(self, proxy): 227 | query = {"proxy": proxy} 228 | self.mc.delete(query) 229 | 230 | def tickUsefulProxyVaildSucc(self, proxy): 231 | now_time = int(time.time()) 232 | query = {"proxy": proxy} 233 | 234 | data = { 235 | "$inc": { 236 | "succ": 1, 237 | "keep_succ": 1, 238 | }, 239 | "$set": { 240 | "last_status": ProxyManager.PROXY_LAST_STATUS["SUCC"], 241 | "last_succ_time": now_time 242 | }, 243 | } 244 | 245 | item = self.mc.find_one(query) 246 | if item["quality"] < 0: 247 | data["$set"]["quality"] = 1 248 | else: 249 | data["$inc"]["quality"] = 1 250 | 251 | self.updateProxy(query, data) 252 | 253 | def getProxy(self, proxy): 254 | query = {"proxy": proxy} 255 | result = self.mc.find_one(query) 256 | return result 257 | 258 | def updateProxy(self, query, data): 259 | self.mc.upsert(query, data) 260 | 261 | def tickUsefulProxyVaildFail(self, proxy): 262 | query = {"proxy": proxy} 263 | data = { 264 | "$inc": { 265 | "fail": 1, 266 | "quality": -1 267 | }, 268 | "$set": { 269 | "last_status": ProxyManager.PROXY_LAST_STATUS["FAIL"], 270 | "keep_succ": 0 271 | }, 272 | } 273 | self.updateProxy(query, data) 274 | 275 | def tickUsefulProxyVaildTotal(self, proxy): 276 | query = {"proxy": proxy} 277 | data = {'$inc': {'total': 1}} 278 | self.updateProxy(query, data) 279 | 280 | class RawProxyDocsModel(DocsModel): 281 | docs_name = "raw_proxy" 282 | 283 | def getAll(self): 284 | result = self.mc.find() 285 | return result 286 | 287 | def getAllRawProxy(self, **kwargs): 288 | result = self.getAll() 289 | return result 290 | 291 | def cleanRawProxy(self, **kwargs): 292 | 293 | query = { 294 | "health": { 295 | "$lt": 1 296 | } 297 | } 298 | 299 | data = self.mc.delete(query) 300 | result = data['n'] 301 | 302 | return result 303 | 304 | def checkProxyExists(self, proxy): 305 | query = {"proxy": proxy} 306 | result = self.mc.exists(query) 307 | return result 308 | 309 | def checkRawProxyExists(self, proxy): 310 | result = self.checkProxyExists(proxy) 311 | return result 312 | 313 | def getProxyNum(self): 314 | result = self.mc.count() 315 | return result 316 | 317 | def saveRawProxy(self, data): 318 | result = self.mc.insert(data) 319 | return result 320 | 321 | def deleteRawProxy(self, proxy): 322 | query = {"proxy": proxy} 323 | result = self.mc.delete(query) 324 | return result 325 | 326 | def tickRawProxyVaildFail(self, proxy): 327 | query = {"proxy": proxy} 328 | data = {'$inc': {'health': -1}} 329 | self.updateProxy(query, data) 330 | 331 | class DomainCounterDocsModel(DocsModel): 332 | docs_name = "domain_counter" 333 | 334 | def tickDomainRequestState(self, domain, code): 335 | query = {"domain": domain} 336 | data = {'$inc': {code: 1}} 337 | self.mc.upsert(query, data) 338 | 
339 | def getDomainCounter(self, domain): 340 | query = {"domain": domain} 341 | result = self.mc.find_one(query) 342 | return result 343 | 344 | class FetchersDocsModel(DocsModel): 345 | docs_name = "fetchers" 346 | 347 | def getAllFetcher(self): 348 | query = {} 349 | result = self.mc.find(query) 350 | return result 351 | 352 | def getExecFetcher(self, now): 353 | query = {"next_fetch_time": {"$lt": now}} 354 | result = self.mc.find(query) 355 | return result 356 | 357 | def getFetcher(self, name): 358 | query = { "name": name } 359 | result = self.mc.find(query) 360 | return result 361 | 362 | def updateFetcher(self, name, data): 363 | query = {"name": name} 364 | self.mc.upsert(query, data) 365 | 366 | if __name__ == "__main__": 367 | pass 368 | -------------------------------------------------------------------------------- /Src/DB/MongodbClient.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from pymongo import MongoClient 4 | 5 | 6 | class MongodbClient(object): 7 | db_name = "proxy" 8 | 9 | def __init__(self, host, port, db_name, docs_name, **kwargs): 10 | self.conn = MongoClient(host, port, **kwargs) 11 | self.db = self.conn[db_name] 12 | self.docs = self.db[docs_name] 13 | 14 | def find_one(self, query): 15 | result = self.docs.find_one(query) 16 | return result 17 | 18 | def insert(self, data): 19 | result = self.docs.insert(data) 20 | return result 21 | 22 | def aggregate(self, operation_list): 23 | result = list(self.docs.aggregate(operation_list)) 24 | return result 25 | 26 | def delete(self, query): 27 | result = self.docs.remove(query) 28 | return result 29 | 30 | def find(self, query): 31 | result = list(self.docs.find(query)) 32 | return result 33 | 34 | def update(self, query, data): 35 | result = self.docs.update(query, data) 36 | return result 37 | 38 | def upsert(self, query, data): 39 | result = self.docs.update(query, data, upsert=True) 40 | return result 41 | 42 | def exists(self, query): 43 | result = False 44 | data = self.find_one(query) 45 | if data: 46 | result = True 47 | 48 | return result 49 | 50 | def count(self, query={}): 51 | result = self.docs.count(query) 52 | return result 53 | 54 | if __name__ == "__main__": 55 | # db = MongodbClient('first', 'localhost', 27017) 56 | # db.put('127.0.0.1:1') 57 | # db2 = MongodbClient('second', 'localhost', 27017) 58 | # db2.put('127.0.0.1:2') 59 | # print(db.pop()) 60 | pass 61 | -------------------------------------------------------------------------------- /Src/DB/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/2: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /Src/Fetcher/FetcherManager.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("Src") 3 | import os 4 | import importlib 5 | 6 | from Config import ConfigManager 7 | from Manager import ProxyManager 8 | 9 | SKIP_FILE_LIST = [ 10 | "__init__.py", 11 | "__pycache__", 12 | ] 13 | 14 | def init(): 15 | file_names = os.listdir("Src/Fetcher/fetchers") 16 | for file_name in file_names: 17 | if file_name in SKIP_FILE_LIST: 18 | pass 
19 | else: 20 | fetcher_name = os.path.splitext(file_name)[0] 21 | fetcher_class = getFetcherClass(fetcher_name) 22 | fetcher_host = fetcher_class.fetcher_host 23 | 24 | item = ProxyManager.proxy_manager.getFetcher(fetcher_name) 25 | if item: 26 | pass 27 | else: 28 | saveDefaultFetcher(fetcher_name, fetcher_host) 29 | 30 | return True 31 | 32 | def saveDefaultFetcher(name, host): 33 | data = dict( 34 | name = name, 35 | host = host, 36 | status = True, 37 | succ=0, 38 | fail=0, 39 | skip=0, 40 | total=0, 41 | interval=30, 42 | next_fetch_time=0, 43 | ) 44 | ProxyManager.proxy_manager.updateFetcher(name, data) 45 | 46 | def getFetcherClass(name): 47 | module_name = "Fetcher.fetchers.%s" % (name) 48 | module = importlib.import_module(module_name) 49 | result = getattr(module, "CustomFetcher") 50 | return result 51 | 52 | init() 53 | 54 | if __name__ == '__main__': 55 | pass -------------------------------------------------------------------------------- /Src/Fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | 5 | class Fetcher(): 6 | 7 | fetcher_host = "cn-proxy.com" 8 | 9 | def __init__(self): 10 | split_list = self.fetcher_host.split('.') 11 | split_length = len(split_list) 12 | if split_length == 4: 13 | name = split_list[-3] 14 | else: 15 | name = split_list[-2] 16 | 17 | self.fetcher_name = name 18 | 19 | -------------------------------------------------------------------------------- /Src/Fetcher/fetchers/66ip.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | # !/usr/bin/env python 4 | 5 | import re 6 | 7 | from Util.WebRequest import WebRequest 8 | from Util.utilFunction import getHtmlTree 9 | from Fetcher import Fetcher 10 | 11 | 12 | class CustomFetcher(Fetcher): 13 | 14 | fetcher_host = "www.66ip.cn" 15 | 16 | def run(self): 17 | area = 33 18 | page = 1 19 | for area_index in range(1, area + 1): 20 | for i in range(1, page + 1): 21 | url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i) 22 | html_tree = getHtmlTree(url) 23 | tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]") 24 | if len(tr_list) == 0: 25 | continue 26 | for tr in tr_list: 27 | yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0] 28 | break 29 | -------------------------------------------------------------------------------- /Src/Fetcher/fetchers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Fetcher/fetchers/__init__.py -------------------------------------------------------------------------------- /Src/Fetcher/fetchers/cn-proxy.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | # !/usr/bin/env python 4 | 5 | import re 6 | 7 | from Util.WebRequest import WebRequest 8 | from Util.utilFunction import getHtmlTree 9 | 10 | 11 | class CustomFetcher(): 12 | 13 | fetcher_host = "cn-proxy.com" 14 | 15 | def run(self): 16 | 17 | urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] 18 | request = WebRequest() 19 | for url in urls: 20 | r = request.get(url) 21 | proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) 22 | for proxy in proxies: 23 | yield ':'.join(proxy) 24 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/coderbusy.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "proxy.coderbusy.com"
14 | 
15 |     def run(self):
16 | 
17 |         urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
18 |         request = WebRequest()
19 |         for url in urls:
20 |             r = request.get(url)
21 |             proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text)
22 |             for proxy in proxies:
23 |                 yield ':'.join(proxy)
24 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/data5u.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.data5u.com"
14 | 
15 |     def run(self):
16 | 
17 |         url_list = [
18 |             'http://www.data5u.com/',
19 |             'http://www.data5u.com/free/gngn/index.shtml',
20 |             'http://www.data5u.com/free/gnpt/index.shtml'
21 |         ]
22 |         for url in url_list:
23 |             html_tree = getHtmlTree(url)
24 |             ul_list = html_tree.xpath('//ul[@class="l2"]')
25 |             for ul in ul_list:
26 |                 try:
27 |                     yield ':'.join(ul.xpath('.//li/text()')[0:2])
28 |                 except Exception as e:
29 |                     print(e)
30 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/goubanjia.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.goubanjia.com"
14 | 
15 |     def run(self):
16 | 
17 |         url = "http://www.goubanjia.com/"
18 |         tree = getHtmlTree(url)
19 |         proxy_list = tree.xpath('//td[@class="ip"]')
20 |         # This site pads entries with hidden decoy digits, so a naive scrape picks up extra digits or '.' characters;
21 |         # we need to filter out the content that is hidden via display:none
22 |         xpath_str = """.//*[not(contains(@style, 'display: none'))
23 |                             and not(contains(@style, 'display:none'))
24 |                             and not(contains(@class, 'port'))
25 |                             ]/text()
26 |                     """
27 |         for each_proxy in proxy_list:
28 |             try:
29 |                 # The ':' sits bare under the td while the rest is in div/span/p tags, so extract the ip first, then the port
30 |                 ip_addr = ''.join(each_proxy.xpath(xpath_str))
31 |                 port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0]
32 |                 yield '{}:{}'.format(ip_addr, port)
33 |             except Exception as e:
34 |                 pass
35 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/ip181.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.ip181.com"
14 | 
15 |     def run(self):
16 | 
17 |         url = 'http://www.ip181.com/'
18 |         html_tree = getHtmlTree(url)
19 |         try:
20 |             tr_list = html_tree.xpath('//tr')[1:]
21 |             for tr in tr_list:
22 |                 yield ':'.join(tr.xpath('./td/text()')[0:2])
23 |         except Exception as e:
24 |             pass
25 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/ip3366.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.ip3366.net"
14 | 
15 |     def run(self):
16 | 
17 |         urls = ['http://www.ip3366.net/free/']
18 |         request = WebRequest()
19 |         for url in urls:
20 |             r = request.get(url)
21 |             proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text)
22 |             for proxy in proxies:
23 |                 yield ":".join(proxy)
24 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/iphai.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.iphai.com"
14 | 
15 |     def run(self):
16 | 
17 |         urls = [
18 |             'http://www.iphai.com/free/ng',
19 |             'http://www.iphai.com/free/np',
20 |             'http://www.iphai.com/free/wg',
21 |             'http://www.iphai.com/free/wp'
22 |         ]
23 |         request = WebRequest()
24 |         for url in urls:
25 |             r = request.get(url)
26 |             proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?',
27 |                                  r.text)
28 |             for proxy in proxies:
29 |                 yield ":".join(proxy)
30 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/jiangxianli.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "ip.jiangxianli.com"
14 | 
15 |     def run(self):
16 |         page_count = 8
17 |         for i in range(1, page_count + 1):
18 |             url = 'http://ip.jiangxianli.com/?page={}'.format(i)
19 |             html_tree = getHtmlTree(url)
20 |             tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr")
21 |             if len(tr_list) == 0:
22 |                 continue
23 |             for tr in tr_list:
24 |                 yield tr.xpath("./td[2]/text()")[0] + ":" + tr.xpath("./td[3]/text()")[0]
25 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/kuaidaili.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.kuaidaili.com"
14 | 
15 |     def run(self):
16 | 
17 |         url_list = [
18 |             'https://www.kuaidaili.com/free/inha/{page}/',
19 |             'https://www.kuaidaili.com/free/intr/{page}/'
20 |         ]
21 |         for url in url_list:
22 |             for page in range(1, 5):
23 |                 page_url = url.format(page=page)
24 |                 tree = getHtmlTree(page_url)
25 |                 proxy_list = tree.xpath('.//table//tr')
26 |                 for tr in proxy_list[1:]:
27 |                     yield ':'.join(tr.xpath('./td/text()')[0:2])
28 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/mimiip.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.mimiip.com"
14 | 
15 |     def run(self):
16 | 
17 |         url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)]  # domestic elite (high-anonymity)
18 |         url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)]  # domestic ordinary anonymous
19 |         url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)]  # domestic transparent
20 |         url_list = url_gngao + url_gnpu + url_gntou
21 | 
22 |         request = WebRequest()
23 |         for url in url_list:
24 |             r = request.get(url)
25 |             if r.status_code != 200:
26 |                 break
27 |             proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text)
28 |             for proxy in proxies:
29 |                 yield ':'.join(proxy)
30 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/proxy-list.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "proxy-list.org"
14 | 
15 |     def run(self):
16 | 
17 |         urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
18 |         request = WebRequest()
19 |         import base64
20 |         for url in urls:
21 |             r = request.get(url)
22 |             if r.status_code != 200:
23 |                 break
24 |             proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
25 |             for proxy in proxies:
26 |                 yield base64.b64decode(proxy).decode()
27 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/proxylistplus.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "list.proxylistplus.com"
14 | 
15 |     def run(self):
16 | 
17 |         urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
18 |         request = WebRequest()
19 |         for url in urls:
20 |             r = request.get(url)
21 |             proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text)
22 |             for proxy in proxies:
23 |                 yield ':'.join(proxy)
24 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/xdaili.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.xdaili.cn"
14 | 
15 |     def run(self):
16 | 
17 |         url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
18 |         request = WebRequest()
19 |         try:
20 |             res = request.get(url).json()
21 |             for row in res['RESULT']['rows']:
22 |                 yield '{}:{}'.format(row['ip'], row['port'])
23 |         except Exception as e:
24 |             pass
25 | 
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/xicidaili.py:
--------------------------------------------------------------------------------
1 | 
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 | 
5 | import re
6 | 
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | 
10 | 
11 | class CustomFetcher():
12 | 
13 |     fetcher_host = "www.xicidaili.com"
14 | 
15 |     def run(self):
16 |         page_count = 2
17 | 
18 |         url_list = [
19 |             'http://www.xicidaili.com/nn/',  # elite (high-anonymity)
20 |             'http://www.xicidaili.com/nt/',  # transparent
21 |         ]
22 |         for each_url in url_list:
23 |             for i in range(1, page_count + 1):
24 |                 page_url = each_url + str(i)
25 |                 tree = getHtmlTree(page_url)
26 |                 proxy_list = tree.xpath('.//table[@id="ip_list"]//tr[position()>1]')
27 |                 for proxy in proxy_list:
28 |                     try:
29 |                         yield ':'.join(proxy.xpath('./td/text()')[0:2])
30 |                     except Exception as e:
31 |                         pass
32 | 
--------------------------------------------------------------------------------
/Src/Forward/ForwardManager.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("Src")
3 | 
4 | from Forward.base import HTTP, Proxy
5 | 
6 | from Manager import ProxyManager
7 | from Config import ConfigManager
8 | 
9 | class ForwardProxy(Proxy):
10 | 
11 |     def _get_host_and_port(self):
12 |         https = None
13 |         if self.request.method == b'CONNECT':
14 |             https = ProxyManager.PROXY_HTTPS["ENABLE"]
15 | 
16 |         domain = self.request.url.netloc
17 |         if isinstance(domain, bytes):
18 |             domain = domain.decode("utf8")
19 | 
20 |         ProxyManager.proxy_manager.tickDomainRequestState(domain, "total")
21 |         counter = ProxyManager.proxy_manager.getDomainCounter(domain)
22 |         count = counter.get("total")
23 |         item = ProxyManager.proxy_manager.getQualityUsefulProxy(https=https, count=count, domain=domain)
24 |         proxy = item.get("proxy")
25 |         address = proxy.split(":")
26 |         return address
27 | 
28 |     def before_process_response(self):
29 |         domain = self.request.url.netloc
30 |         if isinstance(domain, bytes):
31 |             domain = domain.decode("utf8")
32 | 
33 |         if isinstance(self.response.code, bytes):
34 |             status_code = "status_code_%s" % self.response.code.decode()
35 |         else:
36 |             status_code = "status_code_%s" % self.response.code
37 | 
38 |         ProxyManager.proxy_manager.tickDomainRequestState(domain, status_code)
39 | 
40 | class ForwardHttp(HTTP):
41 | 
42 |     def __init__(self):
43 |         bind_host = ConfigManager.base_config.setting.get("forward_bind_host")
44 |         bind_port = ConfigManager.base_config.setting.get("forward_bind_port")
45 | 
46 |         super(ForwardHttp,
self).__init__(hostname=bind_host, port=bind_port) 47 | 48 | def handle(self, client): 49 | 50 | fp = ForwardProxy(client, 51 | auth_code=self.auth_code, 52 | server_recvbuf_size=self.server_recvbuf_size, 53 | client_recvbuf_size=self.client_recvbuf_size, 54 | ) 55 | 56 | fp.daemon = True 57 | fp.start() 58 | 59 | 60 | if __name__ == '__main__': 61 | fh = ForwardHttp() 62 | fh.run() -------------------------------------------------------------------------------- /Src/Forward/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Forward/__init__.py -------------------------------------------------------------------------------- /Src/Forward/base.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | proxy.py 5 | ~~~~~~~~ 6 | 7 | HTTP Proxy Server in Python. 8 | 9 | :copyright: (c) 2013-2018 by Abhinav Singh. 10 | :license: BSD, see LICENSE for more details. 11 | """ 12 | import os 13 | import sys 14 | import errno 15 | import base64 16 | import socket 17 | import select 18 | import logging 19 | import argparse 20 | import datetime 21 | import threading 22 | from collections import namedtuple 23 | 24 | if os.name != 'nt': 25 | import resource 26 | 27 | VERSION = (0, 3) 28 | __version__ = '.'.join(map(str, VERSION[0:2])) 29 | __description__ = 'HTTP Proxy Server in Python' 30 | __author__ = 'Abhinav Singh' 31 | __author_email__ = 'mailsforabhinav@gmail.com' 32 | __homepage__ = 'https://github.com/abhinavsingh/proxy.py' 33 | __download_url__ = '%s/archive/master.zip' % __homepage__ 34 | __license__ = 'BSD' 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | PY3 = sys.version_info[0] == 3 39 | 40 | if PY3: # pragma: no cover 41 | text_type = str 42 | binary_type = bytes 43 | from urllib import parse as urlparse 44 | else: # pragma: no cover 45 | text_type = unicode 46 | binary_type = str 47 | import urlparse 48 | 49 | 50 | def text_(s, encoding='utf-8', errors='strict'): # pragma: no cover 51 | """Utility to ensure text-like usability. 52 | 53 | If ``s`` is an instance of ``binary_type``, return 54 | ``s.decode(encoding, errors)``, otherwise return ``s``""" 55 | if isinstance(s, binary_type): 56 | return s.decode(encoding, errors) 57 | return s 58 | 59 | 60 | def bytes_(s, encoding='utf-8', errors='strict'): # pragma: no cover 61 | """Utility to ensure binary-like usability. 
62 | 63 | If ``s`` is an instance of ``text_type``, return 64 | ``s.encode(encoding, errors)``, otherwise return ``s``""" 65 | if isinstance(s, text_type): 66 | return s.encode(encoding, errors) 67 | return s 68 | 69 | 70 | version = bytes_(__version__) 71 | CRLF, COLON, SP = b'\r\n', b':', b' ' 72 | PROXY_AGENT_HEADER = b'Proxy-agent: proxy.py v' + version 73 | 74 | PROXY_TUNNEL_ESTABLISHED_RESPONSE_PKT = CRLF.join([ 75 | b'HTTP/1.1 200 Connection established', 76 | PROXY_AGENT_HEADER, 77 | CRLF 78 | ]) 79 | 80 | BAD_GATEWAY_RESPONSE_PKT = CRLF.join([ 81 | b'HTTP/1.1 502 Bad Gateway', 82 | PROXY_AGENT_HEADER, 83 | b'Content-Length: 11', 84 | b'Connection: close', 85 | CRLF 86 | ]) + b'Bad Gateway' 87 | 88 | PROXY_AUTHENTICATION_REQUIRED_RESPONSE_PKT = CRLF.join([ 89 | b'HTTP/1.1 407 Proxy Authentication Required', 90 | PROXY_AGENT_HEADER, 91 | b'Content-Length: 29', 92 | b'Connection: close', 93 | CRLF 94 | ]) + b'Proxy Authentication Required' 95 | 96 | 97 | class ChunkParser(object): 98 | """HTTP chunked encoding response parser.""" 99 | 100 | states = namedtuple('ChunkParserStates', ( 101 | 'WAITING_FOR_SIZE', 102 | 'WAITING_FOR_DATA', 103 | 'COMPLETE' 104 | ))(1, 2, 3) 105 | 106 | def __init__(self): 107 | self.state = ChunkParser.states.WAITING_FOR_SIZE 108 | self.body = b'' # Parsed chunks 109 | self.chunk = b'' # Partial chunk received 110 | self.size = None # Expected size of next following chunk 111 | 112 | def parse(self, data): 113 | more = True if len(data) > 0 else False 114 | while more: 115 | more, data = self.process(data) 116 | 117 | def process(self, data): 118 | if self.state == ChunkParser.states.WAITING_FOR_SIZE: 119 | # Consume prior chunk in buffer 120 | # in case chunk size without CRLF was received 121 | data = self.chunk + data 122 | self.chunk = b'' 123 | # Extract following chunk data size 124 | line, data = HttpParser.split(data) 125 | if not line: # CRLF not received 126 | self.chunk = data 127 | data = b'' 128 | else: 129 | self.size = int(line, 16) 130 | self.state = ChunkParser.states.WAITING_FOR_DATA 131 | elif self.state == ChunkParser.states.WAITING_FOR_DATA: 132 | remaining = self.size - len(self.chunk) 133 | self.chunk += data[:remaining] 134 | data = data[remaining:] 135 | if len(self.chunk) == self.size: 136 | data = data[len(CRLF):] 137 | self.body += self.chunk 138 | if self.size == 0: 139 | self.state = ChunkParser.states.COMPLETE 140 | else: 141 | self.state = ChunkParser.states.WAITING_FOR_SIZE 142 | self.chunk = b'' 143 | self.size = None 144 | return len(data) > 0, data 145 | 146 | 147 | class HttpParser(object): 148 | """HTTP request/response parser.""" 149 | 150 | states = namedtuple('HttpParserStates', ( 151 | 'INITIALIZED', 152 | 'LINE_RCVD', 153 | 'RCVING_HEADERS', 154 | 'HEADERS_COMPLETE', 155 | 'RCVING_BODY', 156 | 'COMPLETE'))(1, 2, 3, 4, 5, 6) 157 | 158 | types = namedtuple('HttpParserTypes', ( 159 | 'REQUEST_PARSER', 160 | 'RESPONSE_PARSER' 161 | ))(1, 2) 162 | 163 | def __init__(self, parser_type): 164 | assert parser_type in (HttpParser.types.REQUEST_PARSER, HttpParser.types.RESPONSE_PARSER) 165 | self.type = parser_type 166 | self.state = HttpParser.states.INITIALIZED 167 | 168 | self.raw = b'' 169 | self.buffer = b'' 170 | 171 | self.headers = dict() 172 | self.body = None 173 | 174 | self.method = None 175 | self.url = None 176 | self.code = None 177 | self.reason = None 178 | self.version = None 179 | 180 | self.chunk_parser = None 181 | 182 | def is_chunked_encoded_response(self): 183 | return self.type == 
HttpParser.types.RESPONSE_PARSER and \ 184 | b'transfer-encoding' in self.headers and \ 185 | self.headers[b'transfer-encoding'][1].lower() == b'chunked' 186 | 187 | def parse(self, data): 188 | self.raw += data 189 | data = self.buffer + data 190 | self.buffer = b'' 191 | 192 | more = True if len(data) > 0 else False 193 | while more: 194 | more, data = self.process(data) 195 | self.buffer = data 196 | 197 | def process(self, data): 198 | if self.state in (HttpParser.states.HEADERS_COMPLETE, 199 | HttpParser.states.RCVING_BODY, 200 | HttpParser.states.COMPLETE) and \ 201 | (self.method == b'POST' or self.type == HttpParser.types.RESPONSE_PARSER): 202 | if not self.body: 203 | self.body = b'' 204 | 205 | if b'content-length' in self.headers: 206 | self.state = HttpParser.states.RCVING_BODY 207 | self.body += data 208 | if len(self.body) >= int(self.headers[b'content-length'][1]): 209 | self.state = HttpParser.states.COMPLETE 210 | elif self.is_chunked_encoded_response(): 211 | if not self.chunk_parser: 212 | self.chunk_parser = ChunkParser() 213 | self.chunk_parser.parse(data) 214 | if self.chunk_parser.state == ChunkParser.states.COMPLETE: 215 | self.body = self.chunk_parser.body 216 | self.state = HttpParser.states.COMPLETE 217 | 218 | return False, b'' 219 | 220 | line, data = HttpParser.split(data) 221 | if line is False: 222 | return line, data 223 | 224 | if self.state == HttpParser.states.INITIALIZED: 225 | self.process_line(line) 226 | elif self.state in (HttpParser.states.LINE_RCVD, HttpParser.states.RCVING_HEADERS): 227 | self.process_header(line) 228 | 229 | # When connect request is received without a following host header 230 | # See `TestHttpParser.test_connect_request_without_host_header_request_parse` for details 231 | if self.state == HttpParser.states.LINE_RCVD and \ 232 | self.type == HttpParser.types.REQUEST_PARSER and \ 233 | self.method == b'CONNECT' and \ 234 | data == CRLF: 235 | self.state = HttpParser.states.COMPLETE 236 | 237 | # When raw request has ended with \r\n\r\n and no more http headers are expected 238 | # See `TestHttpParser.test_request_parse_without_content_length` and 239 | # `TestHttpParser.test_response_parse_without_content_length` for details 240 | elif self.state == HttpParser.states.HEADERS_COMPLETE and \ 241 | self.type == HttpParser.types.REQUEST_PARSER and \ 242 | self.method != b'POST' and \ 243 | self.raw.endswith(CRLF * 2): 244 | self.state = HttpParser.states.COMPLETE 245 | elif self.state == HttpParser.states.HEADERS_COMPLETE and \ 246 | self.type == HttpParser.types.REQUEST_PARSER and \ 247 | self.method == b'POST' and \ 248 | (b'content-length' not in self.headers or 249 | (b'content-length' in self.headers and 250 | int(self.headers[b'content-length'][1]) == 0)) and \ 251 | self.raw.endswith(CRLF * 2): 252 | self.state = HttpParser.states.COMPLETE 253 | 254 | return len(data) > 0, data 255 | 256 | def process_line(self, data): 257 | line = data.split(SP) 258 | if self.type == HttpParser.types.REQUEST_PARSER: 259 | self.method = line[0].upper() 260 | self.url = urlparse.urlsplit(line[1]) 261 | self.version = line[2] 262 | else: 263 | self.version = line[0] 264 | self.code = line[1] 265 | self.reason = b' '.join(line[2:]) 266 | self.state = HttpParser.states.LINE_RCVD 267 | 268 | def process_header(self, data): 269 | if len(data) == 0: 270 | if self.state == HttpParser.states.RCVING_HEADERS: 271 | self.state = HttpParser.states.HEADERS_COMPLETE 272 | elif self.state == HttpParser.states.LINE_RCVD: 273 | self.state = 
HttpParser.states.RCVING_HEADERS 274 | else: 275 | self.state = HttpParser.states.RCVING_HEADERS 276 | parts = data.split(COLON) 277 | key = parts[0].strip() 278 | value = COLON.join(parts[1:]).strip() 279 | self.headers[key.lower()] = (key, value) 280 | 281 | def build_url(self): 282 | if not self.url: 283 | return b'/None' 284 | 285 | url = self.url.path 286 | if url == b'': 287 | url = b'/' 288 | if not self.url.query == b'': 289 | url += b'?' + self.url.query 290 | if not self.url.fragment == b'': 291 | url += b'#' + self.url.fragment 292 | return url 293 | 294 | def build(self, del_headers=None, add_headers=None): 295 | req = b' '.join([self.method, self.build_url(), self.version]) 296 | req += CRLF 297 | 298 | if not del_headers: 299 | del_headers = [] 300 | for k in self.headers: 301 | if k not in del_headers: 302 | req += self.build_header(self.headers[k][0], self.headers[k][1]) + CRLF 303 | 304 | if not add_headers: 305 | add_headers = [] 306 | for k in add_headers: 307 | req += self.build_header(k[0], k[1]) + CRLF 308 | 309 | req += CRLF 310 | if self.body: 311 | req += self.body 312 | 313 | return req 314 | 315 | @staticmethod 316 | def build_header(k, v): 317 | return k + b': ' + v 318 | 319 | @staticmethod 320 | def split(data): 321 | pos = data.find(CRLF) 322 | if pos == -1: 323 | return False, data 324 | line = data[:pos] 325 | data = data[pos + len(CRLF):] 326 | return line, data 327 | 328 | 329 | class Connection(object): 330 | """TCP server/client connection abstraction.""" 331 | 332 | def __init__(self, what): 333 | self.conn = None 334 | self.buffer = b'' 335 | self.closed = False 336 | self.what = what # server or client 337 | 338 | def send(self, data): 339 | # TODO: Gracefully handle BrokenPipeError exceptions 340 | return self.conn.send(data) 341 | 342 | def recv(self, bufsiz=8192): 343 | try: 344 | data = self.conn.recv(bufsiz) 345 | if len(data) == 0: 346 | logger.debug('rcvd 0 bytes from %s' % self.what) 347 | return None 348 | logger.debug('rcvd %d bytes from %s' % (len(data), self.what)) 349 | return data 350 | except Exception as e: 351 | if e.errno == errno.ECONNRESET: 352 | logger.debug('%r' % e) 353 | else: 354 | logger.exception( 355 | 'Exception while receiving from connection %s %r with reason %r' % (self.what, self.conn, e)) 356 | return None 357 | 358 | def close(self): 359 | self.conn.close() 360 | self.closed = True 361 | 362 | def buffer_size(self): 363 | return len(self.buffer) 364 | 365 | def has_buffer(self): 366 | return self.buffer_size() > 0 367 | 368 | def queue(self, data): 369 | self.buffer += data 370 | 371 | def flush(self): 372 | sent = self.send(self.buffer) 373 | self.buffer = self.buffer[sent:] 374 | logger.debug('flushed %d bytes to %s' % (sent, self.what)) 375 | 376 | 377 | class Server(Connection): 378 | """Establish connection to destination server.""" 379 | 380 | def __init__(self, host, port): 381 | super(Server, self).__init__(b'server') 382 | self.addr = (host, int(port)) 383 | 384 | def __del__(self): 385 | if self.conn: 386 | self.close() 387 | 388 | def connect(self): 389 | self.conn = socket.create_connection((self.addr[0], self.addr[1])) 390 | 391 | 392 | class Client(Connection): 393 | """Accepted client connection.""" 394 | 395 | def __init__(self, conn, addr): 396 | super(Client, self).__init__(b'client') 397 | self.conn = conn 398 | self.addr = addr 399 | 400 | 401 | class ProxyError(Exception): 402 | pass 403 | 404 | 405 | class ProxyConnectionFailed(ProxyError): 406 | 407 | def __init__(self, host, port, reason): 
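        # (note: the failing host/port and reason are stashed for logging; _process_rlist
        # catches this exception and replies to the client with BAD_GATEWAY_RESPONSE_PKT
        # via _get_response_pkt_by_exception)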
408 | self.host = host 409 | self.port = port 410 | self.reason = reason 411 | 412 | def __str__(self): 413 | return '' % (self.host, self.port, self.reason) 414 | 415 | 416 | class ProxyAuthenticationFailed(ProxyError): 417 | pass 418 | 419 | 420 | class Proxy(threading.Thread): 421 | """HTTP proxy implementation. 422 | 423 | Accepts `Client` connection object and act as a proxy between client and server. 424 | """ 425 | 426 | def __init__(self, client, auth_code=None, server_recvbuf_size=8192, client_recvbuf_size=8192): 427 | super(Proxy, self).__init__() 428 | 429 | self.start_time = self._now() 430 | self.last_activity = self.start_time 431 | 432 | self.auth_code = auth_code 433 | self.client = client 434 | self.client_recvbuf_size = client_recvbuf_size 435 | self.server = None 436 | self.server_recvbuf_size = server_recvbuf_size 437 | 438 | self.request = HttpParser(HttpParser.types.REQUEST_PARSER) 439 | self.response = HttpParser(HttpParser.types.RESPONSE_PARSER) 440 | 441 | @staticmethod 442 | def _now(): 443 | return datetime.datetime.utcnow() 444 | 445 | def _inactive_for(self): 446 | return (self._now() - self.last_activity).seconds 447 | 448 | def _is_inactive(self): 449 | return self._inactive_for() > 30 450 | 451 | def _get_host_and_port(self): 452 | if self.request.method == b'CONNECT': 453 | host, port = self.request.url.path.split(COLON) 454 | elif self.request.url: 455 | host, port = self.request.url.hostname, self.request.url.port if self.request.url.port else 80 456 | else: 457 | raise Exception('Invalid request\n%s' % self.request.raw) 458 | 459 | return host, port 460 | 461 | def _process_request(self, data): 462 | # once we have connection to the server 463 | # we don't parse the http request packets 464 | # any further, instead just pipe incoming 465 | # data from client to server 466 | if self.server and not self.server.closed: 467 | self.server.queue(data) 468 | return 469 | 470 | # parse http request 471 | self.request.parse(data) 472 | 473 | # once http request parser has reached the state complete 474 | # we attempt to establish connection to destination server 475 | if self.request.state == HttpParser.states.COMPLETE: 476 | logger.debug('request parser is in state complete') 477 | 478 | if self.auth_code: 479 | if b'proxy-authorization' not in self.request.headers or \ 480 | self.request.headers[b'proxy-authorization'][1] != self.auth_code: 481 | raise ProxyAuthenticationFailed() 482 | 483 | host, port = self._get_host_and_port() 484 | 485 | self.server = Server(host, port) 486 | try: 487 | logger.debug('connecting to server %s:%s' % (host, port)) 488 | self.server.connect() 489 | logger.debug('connected to server %s:%s' % (host, port)) 490 | except Exception as e: # TimeoutError, socket.gaierror 491 | self.server.closed = True 492 | raise ProxyConnectionFailed(host, port, repr(e)) 493 | 494 | # for http connect methods (https requests) 495 | # queue appropriate response for client 496 | # notifying about established connection 497 | # if self.request.method == b'CONNECT': 498 | self.server.queue(data) 499 | # for usual http requests, re-build request packet 500 | # and queue for the server with appropriate headers 501 | # else: 502 | # self.server.queue(self.request.build( 503 | # del_headers=[b'proxy-authorization', b'proxy-connection', b'connection', b'keep-alive'], 504 | # # add_headers=[(b'Via', b'1.1 proxy.py v%s' % version), (b'Connection', b'Close')] 505 | # )) 506 | 507 | def _process_response(self, data): 508 | # parse incoming response packet 509 | # 
only for non-https requests 510 | if not self.request.method == b'CONNECT': 511 | self.response.parse(data) 512 | 513 | self.before_process_response() 514 | 515 | # queue data for client 516 | self.client.queue(data) 517 | 518 | def _access_log(self): 519 | host, port = self.server.addr if self.server else (None, None) 520 | if self.request.method == b'CONNECT': 521 | logger.info( 522 | '%s:%s - %s %s:%s' % (self.client.addr[0], self.client.addr[1], self.request.method, host, port)) 523 | elif self.request.method: 524 | logger.info('%s:%s - %s %s:%s%s - %s %s - %s bytes' % ( 525 | self.client.addr[0], self.client.addr[1], self.request.method, host, port, self.request.build_url(), 526 | self.response.code, self.response.reason, len(self.response.raw))) 527 | 528 | def _get_waitable_lists(self): 529 | rlist, wlist, xlist = [self.client.conn], [], [] 530 | if self.client.has_buffer(): 531 | wlist.append(self.client.conn) 532 | if self.server and not self.server.closed: 533 | rlist.append(self.server.conn) 534 | if self.server and not self.server.closed and self.server.has_buffer(): 535 | wlist.append(self.server.conn) 536 | return rlist, wlist, xlist 537 | 538 | def _process_wlist(self, w): 539 | if self.client.conn in w: 540 | logger.debug('client is ready for writes, flushing client buffer') 541 | self.client.flush() 542 | 543 | if self.server and not self.server.closed and self.server.conn in w: 544 | logger.debug('server is ready for writes, flushing server buffer') 545 | self.server.flush() 546 | 547 | def _process_rlist(self, r): 548 | """Returns True if connection to client must be closed.""" 549 | if self.client.conn in r: 550 | logger.debug('client is ready for reads, reading') 551 | data = self.client.recv(self.client_recvbuf_size) 552 | self.last_activity = self._now() 553 | 554 | if not data: 555 | logger.debug('client closed connection, breaking') 556 | return True 557 | 558 | try: 559 | self._process_request(data) 560 | except (ProxyAuthenticationFailed, ProxyConnectionFailed) as e: 561 | logger.exception(e) 562 | self.client.queue(Proxy._get_response_pkt_by_exception(e)) 563 | self.client.flush() 564 | return True 565 | 566 | if self.server and not self.server.closed and self.server.conn in r: 567 | logger.debug('server is ready for reads, reading') 568 | data = self.server.recv(self.server_recvbuf_size) 569 | self.last_activity = self._now() 570 | 571 | if not data: 572 | logger.debug('server closed connection') 573 | self.server.close() 574 | else: 575 | self._process_response(data) 576 | 577 | return False 578 | 579 | def _process(self): 580 | while True: 581 | rlist, wlist, xlist = self._get_waitable_lists() 582 | r, w, x = select.select(rlist, wlist, xlist, 1) 583 | 584 | self._process_wlist(w) 585 | if self._process_rlist(r): 586 | break 587 | 588 | if self.client.buffer_size() == 0: 589 | if self.response.state == HttpParser.states.COMPLETE: 590 | logger.debug('client buffer is empty and response state is complete, breaking') 591 | break 592 | 593 | if self._is_inactive(): 594 | logger.debug('client buffer is empty and maximum inactivity has reached, breaking') 595 | break 596 | 597 | @staticmethod 598 | def _get_response_pkt_by_exception(e): 599 | if e.__class__.__name__ == 'ProxyAuthenticationFailed': 600 | return PROXY_AUTHENTICATION_REQUIRED_RESPONSE_PKT 601 | if e.__class__.__name__ == 'ProxyConnectionFailed': 602 | return BAD_GATEWAY_RESPONSE_PKT 603 | 604 | def run(self): 605 | logger.debug('Proxying connection %r' % self.client.conn) 606 | try: 607 | 
self._process() 608 | except KeyboardInterrupt: 609 | pass 610 | except Exception as e: 611 | logger.exception('Exception while handling connection %r with reason %r' % (self.client.conn, e)) 612 | finally: 613 | logger.debug( 614 | 'closing client connection with pending client buffer size %d bytes' % self.client.buffer_size()) 615 | self.client.close() 616 | if self.server: 617 | logger.debug( 618 | 'closed client connection with pending server buffer size %d bytes' % self.server.buffer_size()) 619 | self._access_log() 620 | logger.debug('Closing proxy for connection %r at address %r' % (self.client.conn, self.client.addr)) 621 | 622 | def before_process_response(self): 623 | pass 624 | 625 | class TCP(object): 626 | """TCP server implementation. 627 | 628 | Subclass MUST implement `handle` method. It accepts an instance of accepted `Client` connection. 629 | """ 630 | 631 | def __init__(self, hostname='127.0.0.1', port=8899, backlog=100): 632 | self.hostname = hostname 633 | self.port = port 634 | self.backlog = backlog 635 | self.socket = None 636 | 637 | def handle(self, client): 638 | raise NotImplementedError() 639 | 640 | def run(self): 641 | try: 642 | logger.info('Starting server on port %d' % self.port) 643 | self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 644 | self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) 645 | self.socket.bind((self.hostname, self.port)) 646 | self.socket.listen(self.backlog) 647 | while True: 648 | conn, addr = self.socket.accept() 649 | client = Client(conn, addr) 650 | self.handle(client) 651 | except Exception as e: 652 | logger.exception('Exception while running the server %r' % e) 653 | finally: 654 | logger.info('Closing server socket') 655 | self.socket.close() 656 | 657 | 658 | class HTTP(TCP): 659 | """HTTP proxy server implementation. 660 | 661 | Spawns new process to proxy accepted client connection. 662 | """ 663 | 664 | def __init__(self, hostname='127.0.0.1', port=8899, backlog=100, 665 | auth_code=None, server_recvbuf_size=8192, client_recvbuf_size=8192): 666 | super(HTTP, self).__init__(hostname, port, backlog) 667 | self.auth_code = auth_code 668 | self.client_recvbuf_size = client_recvbuf_size 669 | self.server_recvbuf_size = server_recvbuf_size 670 | 671 | def handle(self, client): 672 | proxy = Proxy(client, 673 | auth_code=self.auth_code, 674 | server_recvbuf_size=self.server_recvbuf_size, 675 | client_recvbuf_size=self.client_recvbuf_size) 676 | proxy.daemon = True 677 | proxy.start() 678 | 679 | 680 | def set_open_file_limit(soft_limit): 681 | """Configure open file description soft limit on supported OS.""" 682 | if os.name != 'nt': # resource module not available on Windows OS 683 | curr_soft_limit, curr_hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE) 684 | if curr_soft_limit < soft_limit < curr_hard_limit: 685 | resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, curr_hard_limit)) 686 | logger.info('Open file descriptor soft limit set to %d' % soft_limit) 687 | 688 | 689 | def main(): 690 | parser = argparse.ArgumentParser( 691 | description='proxy.py v%s' % __version__, 692 | epilog='Having difficulty using proxy.py? Report at: %s/issues/new' % __homepage__ 693 | ) 694 | 695 | parser.add_argument('--hostname', default='127.0.0.1', help='Default: 127.0.0.1') 696 | parser.add_argument('--port', default='8899', help='Default: 8899') 697 | parser.add_argument('--backlog', default='100', help='Default: 100. 
' 698 | 'Maximum number of pending connections to proxy server') 699 | parser.add_argument('--basic-auth', default=None, help='Default: No authentication. ' 700 | 'Specify colon separated user:password ' 701 | 'to enable basic authentication.') 702 | parser.add_argument('--server-recvbuf-size', default='8192', help='Default: 8 KB. ' 703 | 'Maximum amount of data received from the ' 704 | 'server in a single recv() operation. Bump this ' 705 | 'value for faster downloads at the expense of ' 706 | 'increased RAM.') 707 | parser.add_argument('--client-recvbuf-size', default='8192', help='Default: 8 KB. ' 708 | 'Maximum amount of data received from the ' 709 | 'client in a single recv() operation. Bump this ' 710 | 'value for faster uploads at the expense of ' 711 | 'increased RAM.') 712 | parser.add_argument('--open-file-limit', default='1024', help='Default: 1024. ' 713 | 'Maximum number of files (TCP connections) ' 714 | 'that proxy.py can open concurrently.') 715 | parser.add_argument('--log-level', default='INFO', help='DEBUG, INFO (default), WARNING, ERROR, CRITICAL') 716 | args = parser.parse_args() 717 | 718 | logging.basicConfig(level=getattr(logging, args.log_level), 719 | format='%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s') 720 | 721 | try: 722 | set_open_file_limit(int(args.open_file_limit)) 723 | 724 | auth_code = None 725 | if args.basic_auth: 726 | auth_code = b'Basic %s' % base64.b64encode(bytes_(args.basic_auth)) 727 | 728 | proxy = HTTP(hostname=args.hostname, 729 | port=int(args.port), 730 | backlog=int(args.backlog), 731 | auth_code=auth_code, 732 | server_recvbuf_size=int(args.server_recvbuf_size), 733 | client_recvbuf_size=int(args.client_recvbuf_size)) 734 | proxy.run() 735 | except KeyboardInterrupt: 736 | pass 737 | 738 | 739 | if __name__ == '__main__': 740 | main() 741 | -------------------------------------------------------------------------------- /Src/Log/LogHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | 5 | import logging 6 | 7 | from logging.handlers import TimedRotatingFileHandler 8 | from Config import ConfigManager 9 | 10 | LOG_LEVEL = { 11 | "CRITICAL": 50, 12 | "FATAL": 50, 13 | "ERROR": 40, 14 | "WARNING": 30, 15 | "WARN": 30, 16 | "INFO": 20, 17 | "DEBUG": 10, 18 | "NOTSET": 0, 19 | } 20 | 21 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) 22 | ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir, os.pardir) 23 | LOG_PATH = os.path.join(ROOT_PATH, 'logs') 24 | if not os.path.exists(LOG_PATH): 25 | os.mkdir(LOG_PATH) 26 | 27 | class LogHandler(logging.Logger): 28 | 29 | def __init__(self, level=None, stream=True, file=True): 30 | self.name = "ProxyPool" 31 | if level: 32 | self.level = level 33 | else: 34 | self.level = LOG_LEVEL.get(ConfigManager.base_config.setting.get("log_level"), LOG_LEVEL["INFO"]) 35 | 36 | super(LogHandler, self).__init__(self.name, level=self.level) 37 | if stream: 38 | self.__setStreamHandler__() 39 | if file: 40 | self.__setFileHandler__() 41 | 42 | def __setFileHandler__(self, level=None): 43 | file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name)) 44 | # 设置日志回滚, 保存在log目录, 一天保存一个文件, 保留15天 45 | file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15) 46 | file_handler.suffix = '%Y%m%d.log' 47 | if not level: 48 | file_handler.setLevel(self.level) 49 | else: 50 | file_handler.setLevel(level) 51 | formatter = logging.Formatter('%(asctime)s 
%(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 52 | 53 | file_handler.setFormatter(formatter) 54 | self.file_handler = file_handler 55 | self.addHandler(file_handler) 56 | 57 | def __setStreamHandler__(self, level=None): 58 | stream_handler = logging.StreamHandler() 59 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 60 | stream_handler.setFormatter(formatter) 61 | if not level: 62 | stream_handler.setLevel(self.level) 63 | else: 64 | stream_handler.setLevel(level) 65 | self.addHandler(stream_handler) 66 | 67 | def resetName(self, name): 68 | self.name = name 69 | self.removeHandler(self.file_handler) 70 | self.__setFileHandler__() 71 | 72 | 73 | if __name__ == '__main__': 74 | log = LogHandler() 75 | log.info('this is a test msg') 76 | -------------------------------------------------------------------------------- /Src/Log/LogManager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from Log.LogHandler import LogHandler 3 | 4 | log = LogHandler() 5 | 6 | def init(): 7 | pass 8 | 9 | if __name__ == '__main__': 10 | log.info('this is a test msg') 11 | -------------------------------------------------------------------------------- /Src/Log/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Log/__init__.py -------------------------------------------------------------------------------- /Src/Manager/ProxyClean.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | import sys 5 | sys.path.append("Src") 6 | import time 7 | import threading 8 | 9 | from Manager.ProxyManager import proxy_manager 10 | from Log.LogManager import log 11 | from Config import ConfigManager 12 | 13 | try: 14 | from Queue import Queue # py3 15 | except: 16 | from queue import Queue # py2 17 | 18 | # 这样的实现多线程有问题, 后期无法扩展到独立的进程. 
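# (in English: doing this with threads is problematic -- it cannot later be split out into independent processes.)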
19 | # must call classmethod initQueue before when thread start 20 | class ProxyClean(threading.Thread): 21 | def __init__(self, **kwargs): 22 | super(ProxyClean, self).__init__(**kwargs) 23 | 24 | class ProxyCleanUseful(ProxyClean): 25 | 26 | def run(self): 27 | hold_number = ConfigManager.setting_config.setting.get("hold_useful_proxy_number") 28 | total_number = proxy_manager.getUsefulProxyNumber() 29 | clean_number = proxy_manager.cleanUsefulProxy(hold_number=hold_number) 30 | 31 | 32 | log.info("clean useful, total_number:{total_number}, clean_number:{clean_number}, hold_number:{hold_number}".format(total_number=total_number, clean_number=clean_number, hold_number=hold_number)) 33 | 34 | class ProxyCleanRaw(ProxyClean): 35 | 36 | def run(self): 37 | total_number = proxy_manager.getRawProxyNumber() 38 | clean_number = proxy_manager.cleanRawProxy() 39 | remain_number = total_number - clean_number 40 | 41 | log.info("clean raw_proxy, total_number:{total_number}, clean_number:{clean_number}, remain_number:{remain_number}".format(total_number=total_number, clean_number=clean_number, remain_number=remain_number)) 42 | 43 | if __name__ == "__main__": 44 | t1 = ProxyCleanUseful() 45 | t1.daemon = True 46 | t1.start() 47 | 48 | t2 = ProxyCleanRaw() 49 | t2.daemon = True 50 | t2.start() 51 | 52 | t1.join() 53 | t2.join() -------------------------------------------------------------------------------- /Src/Manager/ProxyFetch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | from gevent import monkey, pool 5 | monkey.patch_all() 6 | 7 | import sys 8 | sys.path.append("Src") 9 | import time 10 | import threading 11 | import gevent 12 | 13 | from Manager import ProxyManager 14 | # from ProxyGetter.getFreeProxy import GetFreeProxy 15 | from Fetcher import FetcherManager 16 | from Log.LogManager import log 17 | from Config import ConfigManager 18 | from Util.utilFunction import verifyProxyFormat 19 | 20 | try: 21 | from Queue import Queue # py3 22 | except: 23 | from queue import Queue # py2 24 | 25 | # 这样的实现多线程有问题, 后期无法扩展到独立的机器上. 
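# (in English: this threading approach is problematic -- the shared class-level Queue keeps all workers in one process, so it cannot later be scaled out to separate machines.)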
26 | # must call classmethod initQueue before 27 | class ProxyFetch(object): 28 | queue = Queue() 29 | 30 | @classmethod 31 | def initQueue(cls): 32 | fetchers = ProxyManager.proxy_manager.getExecFetcher() 33 | for fetcher in fetchers: 34 | cls.queue.put(fetcher) 35 | 36 | def start(self): 37 | concurrency = ConfigManager.setting_config.setting.get("fetch_new_proxy_concurrency") 38 | task_pool = pool.Pool(concurrency) 39 | 40 | queue_size = self.queue.qsize() 41 | if queue_size > 0: 42 | greenlet_list = [] 43 | for _ in range(queue_size): 44 | greenlet_list.append(task_pool.spawn(self.fetch)) 45 | 46 | gevent.joinall(greenlet_list) 47 | else: 48 | log.info("Not Have Fetcher Of Now, skip!") 49 | 50 | def fetch(self): 51 | start_time = time.time() 52 | total = 0 53 | succ = 0 54 | fail = 0 55 | skip = 0 56 | 57 | fetcher = self.queue.get() 58 | name = fetcher["name"] 59 | 60 | fetcher_class = FetcherManager.getFetcherClass(name) 61 | log.debug("fetch [{name}] proxy start".format(name=name)) 62 | try: 63 | f = fetcher_class() 64 | for proxy in f.run(): 65 | proxy = proxy.strip() 66 | if proxy and verifyProxyFormat(proxy) and \ 67 | not ProxyManager.proxy_manager.checkUsefulProxyExists(proxy): 68 | 69 | ProxyManager.proxy_manager.saveUsefulProxy(proxy) 70 | succ = succ + 1 71 | log.debug("fetch [{name}] proxy {proxy} succ".format(name=name, proxy=proxy)) 72 | else: 73 | skip = skip + 1 74 | log.debug("fetch [{name}] proxy {proxy} skip".format(name=name, proxy=proxy)) 75 | 76 | total = total + 1 77 | except Exception as e: 78 | log.error("fetch [{name}] proxy fail: {error}".format(name=name, error=e)) 79 | fail = fail + 1 80 | 81 | self.queue.task_done() 82 | 83 | now = int(time.time()) 84 | elapsed_time = int(now - start_time) 85 | 86 | next_fetch_time = now + (fetcher["interval"] * 60) 87 | 88 | data = { 89 | "$inc": { 90 | "succ": succ, 91 | "fail": fail, 92 | "skip": skip, 93 | "total": total, 94 | }, 95 | "$set": { 96 | "next_fetch_time": next_fetch_time, 97 | } 98 | } 99 | 100 | ProxyManager.proxy_manager.updateFetcher(name, data) 101 | log.info("fetch [{name:^15}] proxy finish, \ 102 | total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s". 
\ 103 | format(name=name, total=total, succ=succ, fail=fail, skip=skip, elapsed_time=elapsed_time)) 104 | 105 | def run(self): 106 | while self.queue.qsize(): 107 | self.fetch() 108 | 109 | 110 | 111 | if __name__ == "__main__": 112 | ProxyFetch.initQueue() 113 | t = ProxyFetch() 114 | t.start() -------------------------------------------------------------------------------- /Src/Manager/ProxyManager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | import random 5 | 6 | import datx 7 | import time 8 | 9 | from Util import EnvUtil 10 | from DB.DbClient import UsefulProxyDocsModel, RawProxyDocsModel, DomainCounterDocsModel, FetchersDocsModel 11 | from Config import ConfigManager 12 | from Util.utilFunction import verifyProxyFormat 13 | from ProxyGetter.getFreeProxy import GetFreeProxy 14 | from Log.LogManager import log 15 | 16 | PROXY_LAST_STATUS = { 17 | "UNKNOWN": 0, 18 | "SUCC": 1, 19 | "FAIL": 2, 20 | } 21 | 22 | PROXY_TYPE = { 23 | "UNKNOWN": 0, 24 | "CLEAR": 1, 25 | "ANONYMOUS": 2, 26 | "DYNAMIC": 3, 27 | } 28 | 29 | PROXY_HTTPS = { 30 | "UNKNOWN": 0, 31 | "ENABLE": 1, 32 | "DISABLE": 2, 33 | } 34 | 35 | IP_DATA_PATH = "Data/17monipdb.datx" 36 | 37 | class ProxyManager(object): 38 | 39 | def __init__(self): 40 | self.useful_proxy = UsefulProxyDocsModel() 41 | self.raw_proxy = RawProxyDocsModel() 42 | self.domain_counter = DomainCounterDocsModel() 43 | self.fetchers = FetchersDocsModel() 44 | self.datx = datx.City(IP_DATA_PATH) 45 | 46 | self.quality_useful_proxy_list = [] 47 | self.quality_domain_index = {} 48 | 49 | def cleanUsefulProxy(self, **kwargs): 50 | result = self.useful_proxy.cleanUsefulProxy(**kwargs) 51 | return result 52 | 53 | def cleanRawProxy(self, **kwargs): 54 | result = self.raw_proxy.cleanRawProxy(**kwargs) 55 | return result 56 | 57 | def getAllValidUsefulProxy(self, **kwargs): 58 | result = self.useful_proxy.getAllValidUsefulProxy(**kwargs) 59 | return result 60 | 61 | def getAllUsefulProxy(self, **kwargs): 62 | result = self.useful_proxy.getAllUsefulProxy(**kwargs) 63 | return result 64 | 65 | def getVerifyUsefulProxy(self): 66 | now = int(time.time()) 67 | result = self.useful_proxy.getVerifyUsefulProxy(now) 68 | return result 69 | 70 | def getLowQualityUsefulProxy(self, **kwagrs): 71 | result = self.useful_proxy.getLowQualityUsefulProxy(**kwagrs) 72 | return result 73 | 74 | def getHighQualityUsefulProxy(self, **kwagrs): 75 | result = self.useful_proxy.getHighQualityUsefulProxy(**kwagrs) 76 | return result 77 | 78 | def getAllRawProxy(self): 79 | result = self.raw_proxy.getAllRawProxy() 80 | return result 81 | 82 | def checkRawProxyExists(self, proxy): 83 | result = self.raw_proxy.checkRawProxyExists(proxy) 84 | return result 85 | 86 | def checkUsefulProxyExists(self, proxy): 87 | result = self.useful_proxy.checkUsefulProxyExists(proxy) 88 | return result 89 | 90 | def getSampleUsefulProxy(self, **kwargs): 91 | result = self.useful_proxy.getSampleUsefulProxy(**kwargs) 92 | return result 93 | 94 | def getQualityUsefulProxy(self, **kwargs): 95 | count = kwargs.get("count", 1) 96 | domain = kwargs.get("domain", None) 97 | 98 | index = self.quality_domain_index.get(domain, 0) 99 | 100 | if index == 0: 101 | self.quality_useful_proxy_list = self.useful_proxy.getQualityUsefulProxy(**kwargs) 102 | 103 | index = (count-1) % len(self.quality_useful_proxy_list) 104 | self.quality_domain_index[domain] = index+1 105 | 106 | result = self.quality_useful_proxy_list[index] 107 
| return result 108 | 109 | def deleteRawProxy(self, proxy): 110 | self.raw_proxy.deleteRawProxy(proxy) 111 | 112 | def saveRawProxy(self, proxy): 113 | data = { 114 | "proxy": proxy, 115 | "health": ConfigManager.setting_config.setting.get("init_raw_proxy_health") 116 | } 117 | self.raw_proxy.saveRawProxy(data) 118 | 119 | def getProxyRegion(self, ip): 120 | data = self.datx.find(ip) 121 | region_list = data[:3] 122 | result = [] 123 | for item in region_list: 124 | if item and item not in result: 125 | result.append(item) 126 | 127 | return result 128 | 129 | def saveUsefulProxy(self, proxy): 130 | ip = proxy.split(":")[0] 131 | region_list = self.getProxyRegion(ip) 132 | 133 | data = { 134 | "proxy": proxy, 135 | "succ": 0, 136 | "keep_succ": 0, 137 | "fail": 0, 138 | "total": 0, 139 | "quality": 0, 140 | "https": PROXY_HTTPS["UNKNOWN"], 141 | "type": PROXY_TYPE["UNKNOWN"], 142 | "region_list": region_list, 143 | "last_status": PROXY_LAST_STATUS["UNKNOWN"], 144 | "last_succ_time": 0, 145 | "next_verify_time": 0, 146 | 147 | } 148 | 149 | self.useful_proxy.saveUsefulProxy(data) 150 | 151 | def updateUsefulProxy(self, item, info): 152 | data = { 153 | "$set": {} 154 | } 155 | 156 | if item.get("type") == PROXY_TYPE["UNKNOWN"]: 157 | data["$set"]["type"] = info["type"] 158 | 159 | if item.get("https") == PROXY_HTTPS["UNKNOWN"]: 160 | data["$set"]["https"] = info["https"] 161 | 162 | if len(data["$set"]) > 0: 163 | self.useful_proxy.updateUsefulProxy(item["proxy"], data) 164 | 165 | def deleteUsefulProxy(self, proxy): 166 | self.useful_proxy.deleteUsefulProxy(proxy) 167 | 168 | def tickUsefulProxyVaildSucc(self, proxy): 169 | self.useful_proxy.tickUsefulProxyVaildSucc(proxy) 170 | 171 | def tickUsefulProxyVaildFail(self, proxy): 172 | self.useful_proxy.tickUsefulProxyVaildFail(proxy) 173 | 174 | def tickUsefulProxyVaildTotal(self, proxy): 175 | self.useful_proxy.tickUsefulProxyVaildTotal(proxy) 176 | 177 | def updateUsefulProxyNextVerifyTime(self, proxy, start_time=None): 178 | 179 | item = self.getProxy(proxy) 180 | multiple = abs(item["quality"]) 181 | if item["quality"] > 0: 182 | multiple = 0 183 | 184 | start_time = start_time if start_time else int(time.time()) 185 | interval = ConfigManager.setting_config.setting.get("verify_useful_proxy_interval") 186 | next_verify_time = start_time + (multiple * interval * 60) 187 | 188 | query = { 189 | "proxy": proxy 190 | } 191 | data = { 192 | "$set": { 193 | "next_verify_time": next_verify_time 194 | } 195 | } 196 | self.useful_proxy.updateProxy(query, data) 197 | 198 | def tickRawProxyVaildFail(self, proxy): 199 | self.raw_proxy.tickRawProxyVaildFail(proxy) 200 | 201 | def getProxy(self, proxy): 202 | result = self.useful_proxy.getProxy(proxy) 203 | return result 204 | 205 | def getProxyNumber(self): 206 | total_raw_proxy = self.getRawProxyNumber() 207 | total_useful_queue = self.getUsefulProxyNumber() 208 | result = {'raw_proxy': total_raw_proxy, 'useful_proxy': total_useful_queue} 209 | return result 210 | 211 | def getRawProxyNumber(self): 212 | result = self.raw_proxy.getProxyNum() 213 | return result 214 | 215 | def getUsefulProxyNumber(self): 216 | result = self.useful_proxy.getProxyNum() 217 | return result 218 | 219 | def tickDomainRequestState(self, domain, code): 220 | 221 | self.domain_counter.tickDomainRequestState(domain, code) 222 | 223 | def getDomainCounter(self, domain): 224 | result = self.domain_counter.getDomainCounter(domain) 225 | return result 226 | 227 | 228 | def getAllFetcher(self): 229 | result = 
self.fetchers.getAllFetcher() 230 | return result 231 | 232 | def getExecFetcher(self): 233 | now = int(time.time()) 234 | result = self.fetchers.getExecFetcher(now) 235 | return result 236 | 237 | def getFetcher(self, name): 238 | result = self.fetchers.getFetcher(name) 239 | return result 240 | 241 | 242 | def updateFetcher(self, name, data): 243 | self.fetchers.updateFetcher(name, data) 244 | 245 | proxy_manager = ProxyManager() 246 | 247 | if __name__ == '__main__': 248 | # proxy_manager.refresh() 249 | pass 250 | -------------------------------------------------------------------------------- /Src/Manager/ProxyVerify.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | from gevent import monkey, pool 5 | monkey.patch_all() 6 | 7 | import sys 8 | sys.path.append("Src") 9 | 10 | import threading 11 | import requests 12 | import re 13 | import time 14 | import gevent 15 | 16 | from Manager import ProxyManager 17 | from Log.LogManager import log 18 | from Config import ConfigManager 19 | 20 | try: 21 | from Queue import Queue # py3 22 | except: 23 | from queue import Queue # py2 24 | 25 | class ProxyVerify(object): 26 | 27 | # http可用才会检查https, 会不会有只开通https的代理呢? 28 | def getProxyInfo(self, proxy): 29 | info = {} 30 | 31 | data = proxy.split(':') 32 | info["ip"] = data[0] 33 | info["port"] = data[1] 34 | info["address"] = proxy 35 | 36 | proxies = { 37 | "http": proxy, 38 | "https": proxy, 39 | } 40 | http_url = "http://httpbin.org/ip" 41 | https_url = "https://httpbin.org/ip" 42 | 43 | result = False 44 | 45 | info["https"] = ProxyManager.PROXY_HTTPS["UNKNOWN"] 46 | info["type"] = ProxyManager.PROXY_TYPE["UNKNOWN"] 47 | # http verify 48 | try: 49 | r = requests.get(http_url, proxies=proxies, timeout=10, verify=False) 50 | data = r.json() 51 | ip_string = data["origin"] 52 | ip_list = ip_string.split(", ") 53 | 54 | status_result = r.status_code == 200 55 | content_result = "origin" in data 56 | if status_result and content_result: 57 | result = True 58 | 59 | if len(ip_list) > 1: 60 | info["type"] = ProxyManager.PROXY_TYPE["CLEAR"] 61 | else: 62 | info["type"] = ProxyManager.PROXY_TYPE["ANONYMOUS"] 63 | 64 | except Exception as e: 65 | log.debug("proxy:[{proxy}] http verify fail, error:{error}".format(proxy=proxy, error=e)) 66 | result = False 67 | 68 | if result: 69 | 70 | # https verify 71 | try: 72 | r = requests.get(https_url, proxies=proxies, timeout=10, verify=False) 73 | status_result = r.status_code == 200 74 | content_result = "origin" in data 75 | if status_result and content_result: 76 | info["https"] = ProxyManager.PROXY_HTTPS["ENABLE"] 77 | 78 | except Exception as e: 79 | log.debug("proxy [{proxy}] https verify fail, error:{error}".format(proxy=proxy, error=e)) 80 | info["https"] = ProxyManager.PROXY_HTTPS["DISABLE"] 81 | 82 | return info 83 | 84 | def defaultVerifyProxy(self, proxy): 85 | result = None 86 | 87 | if isinstance(proxy, bytes): 88 | proxy = proxy.decode('utf8') 89 | 90 | proxies = { 91 | "http": proxy, 92 | } 93 | http_url = "http://httpbin.org/ip" 94 | 95 | try: 96 | r = requests.get(http_url, proxies=proxies, timeout=10, verify=False) 97 | data = r.json() 98 | 99 | status_result = r.status_code == 200 100 | content_result = "origin" in data 101 | if status_result and content_result: 102 | result = True 103 | 104 | except Exception as e: 105 | log.debug("proxy:{proxy} http verify proxy fail, error:{error}".format(proxy=proxy, error=e)) 106 | result = False 107 | 108 | 
return result 109 | 110 | def customVerifyProxy(self, proxy): 111 | result = None 112 | 113 | if isinstance(proxy, bytes): 114 | proxy = proxy.decode('utf8') 115 | 116 | proxies = { 117 | "http": proxy, 118 | "https": proxy, 119 | } 120 | verify_url = ConfigManager.setting_config.setting.get("custom_verify_url") 121 | 122 | try: 123 | content_result = True 124 | r = requests.get(verify_url, proxies=proxies, timeout=10, verify=False) 125 | pattern = ConfigManager.setting_config.setting.get("custom_verify_content") 126 | if pattern: 127 | content = r.content.decode('utf-8') 128 | search_result = re.search(pattern, content) 129 | content_result = search_result != None 130 | 131 | status_result = r.status_code == 200 132 | if status_result and content_result: 133 | result = True 134 | 135 | except Exception as e: 136 | log.debug("proxy:{proxy} http verify proxy fail, error:{error}".format(proxy=proxy, error=e)) 137 | result = False 138 | 139 | return result 140 | 141 | def verify(self): 142 | pass 143 | 144 | def run(self): 145 | while self.queue.qsize(): 146 | self.verify() 147 | 148 | # 这样的实现多线程有问题, 后期无法扩展到独立的机器上. 149 | # must call classmethod initQueue before 150 | class ProxyVerifyRaw(ProxyVerify): 151 | queue = Queue() 152 | useful_proxies = {} 153 | 154 | @classmethod 155 | def initQueue(cls): 156 | items = ProxyManager.proxy_manager.getAllRawProxy() 157 | for item in items: 158 | cls.queue.put(item) 159 | 160 | items = ProxyManager.proxy_manager.getAllUsefulProxy() 161 | data = { item["proxy"]: 1 for item in items } 162 | cls.useful_proxies = data 163 | 164 | def verify(self): 165 | raw_proxy_item = self.queue.get() 166 | raw_proxy = raw_proxy_item.get("proxy") 167 | if isinstance(raw_proxy, bytes): 168 | raw_proxy = raw_proxy.decode('utf8') 169 | 170 | if raw_proxy not in self.useful_proxies: 171 | if ConfigManager.setting_config.setting.get("custom_verify_url"): 172 | verify_result = self.customVerifyProxy(raw_proxy) 173 | else: 174 | verify_result = self.defaultVerifyProxy(raw_proxy) 175 | 176 | if verify_result: 177 | ProxyManager.proxy_manager.saveUsefulProxy(raw_proxy) 178 | ProxyManager.proxy_manager.deleteRawProxy(raw_proxy) 179 | self.useful_proxies[raw_proxy] = True 180 | 181 | self.stat["succ"] = self.stat["succ"] + 1 182 | log.debug("raw_proxy:{raw_proxy} verify succ".format(raw_proxy=raw_proxy)) 183 | else: 184 | ProxyManager.proxy_manager.tickRawProxyVaildFail(raw_proxy) 185 | 186 | self.stat["fail"] = self.stat["fail"] + 1 187 | log.debug("raw_proxy:{raw_proxy} verify fail".format(raw_proxy=raw_proxy)) 188 | else: 189 | ProxyManager.proxy_manager.deleteRawProxy(raw_proxy) 190 | 191 | self.stat["skip"] = self.stat["skip"] + 1 192 | log.debug("raw_proxy:{raw_proxy} verify repetition".format(raw_proxy=raw_proxy)) 193 | 194 | self.queue.task_done() 195 | self.stat["total"] = self.stat["total"] + 1 196 | 197 | def start(self): 198 | 199 | start_time = time.time() 200 | log.debug("raw_proxy proxy verify start") 201 | 202 | self.stat = dict( 203 | total = 0, 204 | succ = 0, 205 | fail = 0, 206 | skip = 0, 207 | ) 208 | 209 | concurrency = ConfigManager.setting_config.setting.get("verify_raw_proxy_concurrency") 210 | queue_size = self.queue.qsize() 211 | if concurrency > queue_size: 212 | spawn_num = queue_size 213 | else: 214 | spawn_num = concurrency 215 | 216 | greenlet_list = [] 217 | for _ in range(spawn_num): 218 | greenlet_list.append(gevent.spawn(self.run)) 219 | 220 | gevent.joinall(greenlet_list) 221 | 222 | end_time = time.time() 223 | elapsed_time = int(end_time - 
start_time) 224 | log.info("raw_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s".format(total=self.stat["total"], succ=self.stat["succ"], fail=self.stat["fail"], skip=self.stat["skip"], elapsed_time=elapsed_time)) 225 | 226 | # 这样的实现多线程有问题, 后期无法扩展到独立的机器上. 227 | # must call classmethod initQueue before 228 | class ProxyVerifyUseful(ProxyVerify): 229 | queue = Queue() 230 | 231 | @classmethod 232 | def initQueue(cls): 233 | proxies = ProxyManager.proxy_manager.getVerifyUsefulProxy() 234 | for proxy in proxies: 235 | cls.queue.put(proxy) 236 | 237 | def checkProxyInfo(self, item): 238 | result = False 239 | if item.get("type") == ProxyManager.PROXY_TYPE["UNKNOWN"] or item.get("type") == None: 240 | result = True 241 | 242 | if item.get("https") == ProxyManager.PROXY_HTTPS["UNKNOWN"] or item.get("https") == None: 243 | result = True 244 | 245 | return result 246 | 247 | def updateUsefulProxy(self, item): 248 | proxy = item.get("proxy") 249 | info = self.getProxyInfo(proxy) 250 | ProxyManager.proxy_manager.updateUsefulProxy(item, info) 251 | 252 | def verify(self): 253 | item = self.queue.get() 254 | proxy = item.get("proxy") 255 | 256 | if ConfigManager.setting_config.setting.get("custom_verify_url"): 257 | verify_result = self.customVerifyProxy(proxy) 258 | else: 259 | verify_result = self.defaultVerifyProxy(proxy) 260 | 261 | if verify_result: 262 | if self.checkProxyInfo(item): 263 | self.updateUsefulProxy(item) 264 | 265 | ProxyManager.proxy_manager.tickUsefulProxyVaildSucc(proxy) 266 | self.stat["succ"] = self.stat["succ"] + 1 267 | log.debug("useful_proxy:{proxy} verify succ".format(proxy=proxy)) 268 | else: 269 | ProxyManager.proxy_manager.tickUsefulProxyVaildFail(proxy) 270 | self.stat["fail"] = self.stat["fail"] + 1 271 | log.debug("useful_proxy:{proxy} verify fail".format(proxy=proxy)) 272 | 273 | self.queue.task_done() 274 | ProxyManager.proxy_manager.tickUsefulProxyVaildTotal(proxy) 275 | ProxyManager.proxy_manager.updateUsefulProxyNextVerifyTime(proxy, self.start_time) 276 | self.stat["total"] = self.stat["total"] + 1 277 | 278 | def start(self): 279 | 280 | start_time = time.time() 281 | self.start_time = int(start_time) 282 | 283 | log.debug("useful_proxy proxy verify start") 284 | 285 | self.stat = dict( 286 | total = 0, 287 | succ = 0, 288 | fail = 0, 289 | ) 290 | 291 | concurrency = ConfigManager.setting_config.setting.get("verify_useful_proxy_concurrency") 292 | task_pool = pool.Pool(concurrency) 293 | 294 | queue_size = self.queue.qsize() 295 | greenlet_list = [] 296 | for _ in range(queue_size): 297 | greenlet_list.append(task_pool.spawn(self.verify)) 298 | 299 | gevent.joinall(greenlet_list) 300 | 301 | end_time = time.time() 302 | elapsed_time = int(end_time - start_time) 303 | log.info('useful_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, elapsed_time:{elapsed_time}s' 304 | .format(total=self.stat["total"], succ=self.stat["succ"], fail=self.stat["fail"], elapsed_time=elapsed_time)) 305 | 306 | if __name__ == "__main__": 307 | ProxyVerifyRaw.initQueue() 308 | t = ProxyVerifyRaw() 309 | t.start() 310 | 311 | ProxyVerifyUseful.initQueue() 312 | t = ProxyVerifyUseful() 313 | t.start() -------------------------------------------------------------------------------- /Src/Manager/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Manager/__init__.py 
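
# ---- Aside: a minimal end-to-end driver (hypothetical sketch, not a repo file) ----
# The managers above share one protocol: seed the class-level queue with
# initQueue(), then call start() to drain it with gevent workers. Assuming the
# repo root as the working directory and a reachable MongoDB matching
# Config/Config.ini.default, a one-shot refresh could look like the following;
# the name refresh_pool_once is invented for illustration.
import sys
sys.path.append("Src")

from Manager.ProxyFetch import ProxyFetch
from Manager.ProxyVerify import ProxyVerifyRaw, ProxyVerifyUseful

def refresh_pool_once():
    # 1. run every fetcher whose next_fetch_time is due and store what it yields
    ProxyFetch.initQueue()
    ProxyFetch().start()
    # 2. verify raw proxies and promote the working ones to useful_proxy
    ProxyVerifyRaw.initQueue()
    ProxyVerifyRaw().start()
    # 3. re-verify proxies already in useful_proxy and update their stats
    ProxyVerifyUseful.initQueue()
    ProxyVerifyUseful().start()

if __name__ == "__main__":
    refresh_pool_once()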
-------------------------------------------------------------------------------- /Src/Notify/NotifyManager.py: -------------------------------------------------------------------------------- 1 | 2 | NOTIFY_LIST = [ 3 | "AFTER_SETTING_CHANGE" 4 | ] 5 | 6 | NOTIFY_HANDLER = {} 7 | 8 | NOTIFY_EVENT = {} 9 | for name in NOTIFY_LIST: 10 | NOTIFY_EVENT[name] = name 11 | NOTIFY_HANDLER[name] = [] 12 | 13 | def register_event(name, handler): 14 | handler_list = NOTIFY_HANDLER[name] 15 | handler_list.append(handler) 16 | 17 | def dispatch_event(name, **kwargs): 18 | if name in NOTIFY_HANDLER: 19 | try: 20 | handler_list = NOTIFY_HANDLER[name] 21 | for handler in handler_list: 22 | handler(**kwargs) 23 | except Exception as e: 24 | # tmp handle 25 | # print("dispatch_notify err: {name}, {e}".format(name=name, e=e)) 26 | pass 27 | -------------------------------------------------------------------------------- /Src/Notify/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Notify/__init__.py -------------------------------------------------------------------------------- /Src/ProxyGetter/CheckProxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | from getFreeProxy import GetFreeProxy 5 | from Util.utilFunction import verifyProxyFormat 6 | 7 | 8 | from Util.LogHandler import LogHandler 9 | 10 | log = LogHandler('check_proxy', file=False) 11 | 12 | 13 | class CheckProxy(object): 14 | 15 | @staticmethod 16 | def checkAllGetProxyFunc(): 17 | """ 18 | 检查getFreeProxy所有代理获取函数运行情况 19 | Returns: 20 | None 21 | """ 22 | import inspect 23 | member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction) 24 | proxy_count_dict = dict() 25 | for func_name, func in member_list: 26 | log.info(u"开始运行 {}".format(func_name)) 27 | try: 28 | proxy_list = [_ for _ in func() if verifyProxyFormat(_)] 29 | proxy_count_dict[func_name] = len(proxy_list) 30 | except Exception as e: 31 | log.info(u"代理获取函数 {} 运行出错!".format(func_name)) 32 | log.error(str(e)) 33 | log.info(u"所有函数运行完毕 " + "***" * 5) 34 | for func_name, func in member_list: 35 | log.info(u"函数 {n}, 获取到代理数: {c}".format(n=func_name, c=proxy_count_dict.get(func_name, 0))) 36 | 37 | @staticmethod 38 | def checkGetProxyFunc(func): 39 | """ 40 | 检查指定的getFreeProxy某个function运行情况 41 | Args: 42 | func: getFreeProxy中某个可调用方法 43 | 44 | Returns: 45 | None 46 | """ 47 | func_name = getattr(func, '__name__', "None") 48 | log.info("start running func: {}".format(func_name)) 49 | count = 0 50 | for proxy in func(): 51 | if verifyProxyFormat(proxy): 52 | log.info("fetch proxy: {}".format(proxy)) 53 | count += 1 54 | log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count)) 55 | 56 | 57 | if __name__ == '__main__': 58 | CheckProxy.checkAllGetProxyFunc() 59 | CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst) 60 | -------------------------------------------------------------------------------- /Src/ProxyGetter/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: 11 | ------------------------------------------------- 
12 | """ -------------------------------------------------------------------------------- /Src/ProxyGetter/getFreeProxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | import re 5 | import sys 6 | import requests 7 | 8 | from Util.WebRequest import WebRequest 9 | from Util.utilFunction import getHtmlTree 10 | from Util.utilFunction import verifyProxyFormat 11 | 12 | # for debug to disable insecureWarning 13 | requests.packages.urllib3.disable_warnings() 14 | 15 | class GetFreeProxy(object): 16 | 17 | def __init__(self): 18 | pass 19 | 20 | @staticmethod 21 | def freeProxyFirst(page=10): 22 | url_list = [ 23 | 'http://www.data5u.com/', 24 | 'http://www.data5u.com/free/gngn/index.shtml', 25 | 'http://www.data5u.com/free/gnpt/index.shtml' 26 | ] 27 | for url in url_list: 28 | html_tree = getHtmlTree(url) 29 | ul_list = html_tree.xpath('//ul[@class="l2"]') 30 | for ul in ul_list: 31 | try: 32 | yield ':'.join(ul.xpath('.//li/text()')[0:2]) 33 | except Exception as e: 34 | print(e) 35 | 36 | @staticmethod 37 | def freeProxySecond(area=33, page=1): 38 | area = 33 if area > 33 else area 39 | for area_index in range(1, area + 1): 40 | for i in range(1, page + 1): 41 | url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i) 42 | html_tree = getHtmlTree(url) 43 | tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]") 44 | if len(tr_list) == 0: 45 | continue 46 | for tr in tr_list: 47 | yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0] 48 | break 49 | 50 | @staticmethod 51 | def freeProxyThird(days=1): 52 | url = 'http://www.ip181.com/' 53 | html_tree = getHtmlTree(url) 54 | try: 55 | tr_list = html_tree.xpath('//tr')[1:] 56 | for tr in tr_list: 57 | yield ':'.join(tr.xpath('./td/text()')[0:2]) 58 | except Exception as e: 59 | pass 60 | 61 | @staticmethod 62 | def freeProxyFourth(page_count=2): 63 | url_list = [ 64 | 'http://www.xicidaili.com/nn/', # 高匿 65 | 'http://www.xicidaili.com/nt/', # 透明 66 | ] 67 | for each_url in url_list: 68 | for i in range(1, page_count + 1): 69 | page_url = each_url + str(i) 70 | tree = getHtmlTree(page_url) 71 | proxy_list = tree.xpath('.//table[@id="ip_list"]//tr[position()>1]') 72 | for proxy in proxy_list: 73 | try: 74 | yield ':'.join(proxy.xpath('./td/text()')[0:2]) 75 | except Exception as e: 76 | pass 77 | 78 | @staticmethod 79 | def freeProxyFifth(): 80 | url = "http://www.goubanjia.com/" 81 | tree = getHtmlTree(url) 82 | proxy_list = tree.xpath('//td[@class="ip"]') 83 | # 此网站有隐藏的数字干扰,或抓取到多余的数字或.符号 84 | # 需要过滤掉
<p style="display: none;">
的内容 85 | xpath_str = """.//*[not(contains(@style, 'display: none')) 86 | and not(contains(@style, 'display:none')) 87 | and not(contains(@class, 'port')) 88 | ]/text() 89 | """ 90 | for each_proxy in proxy_list: 91 | try: 92 | # :符号裸放在td下,其他放在div span p中,先分割找出ip,再找port 93 | ip_addr = ''.join(each_proxy.xpath(xpath_str)) 94 | port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0] 95 | yield '{}:{}'.format(ip_addr, port) 96 | except Exception as e: 97 | pass 98 | 99 | @staticmethod 100 | def freeProxySixth(): 101 | url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10' 102 | request = WebRequest() 103 | try: 104 | res = request.get(url).json() 105 | for row in res['RESULT']['rows']: 106 | yield '{}:{}'.format(row['ip'], row['port']) 107 | except Exception as e: 108 | pass 109 | 110 | @staticmethod 111 | def freeProxySeventh(): 112 | url_list = [ 113 | 'https://www.kuaidaili.com/free/inha/{page}/', 114 | 'https://www.kuaidaili.com/free/intr/{page}/' 115 | ] 116 | for url in url_list: 117 | for page in range(1, 5): 118 | page_url = url.format(page=page) 119 | tree = getHtmlTree(page_url) 120 | proxy_list = tree.xpath('.//table//tr') 121 | for tr in proxy_list[1:]: 122 | yield ':'.join(tr.xpath('./td/text()')[0:2]) 123 | 124 | @staticmethod 125 | def freeProxyEight(): 126 | url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)] # 国内高匿 127 | url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)] # 国内普匿 128 | url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)] # 国内透明 129 | url_list = url_gngao + url_gnpu + url_gntou 130 | 131 | request = WebRequest() 132 | for url in url_list: 133 | r = request.get(url) 134 | proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W].*(\d+)', r.text) 135 | for proxy in proxies: 136 | yield ':'.join(proxy) 137 | 138 | @staticmethod 139 | def freeProxyNinth(): 140 | urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1'] 141 | request = WebRequest() 142 | for url in urls: 143 | r = request.get(url) 144 | proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text) 145 | for proxy in proxies: 146 | yield ':'.join(proxy) 147 | 148 | @staticmethod 149 | def freeProxyTen(): 150 | urls = [ 151 | "http://www.ip3366.net/free/?stype=1", 152 | "http://www.ip3366.net/free/?stype=2", 153 | "http://www.ip3366.net/free/?stype=3", 154 | "http://www.ip3366.net/free/?stype=4", 155 | ] 156 | request = WebRequest() 157 | for url in urls: 158 | r = request.get(url) 159 | proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) 160 | for proxy in proxies: 161 | yield ":".join(proxy) 162 | 163 | @staticmethod 164 | def freeProxyEleven(): 165 | urls = [ 166 | 'http://www.iphai.com/free/ng', 167 | 'http://www.iphai.com/free/np', 168 | 'http://www.iphai.com/free/wg', 169 | 'http://www.iphai.com/free/wp' 170 | ] 171 | request = WebRequest() 172 | for url in urls: 173 | r = request.get(url) 174 | proxies = re.findall(r'\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?[\s\S]*?\s*?(\d+)\s*?', 175 | r.text) 176 | for proxy in proxies: 177 | yield ":".join(proxy) 178 | 179 | @staticmethod 180 | def freeProxyTwelve(page_count=8): 181 | for i in range(1, page_count + 1): 182 | url = 'http://ip.jiangxianli.com/?page={}'.format(i) 183 | html_tree = getHtmlTree(url) 184 | tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr") 185 | if len(tr_list) == 0: 186 | continue 187 | for tr in tr_list: 188 | yield 
tr.xpath("./td[2]/text()")[0] + ":" + tr.xpath("./td[3]/text()")[0] 189 | 190 | @staticmethod 191 | def freeProxyWallFirst(): 192 | urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] 193 | request = WebRequest() 194 | for url in urls: 195 | r = request.get(url) 196 | proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) 197 | for proxy in proxies: 198 | yield ':'.join(proxy) 199 | 200 | @staticmethod 201 | def freeProxyWallSecond(): 202 | urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] 203 | request = WebRequest() 204 | import base64 205 | for url in urls: 206 | r = request.get(url) 207 | proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) 208 | for proxy in proxies: 209 | yield base64.b64decode(proxy).decode() 210 | 211 | @staticmethod 212 | def freeProxyWallThird(): 213 | urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] 214 | request = WebRequest() 215 | for url in urls: 216 | r = request.get(url) 217 | proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) 218 | for proxy in proxies: 219 | yield ':'.join(proxy) 220 | 221 | 222 | if __name__ == '__main__': 223 | pass 224 | -------------------------------------------------------------------------------- /Src/Run/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Run/__init__.py -------------------------------------------------------------------------------- /Src/Run/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from gevent import monkey 4 | monkey.patch_all() 5 | 6 | import sys 7 | sys.path.append("Src") 8 | 9 | import time 10 | import signal 11 | from threading import Thread 12 | 13 | from Log import LogManager 14 | from Web import WebManager 15 | from Forward.ForwardManager import ForwardHttp 16 | from Manager.ProxyFetch import ProxyFetch 17 | 18 | from Schedule.ProxyVerifySchedule import ProxyVerifySchedule 19 | from Schedule.ProxyFetchSchedule import ProxyFetchSchedule 20 | 21 | TASK_LIST = { 22 | "ProxyVerifySchedule": ProxyVerifySchedule, 23 | "ProxyFetchSchedule": ProxyFetchSchedule, 24 | "ForwardHttp": ForwardHttp, 25 | } 26 | 27 | def show_time(): 28 | date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 29 | content = "{newline}{symbol} ProxyPool Start, date:{date} {symbol}{newline}".format(newline="\n", symbol="-"*50, date=date) 30 | print(content) 31 | 32 | def start_fetch(): 33 | ProxyFetch.initQueue() 34 | t = ProxyFetch() 35 | t.start() 36 | 37 | def start_task(): 38 | start_fetch() 39 | 40 | task_list = [] 41 | for name in TASK_LIST.keys(): 42 | task = TASK_LIST[name]() 43 | t = Thread(target=task.run, name=name) 44 | task_list.append(t) 45 | 46 | for t in task_list: 47 | t.daemon = True 48 | t.start() 49 | 50 | def stop_handler(signum, frame): 51 | print('Received Signal [%s], Stop Program' % signum) 52 | sys.exit() 53 | 54 | def register_signal(): 55 | signal.signal(signal.SIGINT, stop_handler) 56 | 57 | 58 | def main(test=False): 59 | show_time() 60 | register_signal() 61 | 62 | LogManager.init() 63 | 64 | start_task() 65 | 66 | WebManager.run() 67 | 68 | if __name__ == '__main__': 69 | main() -------------------------------------------------------------------------------- /Src/Schedule/ProxyCleanSchedule.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | import sys 5 | sys.path.append("Src") 6 | import time 7 | import threading 8 | import datetime 9 | 10 | from Schedule.ProxySchedule import ProxySchedule 11 | from Manager.ProxyClean import ProxyCleanRaw, ProxyCleanUseful 12 | 13 | from Log.LogManager import log 14 | from Config import ConfigManager 15 | 16 | class ProxyCleanSchedule(ProxySchedule): 17 | rightnow = True 18 | 19 | def __init__(self, **kwargs): 20 | super(ProxyCleanSchedule, self).__init__(**kwargs) 21 | self.task_handler_hash = { 22 | "clean_raw_proxy_interval": self.clean_raw_proxy, 23 | "clean_useful_proxy_interval": self.clean_useful_proxy, 24 | } 25 | 26 | def clean_raw_proxy(self): 27 | t = ProxyCleanRaw() 28 | t.daemon = True 29 | t.start() 30 | t.join() 31 | 32 | def clean_useful_proxy(self): 33 | t = ProxyCleanUseful() 34 | t.daemon = True 35 | t.start() 36 | t.join() 37 | 38 | if __name__ == '__main__': 39 | sch = ProxyCleanSchedule() 40 | sch.run() -------------------------------------------------------------------------------- /Src/Schedule/ProxyFetchSchedule.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | from gevent import monkey 5 | monkey.patch_all() 6 | 7 | import sys 8 | sys.path.append("Src") 9 | import time 10 | import threading 11 | import datetime 12 | 13 | from Manager.ProxyFetch import ProxyFetch 14 | from Manager import ProxyManager 15 | from Schedule.ProxySchedule import ProxySchedule 16 | 17 | from Log.LogManager import log 18 | from Config import ConfigManager 19 | 20 | class ProxyFetchSchedule(ProxySchedule): 21 | rightnow = False 22 | 23 | def __init__(self, **kwargs): 24 | super(ProxyFetchSchedule, self).__init__(**kwargs) 25 | self.task_handler_hash = { 26 | "fetch_new_proxy_interval": self.fetchNewProxy, 27 | } 28 | 29 | def checkFetchNewProxy(self): 30 | 31 | total_number = ProxyManager.proxy_manager.getUsefulProxyNumber() 32 | hold_number = ConfigManager.setting_config.setting.get("hold_useful_proxy_number") 33 | if total_number < hold_number or hold_number == -1: 34 | log.debug("fetch new proxy start, exist raw_proxy total_number:{total_number}, hold_number:{hold_number}".format(total_number=total_number, hold_number=hold_number)) 35 | result = True 36 | else: 37 | log.debug("fetch new proxy skip, exist raw_proxy total_number:{total_number}, hold_number:{hold_number}".format(total_number=total_number, hold_number=hold_number)) 38 | result = False 39 | 40 | return result 41 | 42 | def fetchNewProxy(self): 43 | if self.checkFetchNewProxy(): 44 | ProxyFetch.initQueue() 45 | t = ProxyFetch() 46 | t.start() 47 | 48 | if __name__ == '__main__': 49 | sch = ProxyFetchSchedule() 50 | sch.run() 51 | -------------------------------------------------------------------------------- /Src/Schedule/ProxySchedule.py: -------------------------------------------------------------------------------- 1 | from apscheduler.schedulers.blocking import BlockingScheduler 2 | from Config import ConfigManager 3 | from Notify import NotifyManager 4 | from Log.LogManager import log 5 | 6 | import logging 7 | import datetime 8 | 9 | DISPATCH_EVENT_LIST = [ 10 | "fetch_new_proxy_interval", 11 | "verify_raw_proxy_interval", 12 | "verify_useful_proxy_interval", 13 | "clean_raw_proxy_interval", 14 | "clean_useful_proxy_interval", 15 | ] 16 | 17 | SCHEDULE_LOG_PATH = "logs/schedule.log" 
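# Note: logging.FileHandler opens SCHEDULE_LOG_PATH eagerly and raises
# FileNotFoundError when the logs/ directory is missing, so the directory
# must exist before this module is imported. A minimal guard, assuming the
# process is started from the repository root, would be:
#   import os
#   os.makedirs(os.path.dirname(SCHEDULE_LOG_PATH), exist_ok=True)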
18 | 19 | logger = logging.getLogger() 20 | file_handler = logging.FileHandler(SCHEDULE_LOG_PATH) 21 | logger.addHandler(file_handler) 22 | logger.setLevel(logging.INFO) 23 | 24 | class ProxySchedule(BlockingScheduler): 25 | def __init__(self, **kwargs): 26 | super(ProxySchedule, self).__init__(logger=logger, **kwargs) 27 | self.task_handler_hash = {} 28 | 29 | NotifyManager.register_event(NotifyManager.NOTIFY_EVENT["AFTER_SETTING_CHANGE"], self.dispatch_event) 30 | 31 | def dispatch_event(self, **kwargs): 32 | event_name = kwargs.get("event_name") 33 | event_data = kwargs.get("event_data") 34 | 35 | if event_name in DISPATCH_EVENT_LIST: 36 | self.update_job_interval(**event_data) 37 | 38 | 39 | def update_job_interval(self, **kwargs): 40 | job_name = kwargs.get("job_name") 41 | 42 | value = ConfigManager.setting_config.setting.get(job_name) 43 | trigger_args = { "minutes": value } 44 | trigger='interval' 45 | job = self._update_job(job_name, trigger, **trigger_args) 46 | log.info("update_job_interval: {job_name}, {job}".format(job_name=job_name, job=job)) 47 | return job 48 | 49 | def _update_job(self, job_name, trigger, **trigger_args): 50 | return self.reschedule_job(job_name, trigger=trigger, **trigger_args) 51 | 52 | def run(self): 53 | now = datetime.datetime.now() 54 | for name, handler in self.task_handler_hash.items(): 55 | value = ConfigManager.setting_config.setting.get(name) 56 | if self.rightnow: 57 | next_run_time=now 58 | else: 59 | next_run_time=None 60 | 61 | self.add_job(handler, "interval", id=name, minutes=value, next_run_time=next_run_time) 62 | 63 | self.start() -------------------------------------------------------------------------------- /Src/Schedule/ProxyVerifySchedule.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | from gevent import monkey 4 | monkey.patch_all() 5 | 6 | import sys 7 | sys.path.append("Src") 8 | import time 9 | 10 | from Manager.ProxyVerify import ProxyVerifyRaw, ProxyVerifyUseful 11 | from Schedule.ProxySchedule import ProxySchedule 12 | 13 | from Log.LogManager import log 14 | from Config import ConfigManager 15 | 16 | class ProxyVerifySchedule(ProxySchedule): 17 | rightnow = True 18 | 19 | def __init__(self, **kwargs): 20 | super(ProxyVerifySchedule, self).__init__(**kwargs) 21 | self.task_handler_hash = { 22 | "verify_useful_proxy_interval": self.verifyUsefulProxy, 23 | } 24 | 25 | def verifyUsefulProxy(self): 26 | ProxyVerifyUseful.initQueue() 27 | t = ProxyVerifyUseful() 28 | t.start() 29 | 30 | if __name__ == '__main__': 31 | sch = ProxyVerifySchedule() 32 | sch.run() 33 | -------------------------------------------------------------------------------- /Src/Schedule/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /Src/Util/EnvUtil.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | 5 | PY3 = sys.version_info >= (3,) -------------------------------------------------------------------------------- /Src/Util/GetConfig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | import os 5 | from Util.utilClass import ConfigParse 6 | from Util.utilClass import LazyProperty 7 | 8 | 9 | class GetConfig(object): 10 | """ 11 | to 
get config from config.ini 12 | """ 13 | 14 | def __init__(self): 15 | self.pwd = os.path.split(os.path.realpath(__file__))[0] 16 | config_dir = os.path.split(self.pwd)[0] 17 | self.config_path = os.path.join(config_dir, 'Config.ini') 18 | if not os.path.isfile(self.config_path): 19 | self.config_path = os.path.join(config_dir, 'Config.ini.default') 20 | 21 | self.config_file = ConfigParse() 22 | self.config_file.read(self.config_path) 23 | 24 | @LazyProperty 25 | def db_type(self): 26 | return self.config_file.get('DB', 'type') 27 | 28 | @LazyProperty 29 | def db_name(self): 30 | return self.config_file.get('DB', 'name') 31 | 32 | @LazyProperty 33 | def db_host(self): 34 | return self.config_file.get('DB', 'host') 35 | 36 | @LazyProperty 37 | def db_port(self): 38 | return int(self.config_file.get('DB', 'port')) 39 | 40 | @LazyProperty 41 | def db_password(self): 42 | try: 43 | password = self.config_file.get('DB', 'password') 44 | except Exception: 45 | password = None 46 | return password 47 | 48 | @LazyProperty 49 | def db_username(self): 50 | try: 51 | username = self.config_file.get('DB', 'username') 52 | except Exception: 53 | username = None 54 | return username 55 | 56 | @LazyProperty 57 | def log_level(self): 58 | try: 59 | log_level = self.config_file.get('LOG', 'level') 60 | except Exception: 61 | log_level = None 62 | return log_level 63 | 64 | @LazyProperty 65 | def proxy_getter_functions(self): 66 | return self.config_file.options('ProxyGetter') 67 | 68 | @LazyProperty 69 | def host_ip(self): 70 | return self.config_file.get('API','ip') 71 | 72 | @LazyProperty 73 | def host_port(self): 74 | return int(self.config_file.get('API', 'port')) 75 | 76 | @LazyProperty 77 | def processes(self): 78 | return int(self.config_file.get('API', 'processes')) 79 | 80 | config = GetConfig() 81 | 82 | if __name__ == '__main__': 83 | gg = GetConfig() 84 | print(gg.db_type) 85 | print(gg.db_name) 86 | print(gg.db_host) 87 | print(gg.db_port) 88 | print(gg.proxy_getter_functions) 89 | print(gg.host_ip) 90 | print(gg.host_port) 91 | print(gg.processes) 92 | -------------------------------------------------------------------------------- /Src/Util/WebRequest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from requests.models import Response 4 | import requests 5 | import random 6 | import time 7 | 8 | 9 | class WebRequest(object): 10 | def __init__(self, *args, **kwargs): 11 | pass 12 | 13 | @property 14 | def user_agent(self): 15 | """ 16 | return an User-Agent at random 17 | :return: 18 | """ 19 | ua_list = [ 20 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101', 21 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122', 22 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71', 23 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95', 24 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71', 25 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', 26 | 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 27 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 28 | ] 29 | return random.choice(ua_list) 30 | 31 | @property 32 | def header(self): 33 | """ 34 | basic header 35 | :return: 36 | """ 37 | 
return {'User-Agent': self.user_agent, 38 | 'Accept': '*/*', 39 | 'Connection': 'keep-alive', 40 | 'Accept-Language': 'zh-CN,zh;q=0.8'} 41 | 42 | def get(self, url, header=None, retry_time=1, timeout=10, 43 | *args, **kwargs): 44 | 45 | headers = self.header 46 | if header and isinstance(header, dict): 47 | headers.update(header) 48 | 49 | try: 50 | resp = requests.get(url, headers=headers, timeout=timeout, **kwargs) 51 | except Exception as e: 52 | # print("request url error", url, e) 53 | resp = Response() 54 | resp.status_code = 504 55 | 56 | return resp -------------------------------------------------------------------------------- /Src/Util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /Src/Util/utilClass.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | class LazyProperty(object): 5 | """ 6 | LazyProperty 7 | explain: http://www.spiderpy.cn/blog/5/ 8 | """ 9 | 10 | def __init__(self, func): 11 | self.func = func 12 | 13 | def __get__(self, instance, owner): 14 | if instance is None: 15 | return self 16 | else: 17 | value = self.func(instance) 18 | setattr(instance, self.func.__name__, value) 19 | return value 20 | 21 | 22 | try: 23 | from configparser import ConfigParser # py3 24 | except: 25 | from ConfigParser import ConfigParser # py2 26 | 27 | 28 | class ConfigParse(ConfigParser): 29 | """ 30 | rewrite ConfigParser, for support upper option 31 | """ 32 | 33 | def __init__(self): 34 | ConfigParser.__init__(self) 35 | 36 | def optionxform(self, optionstr): 37 | return optionstr 38 | 39 | 40 | class Singleton(type): 41 | """ 42 | Singleton Metaclass 43 | """ 44 | 45 | _inst = {} 46 | 47 | def __call__(cls, *args, **kwargs): 48 | if cls not in cls._inst: 49 | cls._inst[cls] = super(Singleton, cls).__call__(*args) 50 | return cls._inst[cls] 51 | -------------------------------------------------------------------------------- /Src/Util/utilFunction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: utilFunction.py 6 | Description : tool function 7 | Author : JHao 8 | date: 2016/11/25 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree 12 | ------------------------------------------------- 13 | """ 14 | import requests 15 | import time 16 | import re 17 | from lxml import etree 18 | 19 | from Util.WebRequest import WebRequest 20 | 21 | # noinspection PyPep8Naming 22 | def robustCrawl(func): 23 | def decorate(*args, **kwargs): 24 | try: 25 | return func(*args, **kwargs) 26 | except Exception as e: 27 | pass 28 | # logger.info(u"sorry, 抓取出错。错误原因:") 29 | # logger.info(e) 30 | 31 | return decorate 32 | 33 | 34 | # noinspection PyPep8Naming 35 | def verifyProxyFormat(proxy): 36 | """ 37 | 检查代理格式 38 | :param proxy: 39 | :return: 40 | """ 41 | import re 42 | verify_regex = 
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" 43 | _proxy = re.findall(verify_regex, proxy) 44 | return True if len(_proxy) == 1 and _proxy[0] == proxy else False 45 | 46 | 47 | # noinspection PyPep8Naming 48 | def getHtmlTree(url, **kwargs): 49 | """ 50 | 获取html树 51 | :param url: 52 | :param kwargs: 53 | :return: 54 | """ 55 | 56 | header = { 57 | 'Connection': 'keep-alive', 58 | 'Cache-Control': 'max-age=0', 59 | 'Upgrade-Insecure-Requests': '1', 60 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)', 61 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 62 | 'Accept-Encoding': 'gzip, deflate, sdch', 63 | 'Accept-Language': 'zh-CN,zh;q=0.8', 64 | } 65 | # TODO 取代理服务器用代理服务器访问 66 | wr = WebRequest() 67 | 68 | # delay 2s for per request 69 | # time.sleep(2) 70 | 71 | html = wr.get(url=url, header=header).content 72 | try: 73 | result = etree.HTML(html) 74 | except Exception as e: 75 | # print("getHtmlTree error: ", url, e) 76 | result = etree.HTML("") 77 | 78 | return result 79 | 80 | 81 | def tcpConnect(proxy): 82 | """ 83 | TCP 三次握手 84 | :param proxy: 85 | :return: 86 | """ 87 | from socket import socket, AF_INET, SOCK_STREAM 88 | s = socket(AF_INET, SOCK_STREAM) 89 | ip, port = proxy.split(':') 90 | result = s.connect_ex((ip, int(port))) 91 | return True if result == 0 else False 92 | 93 | 94 | # TODO: 逻辑应该有问题, 但不确定 95 | # http是可用的才会保存https, 会不会有只开通https的代理呢? 96 | def validUsefulProxy(proxy): 97 | """ 98 | 检验代理是否可用 99 | :param proxy: 100 | :return: 101 | """ 102 | if isinstance(proxy, bytes): 103 | proxy = proxy.decode('utf8') 104 | proxies = { 105 | "http": proxy, 106 | "https": proxy, 107 | } 108 | http_url = "http://httpbin.org/ip" 109 | https_url = "https://httpbin.org/ip" 110 | 111 | http_result = False 112 | https_result = False 113 | 114 | # http valid 115 | try: 116 | r = requests.get(http_url, proxies=proxies, timeout=10, verify=False) 117 | 118 | content = r.content 119 | if isinstance(content, bytes): 120 | content = content.decode('utf8') 121 | 122 | status_result = r.status_code == 200 123 | content_result = re.search("\"origin\"", content) != None 124 | if status_result and content_result: 125 | http_result = True 126 | 127 | except Exception as e: 128 | # print(str(e)) 129 | http_result = False 130 | 131 | if http_result: 132 | 133 | # https vaild 134 | try: 135 | r = requests.get(https_url, proxies=proxies, timeout=10, verify=False) 136 | 137 | content = r.content 138 | if isinstance(content, bytes): 139 | content = content.decode('utf8') 140 | 141 | status_right = r.status_code == 200 142 | content_right = re.search("\"origin\"", content) != None 143 | if status_right and content_right: 144 | https_result = True 145 | 146 | except Exception as e: 147 | # print(str(e)) 148 | https_result = False 149 | 150 | return (http_result, https_result) -------------------------------------------------------------------------------- /Src/Version/VersionManger.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("Src") 3 | 4 | from pymongo import MongoClient 5 | from Config import ConfigManager 6 | 7 | import importlib 8 | 9 | mc = MongoClient(ConfigManager.base_config.setting.get("db_host"), ConfigManager.base_config.setting.get("db_port")) 10 | 11 | VERSION_FILE_PATH = "version" 12 | version_list = [] 13 | 14 | def init(): 15 | with open(VERSION_FILE_PATH) as f: 16 | items = f.readlines() 17 | for item in items: 18 | 
version_list.append(item.strip()) 19 | 20 | def get_last_version(): 21 | result = version_list[0] 22 | return result 23 | 24 | def update_version(cur_version): 25 | index = version_list.index(cur_version) 26 | while index: 27 | index = index - 1 28 | next_version = version_list[index] 29 | version_name = next_version.replace(".", "_") 30 | last_version = get_last_version() 31 | 32 | module_name = "version.version_{version_name}".format(version_name=version_name) 33 | 34 | try: 35 | module = importlib.import_module(module_name) 36 | result = module.run(mc, last_version, next_version, cur_version) 37 | except Exception: 38 | result = False 39 | 40 | query = {"setting_name": "version"} 41 | data = { 42 | "$set": { 43 | "setting_value": next_version 44 | } 45 | } 46 | mc.proxy.setting.update(query, data) 47 | 48 | cur_version = next_version 49 | 50 | def run(): 51 | item = mc.proxy.setting.find_one({"setting_name": "version"}) 52 | if item: 53 | cur_version = item["setting_value"] 54 | update_version(cur_version) 55 | else: 56 | last_version = get_last_version() 57 | data = { 58 | "setting_name": "version", 59 | "setting_value": last_version, 60 | "setting_value": True, 61 | } 62 | mc.proxy.setting.insert(data) 63 | 64 | 65 | mc.close() 66 | 67 | if __name__ == '__main__': 68 | init() 69 | run() -------------------------------------------------------------------------------- /Src/Version/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Version/__init__.py -------------------------------------------------------------------------------- /Src/Version/version/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Version/version/__init__.py -------------------------------------------------------------------------------- /Src/Version/version/version_1_0_0.py: -------------------------------------------------------------------------------- 1 | 2 | # just example 3 | def run(mc, last_version, update_version, cur_version): 4 | print("nothing to do", last_version, update_version, cur_version, __file__) 5 | 6 | if __name__ == '__main__': 7 | pass -------------------------------------------------------------------------------- /Src/Web/WebManager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | 4 | # base import 5 | from gevent import monkey 6 | monkey.patch_all() 7 | 8 | import time 9 | import math 10 | import os 11 | import sys 12 | sys.path.append("Src/") 13 | 14 | import logging 15 | from flask import Flask 16 | from gevent.pywsgi import WSGIServer 17 | from Config import ConfigManager 18 | 19 | ACCESS_LOG_PATH = "logs/app_access.log" 20 | 21 | app = Flask(__name__) 22 | app.config['JSON_AS_ASCII'] = False 23 | app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True 24 | 25 | 26 | logger = logging.getLogger() 27 | def init_log(): 28 | file_handler = logging.FileHandler(ACCESS_LOG_PATH) 29 | logger.addHandler(file_handler) 30 | logger.setLevel(logging.INFO) 31 | 32 | return logger 33 | 34 | def init_config(): 35 | app.config.from_pyfile('config.py') 36 | app.config["MONGODB_SETTINGS"] = { 37 | 'db': ConfigManager.base_config.setting.get("db_name"), 38 | 'host': ConfigManager.base_config.setting.get("db_host"), 39 | 'port': 
ConfigManager.base_config.setting.get("db_port"), 40 | 'username': ConfigManager.base_config.setting.get("db_user"), 41 | 'password': ConfigManager.base_config.setting.get("db_pass"), 42 | } 43 | 44 | def init_app(): 45 | init_config() 46 | init_log() 47 | 48 | def start_app(): 49 | from Web.admin import admin 50 | from Web.api import api 51 | 52 | admin.init_app(app) 53 | api.init_app(app) 54 | 55 | http_server = WSGIServer((ConfigManager.base_config.setting.get("web_bind_host"), ConfigManager.base_config.setting.get("web_bind_port")), app, log=logger, error_log=logger) 56 | http_server.serve_forever() 57 | 58 | def run(): 59 | init_app() 60 | start_app() 61 | 62 | 63 | if __name__ == '__main__': 64 | run() -------------------------------------------------------------------------------- /Src/Web/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Web/__init__.py -------------------------------------------------------------------------------- /Src/Web/admin/__init__.py: -------------------------------------------------------------------------------- 1 | from . import admin -------------------------------------------------------------------------------- /Src/Web/admin/admin.py: -------------------------------------------------------------------------------- 1 | from .views import ProxyView, SettingView, FetcherView, ProxyPoolAdminIndexView 2 | from .model import ProxyModel, SettingModel, FetcherModel 3 | 4 | from flask_mongoengine import MongoEngine 5 | 6 | from flask_security import Security, MongoEngineUserDatastore, \ 7 | UserMixin, RoleMixin, login_required, current_user, forms 8 | 9 | from flask_mongoengine import MongoEngine 10 | 11 | from .model import User, Role 12 | from .forms import LoginForm 13 | 14 | from flask_security.utils import hash_password 15 | from flask import url_for 16 | 17 | from flask_admin import AdminIndexView, helpers, expose 18 | 19 | import flask_admin 20 | 21 | def init_base_data(*args): 22 | create_roles(*args) 23 | create_admin_user(*args) 24 | 25 | def create_roles(user_datastore, app): 26 | with app.app_context(): 27 | if not user_datastore.find_role(role='user'): 28 | user_datastore.create_role(name='user') 29 | if not user_datastore.find_role(role='superuser'): 30 | user_datastore.create_role(name='superuser') 31 | 32 | def create_admin_user(user_datastore, app): 33 | with app.app_context(): 34 | if user_datastore.get_user("admin"): 35 | pass 36 | else: 37 | user_role = user_datastore.find_role(role='user') 38 | super_user_role = user_datastore.find_role(role='superuser') 39 | user_datastore.create_user(name='admin', email='admin', password=hash_password('admin'), roles=[user_role, super_user_role]) 40 | 41 | def init_security(user_datastore, app, admin): 42 | security = Security(app, user_datastore, login_form=LoginForm) 43 | 44 | @security.context_processor 45 | def security_context_processor(): 46 | return dict( 47 | admin_base_template=admin.base_template, 48 | admin_view=admin.index_view, 49 | h=helpers, 50 | get_url=url_for 51 | ) 52 | 53 | def init_app(app): 54 | admin = flask_admin.Admin(app=app, name='ProxyPool Admin', base_template="admin/master_base.html", index_view=ProxyPoolAdminIndexView(), template_mode='bootstrap3') 55 | admin.add_view(ProxyView(ProxyModel)) 56 | admin.add_view(SettingView(SettingModel)) 57 | # admin.add_view(ProxyPoolView(ProxyPoolModel)) 58 | 
admin.add_view(FetcherView(FetcherModel)) 59 | 60 | db = MongoEngine() 61 | db.init_app(app) 62 | 63 | user_datastore = MongoEngineUserDatastore(db, User, Role) 64 | init_security(user_datastore, app, admin) 65 | 66 | init_base_data(user_datastore, app) 67 | 68 | -------------------------------------------------------------------------------- /Src/Web/admin/forms.py: -------------------------------------------------------------------------------- 1 | from flask_security import Security, MongoEngineUserDatastore, \ 2 | UserMixin, RoleMixin, login_required, current_user, forms 3 | 4 | from wtforms import fields, validators 5 | 6 | class LoginForm(forms.LoginForm): 7 | name = fields.StringField() 8 | email = fields.StringField(label="Name or Email", validators=[validators.required()]) 9 | -------------------------------------------------------------------------------- /Src/Web/admin/model.py: -------------------------------------------------------------------------------- 1 | import mongoengine 2 | from flask_mongoengine import Document 3 | from flask_security import UserMixin, RoleMixin 4 | 5 | class ProxyModel(Document): 6 | meta = {'collection': 'useful_proxy'} 7 | 8 | proxy = mongoengine.StringField(required=True, max_length=40) 9 | last_status = mongoengine.IntField(default=0) 10 | last_succ_time = mongoengine.IntField(default=0) 11 | next_verify_time = mongoengine.IntField(default=0) 12 | succ = mongoengine.IntField(default=0) 13 | fail = mongoengine.IntField(default=0) 14 | total = mongoengine.IntField(default=0) 15 | keep_succ = mongoengine.IntField(default=0) 16 | quality = mongoengine.IntField(default=0) 17 | type = mongoengine.IntField(default=0) 18 | https = mongoengine.BooleanField(default=False) 19 | region_list = mongoengine.ListField(mongoengine.StringField(max_length=20)) 20 | 21 | def __unicode__(self): 22 | return self.name 23 | 24 | class SettingModel(Document): 25 | meta = {'collection': 'setting'} 26 | 27 | setting_name = mongoengine.StringField(required=True, unique=True, max_length=40) 28 | setting_value = mongoengine.StringField(required=True, max_length=40) 29 | setting_state = mongoengine.BooleanField(default=True) 30 | 31 | class FetcherModel(Document): 32 | meta = {'collection': 'fetchers'} 33 | 34 | name = mongoengine.StringField(required=True, unique=True, max_length=40) 35 | host = mongoengine.StringField(required=True, unique=True, max_length=40) 36 | total = mongoengine.IntField(default=0) 37 | succ = mongoengine.IntField(default=0) 38 | fail = mongoengine.IntField(default=0) 39 | skip = mongoengine.IntField(default=0) 40 | interval = mongoengine.IntField(default=0) 41 | next_fetch_time = mongoengine.IntField(default=0) 42 | # fetch_time = mongoengine.DateTimeField() 43 | status = mongoengine.BooleanField(default=True) 44 | # fetch_desc = mongoengine.StringField(max_length=40) 45 | 46 | 47 | class ProxyPoolModel(Document): 48 | meta = {'collection': 'proxy_pool'} 49 | 50 | token = mongoengine.StringField(required=True, max_length=40) 51 | filter_name = mongoengine.StringField(required=True, max_length=40) 52 | verifier_name = mongoengine.StringField(required=True, max_length=40) 53 | 54 | 55 | class Role(Document, RoleMixin): 56 | name = mongoengine.StringField(max_length=80, unique=True) 57 | description = mongoengine.StringField(max_length=255) 58 | 59 | class User(Document, UserMixin): 60 | email = mongoengine.StringField(max_length=255) 61 | name = mongoengine.StringField(max_length=255) 62 | password = mongoengine.StringField(max_length=255) 63 | active 
= mongoengine.BooleanField(default=True) 64 | confirmed_at = mongoengine.DateTimeField() 65 | roles = mongoengine.ListField(mongoengine.ReferenceField(Role), default=[]) -------------------------------------------------------------------------------- /Src/Web/admin/views.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | from flask import request 4 | from flask_security import current_user 5 | 6 | import flask_admin 7 | from flask import Flask, jsonify, url_for, redirect, render_template, request 8 | from flask_admin.contrib.mongoengine import ModelView 9 | from flask_admin import expose 10 | 11 | from Notify.NotifyManager import dispatch_event, NOTIFY_EVENT 12 | 13 | # project import 14 | from Config import ConfigManager 15 | from Manager.ProxyManager import proxy_manager 16 | 17 | CUSTOM_COLUMN_FORMAT = { 18 | "type" : [ 19 | "未知", 20 | "透明", 21 | "匿名", 22 | ], 23 | "https" : [ 24 | "未知", 25 | "开启", 26 | "关闭", 27 | ], 28 | "last_status": [ 29 | "未知", 30 | "成功", 31 | "失败" 32 | ] 33 | } 34 | 35 | def ElapseTimeFormat(all_time): 36 | day = 24*60*60 37 | hour = 60*60 38 | min = 60 39 | if all_time <60: 40 | return "%d sec"%math.ceil(all_time) 41 | elif all_time > day: 42 | days = divmod(all_time,day) 43 | return "%d days"%(int(days[0])) 44 | elif all_time > hour: 45 | hours = divmod(all_time,hour) 46 | return '%d hours'%(int(hours[0])) 47 | else: 48 | mins = divmod(all_time,min) 49 | return "%d mins"%(int(mins[0])) 50 | 51 | def LastSuccTimeFormat(last_time): 52 | if last_time: 53 | result = ElapseTimeFormat(int(time.time() - last_time)) 54 | else: 55 | result = 0 56 | 57 | return result 58 | 59 | def TimeStampFormat(timeStamp): 60 | time_object = time.localtime(timeStamp) 61 | result = time.strftime("%m-%d %H:%M", time_object) 62 | return result 63 | 64 | def PercentFormat(cur, total): 65 | if total == 0: 66 | percent = 0 67 | else: 68 | percent = float(cur / total * 100) 69 | result = "%d(%.2f%%)" % (cur, percent) 70 | 71 | return result 72 | 73 | class ProxyView(ModelView): 74 | name = "ProxyPool" 75 | 76 | column_list = ("proxy", "succ", "total", "keep_succ", "quality", "type", "https", 77 | "last_status", "last_succ_time", "next_verify_time", "region_list") 78 | can_create = False 79 | column_default_sort = ("quality", True) 80 | column_formatters = dict( 81 | type=lambda v, c, m, p: CUSTOM_COLUMN_FORMAT[p][m.type], 82 | https=lambda v, c, m, p: CUSTOM_COLUMN_FORMAT[p][m.https], 83 | last_status=lambda v, c, m, p: CUSTOM_COLUMN_FORMAT[p][m.last_status], 84 | last_succ_time=lambda v, c, m, p: LastSuccTimeFormat(m.last_succ_time), 85 | next_verify_time=lambda v, c, m, p: TimeStampFormat(m.next_verify_time), 86 | succ=lambda v, c, m, p: PercentFormat(m.succ, m.total), 87 | ) 88 | 89 | def is_accessible(self): 90 | if not current_user.is_active or not current_user.is_authenticated: 91 | return False 92 | 93 | if current_user.has_role('superuser'): 94 | return True 95 | 96 | return False 97 | 98 | def _handle_view(self, name, **kwargs): 99 | if current_user.is_authenticated: 100 | pass 101 | else: 102 | return redirect(url_for('security.login', next=request.url)) 103 | 104 | class SettingView(ModelView): 105 | name="Setting" 106 | 107 | can_create = False 108 | can_delete = False 109 | can_view_details = True 110 | column_searchable_list = ['setting_name'] 111 | column_editable_list = [ "setting_value", "setting_state"] 112 | 113 | def is_accessible(self): 114 | result = None 115 | if not current_user.is_active or not 
current_user.is_authenticated: 116 | result = False 117 | 118 | if current_user.has_role('superuser'): 119 | result = True 120 | 121 | return result 122 | 123 | def _handle_view(self, name, **kwargs): 124 | if current_user.is_authenticated: 125 | pass 126 | else: 127 | return redirect(url_for('security.login', next=request.url)) 128 | 129 | def after_model_change(self, form, model, is_created): 130 | kwargs = dict( 131 | event_name=model.setting_name, 132 | event_data=dict( 133 | job_id=model.setting_name 134 | ) 135 | ) 136 | dispatch_event(NOTIFY_EVENT["AFTER_SETTING_CHANGE"], **kwargs) 137 | 138 | class FetcherView(ModelView): 139 | name="Fethers" 140 | 141 | column_list = ("name", "host", "succ", "fail", "skip", "total", "status", "interval", "next_fetch_time") 142 | can_create = False 143 | can_delete = False 144 | can_view_details = True 145 | column_default_sort = ("succ", True) 146 | column_searchable_list = ['name'] 147 | column_editable_list = [ "status", "interval"] 148 | column_formatters = dict( 149 | succ=lambda v, c, m, p: PercentFormat(m.succ, m.total), 150 | fail=lambda v, c, m, p: PercentFormat(m.fail, m.total), 151 | skip=lambda v, c, m, p: PercentFormat(m.skip, m.total), 152 | next_fetch_time=lambda v, c, m, p: TimeStampFormat(m.next_fetch_time), 153 | ) 154 | 155 | def is_accessible(self): 156 | result = None 157 | if not current_user.is_active or not current_user.is_authenticated: 158 | result = False 159 | 160 | if current_user.has_role('superuser'): 161 | result = True 162 | 163 | return result 164 | 165 | def _handle_view(self, name, **kwargs): 166 | if current_user.is_authenticated: 167 | pass 168 | else: 169 | return redirect(url_for('security.login', next=request.url)) 170 | 171 | class ProxyPoolAdminIndexView(flask_admin.AdminIndexView): 172 | 173 | @expose() 174 | def index(self): 175 | if not current_user.is_authenticated: 176 | return redirect(url_for('security.login')) 177 | return super(ProxyPoolAdminIndexView, self).index() 178 | -------------------------------------------------------------------------------- /Src/Web/api/__init__.py: -------------------------------------------------------------------------------- 1 | from . import api -------------------------------------------------------------------------------- /Src/Web/api/api.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, jsonify, url_for, redirect, render_template, request 2 | from flask_restful import reqparse, abort, Api, Resource 3 | 4 | from Manager.ProxyManager import proxy_manager 5 | from Log.LogManager import log 6 | 7 | API_LIST = { 8 | "/api/v1/proxy/": { 9 | "args": { 10 | "token": { 11 | "value": "random string + random number", 12 | "desc": "Avoid Get Repetition Proxy", 13 | "required": False, 14 | }, 15 | "https": { 16 | "value": [1], 17 | "desc": "need https proxy? 1 == true", 18 | "required": False, 19 | }, 20 | "region": { 21 | "value": "region name like 中国 or 广州 or 江苏", 22 | "desc": "Get Region Proxy", 23 | "required": False, 24 | }, 25 | "type": { 26 | "value": [1,2], 27 | "desc": "clear proxy 1 or (common) anonymous 2", 28 | "required": False, 29 | } 30 | }, 31 | "desc": "Get A Random Proxy" 32 | }, 33 | "/api/v1/proxies/": { 34 | "args": { 35 | "https": { 36 | "value": [1], 37 | "desc": "need https proxy? 
1 == true", 38 | "required": False, 39 | }, 40 | "region": { 41 | "value": "region name like 中国 or 广州 or 江苏", 42 | "desc": "Get Region Proxy", 43 | "required": False, 44 | }, 45 | "type": { 46 | "value": [1,2], 47 | "desc": "clear proxy 1 or (common) anonymous 2", 48 | "required": False, 49 | } 50 | }, 51 | "desc": "Get All Proxy", 52 | } 53 | } 54 | 55 | class ApiList(Resource): 56 | def get(self): 57 | result = jsonify(API_LIST) 58 | 59 | return result 60 | 61 | class Proxy(Resource): 62 | def __init__(self, **kwargs): 63 | super(Proxy, self).__init__(**kwargs) 64 | 65 | parser = reqparse.RequestParser() 66 | parser.add_argument('https', type=int, choices=[1], location='args') 67 | parser.add_argument('type', type=int, choices=[1,2], location='args') 68 | parser.add_argument('region', type=str, location='args') 69 | parser.add_argument('token', type=str, location='args') 70 | self.args = parser.parse_args() 71 | 72 | def get(self): 73 | result = { 74 | "data": {} 75 | } 76 | 77 | options = { 78 | "https": self.args.get('https'), 79 | "type": self.args.get('type'), 80 | "region": self.args.get('region'), 81 | } 82 | log.debug("receive params: {}".format(options)) 83 | 84 | item = proxy_manager.getSampleUsefulProxy(**options) 85 | if item: 86 | del item["_id"] 87 | 88 | result["data"] = item 89 | 90 | return result 91 | 92 | 93 | class Proxies(Resource): 94 | def __init__(self, **kwargs): 95 | super(Proxies, self).__init__(**kwargs) 96 | 97 | parser = reqparse.RequestParser() 98 | parser.add_argument('https', type=int, choices=[1], location='args') 99 | parser.add_argument('type', type=int, choices=[1,2], location='args') 100 | parser.add_argument('region', type=str, location='args') 101 | self.args = parser.parse_args() 102 | 103 | def get(self): 104 | result = { 105 | "data": [] 106 | } 107 | 108 | options = { 109 | "https": bool(self.args.get('https')), 110 | "type": self.args.get('type'), 111 | "region": self.args.get('region'), 112 | } 113 | 114 | data = proxy_manager.getAllUsefulProxy(**options) 115 | 116 | for item in data: 117 | del item["_id"] 118 | 119 | result["data"] = data 120 | 121 | return result 122 | 123 | def init_api(app): 124 | @app.errorhandler(404) 125 | def miss(e): 126 | data = [ 127 | {"result": "not found"}, 128 | {"status_code": 404}, 129 | {"github": "https://github.com/1again/ProxyPool"}, 130 | {"api_list": API_LIST}, 131 | ] 132 | result = jsonify(data) 133 | return result, 404 134 | 135 | 136 | def init_app(app): 137 | app.config.update(RESTFUL_JSON=dict(ensure_ascii=False)) 138 | init_api(app) 139 | 140 | api = Api(app) 141 | api.add_resource(Proxies, '/api/v1/proxies/') 142 | api.add_resource(Proxy, '/api/v1/proxy/') 143 | api.add_resource(ApiList, '/api/v1/') 144 | -------------------------------------------------------------------------------- /Src/Web/config.py: -------------------------------------------------------------------------------- 1 | # Create dummy secrey key so we can use sessions 2 | SECRET_KEY = '1234567890' 3 | 4 | # Flask-Security config 5 | SECURITY_URL_PREFIX = "/admin" 6 | SECURITY_PASSWORD_HASH = "pbkdf2_sha256" 7 | SECURITY_PASSWORD_SALT = "ATGUOHAELKiubahiughaerGOJAEGj" 8 | 9 | SECURITY_USER_IDENTITY_ATTRIBUTES = ["name"] 10 | 11 | # Flask-Security URLs, overridden because they don't put a / at the end 12 | SECURITY_LOGIN_URL = "/login/" 13 | SECURITY_LOGOUT_URL = "/logout/" 14 | SECURITY_REGISTER_URL = "/register/" 15 | 16 | SECURITY_POST_LOGIN_VIEW = "/admin/" 17 | SECURITY_POST_LOGOUT_VIEW = "/admin/" 18 | 
SECURITY_POST_REGISTER_VIEW = "/admin/" 19 | 20 | # Flask-Security features 21 | SECURITY_REGISTERABLE = True 22 | SECURITY_SEND_REGISTER_EMAIL = False 23 | SQLALCHEMY_TRACK_MODIFICATIONS = False -------------------------------------------------------------------------------- /Src/Web/templates/admin/index.html: -------------------------------------------------------------------------------- 1 | {% extends 'admin/master.html' %} 2 | {% block body %} 3 | {{ super() }} 4 |
<div class="container">
5 |   <div class="row">
6 |     <div class="panel panel-default">
7 |       <div class="panel-heading">ProxyPool-Admin</div>
8 |       <div class="panel-body">
9 |         Nothing to tell you, Have a good day!
10 |       </div>
11 |     </div>
12 |   </div>
13 | </div>
14 | {% endblock body %} 15 | -------------------------------------------------------------------------------- /Src/Web/templates/admin/master_base.html: -------------------------------------------------------------------------------- 1 | {% extends 'admin/base.html' %} 2 | 3 | {% block access_control %} 4 | {% if current_user.is_authenticated %} 5 | 17 | {% endif %} 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /Src/Web/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | <a href="/admin/">Go to admin!</a> 5 |
6 | 7 | 8 | -------------------------------------------------------------------------------- /Src/Web/templates/security/_macros.html: -------------------------------------------------------------------------------- 1 | {% macro render_field_with_errors(field) %} 2 | 3 |
4 | {{ field.label }} {{ field(class_='form-control', **kwargs)|safe }} 5 | {% if field.errors %} 6 | <ul> 7 | {% for error in field.errors %} 8 | <li>{{ error }}</li> 9 | {% endfor %} 10 | </ul> 11 | {% endif %} 12 |
13 | {% endmacro %} 14 | 15 | {% macro render_field(field) %} 16 |
<p>{{ field(class_='form-control', **kwargs)|safe }}</p>
17 | {% endmacro %} 18 | 19 | {% macro render_checkbox_field(field) -%} 20 |
21 |
22 | 25 |
26 |
27 | {%- endmacro %} -------------------------------------------------------------------------------- /Src/Web/templates/security/_menu.html: -------------------------------------------------------------------------------- 1 | {% if security.registerable or security.recoverable or security.confirmable %} 2 |
<h2>Menu</h2>
3 | 15 | {% endif %} 16 | -------------------------------------------------------------------------------- /Src/Web/templates/security/_messages.html: -------------------------------------------------------------------------------- 1 | {%- with messages = get_flashed_messages(with_categories=true) -%} 2 | {% if messages %} 3 | 8 | {% endif %} 9 | {%- endwith %} -------------------------------------------------------------------------------- /Src/Web/templates/security/login_user.html: -------------------------------------------------------------------------------- 1 | {% extends 'admin/master.html' %} 2 | {% from "security/_macros.html" import render_field, render_field_with_errors, render_checkbox_field %} 3 | {% include "security/_messages.html" %} 4 | {% block body %} 5 | {{ super() }} 6 |
<div class="container">
7 |   <div class="row">
8 |     <div class="panel-heading">Admin Login</div>
9 |     <div class="panel-body">
10 |       <a href="https://github.com/1again/ProxyPool">ProxyPool Github</a>
11 |     </div>
12 |     <div class="panel">
13 |       <form action="{{ url_for_security('login') }}" method="POST" name="login_user_form">
14 | {{ login_user_form.hidden_tag() }} 15 | {{ render_field_with_errors(login_user_form.email) }} 16 | {{ render_field_with_errors(login_user_form.password) }} 17 | {{ render_checkbox_field(login_user_form.remember) }} 18 | {{ render_field(login_user_form.next) }} 19 | {{ render_field(login_user_form.submit, class="btn btn-primary") }} 20 |
</form>
21 |     </div>
22 |   </div>
23 | </div>
24 | {% endblock body %} -------------------------------------------------------------------------------- /Src/Web/templates/security/register_user.html: -------------------------------------------------------------------------------- 1 | {% extends 'admin/master.html' %} 2 | {% from "security/_macros.html" import render_field_with_errors, render_field %} 3 | {% include "security/_messages.html" %} 4 | {% block body %} 5 | {{ super() }} 6 |
<div class="container">
7 |   <div class="row">
8 |     <div class="panel-heading">Register</div>
9 |     <div class="panel">
10 |       <form action="{{ url_for_security('register') }}" method="POST" name="register_user_form">
11 | {{ register_user_form.hidden_tag() }} 12 | {{ render_field_with_errors(register_user_form.email) }} 13 | {{ render_field_with_errors(register_user_form.password) }} 14 | {% if register_user_form.password_confirm %} 15 | {{ render_field_with_errors(register_user_form.password_confirm) }} 16 | {% endif %} 17 | {{ render_field(register_user_form.submit, class="btn btn-primary") }} 18 |
</form>
19 |       <p>Already signed up? Please <a href="{{ url_for_security('login') }}">log in</a>.</p>
20 |     </div>
21 |   </div>
22 | </div>
23 | {% endblock body %} -------------------------------------------------------------------------------- /Test/.pytest_cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | { 2 | "testGetFreeProxy.py::testGetFreeProxy": true 3 | } -------------------------------------------------------------------------------- /Test/.pytest_cache/v/cache/nodeids: -------------------------------------------------------------------------------- 1 | [ 2 | "testGetFreeProxy.py::testGetFreeProxy" 3 | ] -------------------------------------------------------------------------------- /Test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /Test/testGetConfig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from Util.GetConfig import GetConfig 4 | 5 | 6 | # noinspection PyPep8Naming 7 | def testGetConfig(): 8 | """ 9 | test class GetConfig in Util/GetConfig 10 | :return: 11 | """ 12 | gg = GetConfig() 13 | print(gg.db_type) 14 | print(gg.db_name) 15 | print(gg.db_host) 16 | print(gg.db_port) 17 | assert isinstance(gg.proxy_getter_functions, list) 18 | print(gg.proxy_getter_functions) 19 | 20 | if __name__ == '__main__': 21 | testGetConfig() 22 | -------------------------------------------------------------------------------- /Test/testGetFreeProxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import sys 5 | import requests 6 | 7 | 8 | try: 9 | from importlib import reload # py3 实际不会实用,只是为了不显示语法错误 10 | except: 11 | reload(sys) 12 | sys.setdefaultencoding('utf-8') 13 | 14 | sys.path.append('..') 15 | from ProxyGetter.getFreeProxy import GetFreeProxy 16 | from Util.GetConfig import GetConfig 17 | 18 | 19 | # noinspection PyPep8Naming 20 | def testGetFreeProxy(): 21 | """ 22 | test class GetFreeProxy in ProxyGetter/GetFreeProxy 23 | :return: 24 | """ 25 | gc = GetConfig() 26 | proxy_getter_functions = gc.proxy_getter_functions 27 | for proxyGetter in proxy_getter_functions: 28 | proxy_count = 0 29 | for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): 30 | if proxy: 31 | print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,proxy_count=proxy_count)) 32 | proxy_count += 1 33 | #assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) 34 | 35 | 36 | if __name__ == '__main__': 37 | testGetFreeProxy() 38 | -------------------------------------------------------------------------------- /Test/testLogHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from Util.LogHandler import LogHandler 4 | 5 | 6 | # noinspection PyPep8Naming 7 | def testLogHandler(): 8 | """ 9 | test function LogHandler in Util/LogHandler 10 | :return: 11 | """ 12 | log = LogHandler('test') 13 | log.info('this is a log from test') 14 | 15 | log.resetName(name='test1') 16 | log.info('this is a log from test1') 17 | 18 | log.resetName(name='test2') 19 | log.info('this is a log from test2') 20 | 21 | 22 | if __name__ == '__main__': 23 | testLogHandler() 24 | -------------------------------------------------------------------------------- /Test/testWebRequest.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding: utf-8 -*- 2 | 3 | from Util.WebRequest import WebRequest 4 | 5 | 6 | # noinspection PyPep8Naming 7 | def testWebRequest(): 8 | """ 9 | test class WebRequest in Util/WebRequest.py 10 | :return: 11 | """ 12 | wr = WebRequest() 13 | request_object = wr.get('https://www.baidu.com/') 14 | assert request_object.status_code == 200 15 | 16 | 17 | if __name__ == '__main__': 18 | testWebRequest() 19 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-time-machine -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | APScheduler==3.2.0 2 | Werkzeug==0.15.3 3 | Flask==1.0.2 4 | requests==2.20.0 5 | lxml==4.3.3 6 | gevent==1.4.0 7 | Flask-RESTful==0.3.6 8 | 9 | ipip-datx==0.4.0 10 | pymongo==3.7.2 11 | 12 | flask-mongoengine==0.8.2 13 | Flask-Admin==1.5.3 14 | Flask-Security==3.0.0 15 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import sys 4 | sys.path.append("Src") 5 | 6 | from Run.main import main 7 | 8 | main(test=True) -------------------------------------------------------------------------------- /version: -------------------------------------------------------------------------------- 1 | 0.1.0 --------------------------------------------------------------------------------
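A usage sketch for the HTTP API defined in Src/Web/api/api.py, following the endpoints and query parameters documented in its API_LIST. The local host address and the use of the requests library are illustrative assumptions; the port is the default web_bind_port 35050 from Config/Config.ini.default:

import requests

BASE_URL = "http://127.0.0.1:35050"  # assumed local host; port from Config.ini.default

def get_one_proxy(**params):
    # GET /api/v1/proxy/ returns {"data": {...}} with one random useful proxy.
    # Optional params per API_LIST: https=1, type=1|2, region, token.
    resp = requests.get(BASE_URL + "/api/v1/proxy/", params=params, timeout=10)
    resp.raise_for_status()
    return resp.json()["data"]

def get_all_proxies(**params):
    # GET /api/v1/proxies/ returns {"data": [...]} with every matching proxy.
    resp = requests.get(BASE_URL + "/api/v1/proxies/", params=params, timeout=10)
    resp.raise_for_status()
    return resp.json()["data"]

if __name__ == "__main__":
    print(get_one_proxy(https=1))        # one HTTPS-capable proxy
    print(len(get_all_proxies(type=2)))  # how many anonymous proxies are held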