├── .dockerignore
├── .gitignore
├── .travis.yml
├── CICD
│   ├── Scripts
│   │   └── deploy.sh
│   ├── id_rsa.enc
│   └── ssh_config
├── Config
│   └── Config.ini.default
├── Data
│   └── 17monipdb.datx
├── Docker
│   ├── Dockerfile
│   └── docker-compose.yml
├── Docs
│   ├── Design.md
│   └── images
│       ├── 2019-06-12-22-11-21.png
│       ├── 2019-06-12-22-13-10.png
│       ├── 2019-06-12-22-17-34.png
│       ├── 2019-06-12-22-22-46.png
│       ├── 2019-06-15-08-18-36.png
│       ├── 2019-06-15-10-35-29.png
│       └── 2019-06-15-13-18-47.png
├── LICENSE
├── README.md
├── Src
│   ├── Config
│   │   ├── ConfigManager.py
│   │   └── __init__.py
│   ├── DB
│   │   ├── DbClient.py
│   │   ├── MongodbClient.py
│   │   └── __init__.py
│   ├── Fetcher
│   │   ├── FetcherManager.py
│   │   ├── __init__.py
│   │   └── fetchers
│   │       ├── 66ip.py
│   │       ├── __init__.py
│   │       ├── cn-proxy.py
│   │       ├── coderbusy.py
│   │       ├── data5u.py
│   │       ├── goubanjia.py
│   │       ├── ip181.py
│   │       ├── ip3366.py
│   │       ├── iphai.py
│   │       ├── jiangxianli.py
│   │       ├── kuaidaili.py
│   │       ├── mimiip.py
│   │       ├── proxy-list.py
│   │       ├── proxylistplus.py
│   │       ├── xdaili.py
│   │       └── xicidaili.py
│   ├── Forward
│   │   ├── ForwardManager.py
│   │   ├── __init__.py
│   │   └── base.py
│   ├── Log
│   │   ├── LogHandler.py
│   │   ├── LogManager.py
│   │   └── __init__.py
│   ├── Manager
│   │   ├── ProxyClean.py
│   │   ├── ProxyFetch.py
│   │   ├── ProxyManager.py
│   │   ├── ProxyVerify.py
│   │   └── __init__.py
│   ├── Notify
│   │   ├── NotifyManager.py
│   │   └── __init__.py
│   ├── ProxyGetter
│   │   ├── CheckProxy.py
│   │   ├── __init__.py
│   │   └── getFreeProxy.py
│   ├── Run
│   │   ├── __init__.py
│   │   └── main.py
│   ├── Schedule
│   │   ├── ProxyCleanSchedule.py
│   │   ├── ProxyFetchSchedule.py
│   │   ├── ProxySchedule.py
│   │   ├── ProxyVerifySchedule.py
│   │   └── __init__.py
│   ├── Util
│   │   ├── EnvUtil.py
│   │   ├── GetConfig.py
│   │   ├── WebRequest.py
│   │   ├── __init__.py
│   │   ├── utilClass.py
│   │   └── utilFunction.py
│   ├── Version
│   │   ├── VersionManger.py
│   │   ├── __init__.py
│   │   └── version
│   │       ├── __init__.py
│   │       └── version_1_0_0.py
│   └── Web
│       ├── WebManager.py
│       ├── __init__.py
│       ├── admin
│       │   ├── __init__.py
│       │   ├── admin.py
│       │   ├── forms.py
│       │   ├── model.py
│       │   └── views.py
│       ├── api
│       │   ├── __init__.py
│       │   └── api.py
│       ├── config.py
│       └── templates
│           ├── admin
│           │   ├── index.html
│           │   └── master_base.html
│           ├── index.html
│           └── security
│               ├── _macros.html
│               ├── _menu.html
│               ├── _messages.html
│               ├── login_user.html
│               └── register_user.html
├── Test
│   ├── .pytest_cache
│   │   └── v
│   │       └── cache
│   │           ├── lastfailed
│   │           └── nodeids
│   ├── __init__.py
│   ├── testGetConfig.py
│   ├── testGetFreeProxy.py
│   ├── testLogHandler.py
│   └── testWebRequest.py
├── _config.yml
├── requirements.txt
├── test.py
└── version
/.dockerignore:
--------------------------------------------------------------------------------
1 | Config/Config.ini
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.pyc
3 | *.log
4 | .vscode/
5 | site-packages/
6 |
7 | Config\.ini
8 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | os:
2 | - linux
3 |
4 | services:
5 | - docker
6 |
7 | before_install:
8 | - openssl aes-256-cbc -K $encrypted_42099b4af021_key -iv $encrypted_42099b4af021_iv -in CICD/id_rsa.enc -out ~/.ssh/id_rsa -d
9 | - chmod 600 ~/.ssh/id_rsa
10 | - cp CICD/ssh_config ~/.ssh/
11 |
12 | script:
13 | - docker build -t 1again/proxy_pool -f Docker/Dockerfile .
14 |
15 | after_success:
16 | - echo $DOCKER_1AGAIN_PASSWORD | docker login -u 1again --password-stdin
17 | - docker push 1again/proxy_pool
18 |
19 | deploy:
20 | provider: script
21 | script: bash CICD/Scripts/deploy.sh
22 | on:
23 | branch: develop
--------------------------------------------------------------------------------
/CICD/Scripts/deploy.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | echo "======== Start Pull Image ========"
4 | echo
5 | ssh -o StrictHostKeyChecking=no root@$DEPLOY_SSH_HOST -p$DEPLOY_SSH_PORT "docker pull 1again/proxy_pool"
6 | echo
7 |
8 | echo "======== Start Update Code ========"
9 | echo
10 | ssh -o StrictHostKeyChecking=no root@$DEPLOY_SSH_HOST -p$DEPLOY_SSH_PORT "cd $WORKDIR && git pull"
11 | echo
12 |
13 | echo "======== Start Update Container ========"
14 | echo
15 | ssh -o StrictHostKeyChecking=no root@$DEPLOY_SSH_HOST -p$DEPLOY_SSH_PORT "cd $WORKDIR && docker-compose -f Docker/docker-compose.yml up -d"
16 | echo
--------------------------------------------------------------------------------
/CICD/id_rsa.enc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/CICD/id_rsa.enc
--------------------------------------------------------------------------------
/CICD/ssh_config:
--------------------------------------------------------------------------------
1 | Host *
2 | User root
3 | StrictHostKeyChecking no
4 | IdentityFile ~/.ssh/id_rsa
5 | IdentitiesOnly yes
--------------------------------------------------------------------------------
/Config/Config.ini.default:
--------------------------------------------------------------------------------
1 | [DB]
2 | ; Configure the database information
3 | db_type = MONGODB
4 | db_host = proxy_pool_db
5 | db_port = 27017
6 | db_name = proxy
7 | ; user = your_username (MongoDB only)
8 | ; pass = your_password
9 |
10 | [LOG]
11 | log_level = INFO
12 |
13 | [BIND]
14 | web_bind_host = 0.0.0.0
15 | web_bind_port = 35050
16 |
17 | forward_bind_host = 0.0.0.0
18 | forward_bind_port = 36050
19 |
--------------------------------------------------------------------------------
/Data/17monipdb.datx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Data/17monipdb.datx
--------------------------------------------------------------------------------
/Docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 | WORKDIR /usr/src/app
3 | COPY . .
4 |
5 | ENV DEBIAN_FRONTEND noninteractive
6 | ENV TZ Asia/Shanghai
7 |
8 | RUN pip install -r requirements.txt
9 |
10 | EXPOSE 35050
11 | EXPOSE 36050
12 |
13 | CMD [ "python", "Src/Run/main.py" ]
14 |
--------------------------------------------------------------------------------
/Docker/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 | proxy_pool:
4 | ports:
5 | - "35050:35050"
6 | - "36050:36050"
7 | image: "1again/proxy_pool"
8 | proxy_pool_db:
9 | volumes:
10 | - /data/mongodb/:/data/
11 | ports:
12 | - "27017:27017"
13 | image: "mongo"
--------------------------------------------------------------------------------
/Docs/Design.md:
--------------------------------------------------------------------------------
1 | # Design Goals
2 |
3 | High quality, high flexibility.
4 |
5 | Our HTTP/HTTPS proxies are collected from the public web,
6 |
7 | so they are all inherently unstable.
8 |
9 | We therefore need continuous verification to judge whether a proxy is reliable.
10 |
11 | # High Quality
12 |
13 | The question is: how do we define "high quality"?
14 |
15 | At first, we defined high quality as `proxy availability = successful checks / total checks`.
16 |
17 | But this line of thinking is not rigorous. For example:
18 |
19 | Proxy A was checked 500 times over the past month with 486 successes, so its availability = 486/500 = 97.2%.
20 |
21 | That looks fine. The problem: suppose this proxy suddenly goes dead.
22 |
23 | Its availability then decays only slowly: 486/501 = 97%, 486/502 = 96.81%, 486/503 = 96.62%.
24 |
25 | The number still looks like a highly available proxy, yet the proxy no longer works at all.
26 |
27 | So we need an extra piece of information to judge whether a high-quality proxy is actually usable.
28 |
29 | That extra information is the proxy's `status on its most recent check`.
30 |
31 | So we filter high-quality proxies by both `availability` and `the result of the most recent check`, as the sketch below illustrates.
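32 |
33 | Below is a minimal sketch of that rule (an illustration, not code from this repository; the field names `succ`, `total` and `last_status` mirror the proxy documents handled in `Src/DB/DbClient.py`, and `1` simply stands in for a successful last check):
34 |
35 | ```python
36 | def is_high_quality(proxy_doc, min_availability=0.9):
37 |     if proxy_doc["total"] == 0:  # never verified yet
38 |         return False
39 |     # Historical availability: successful checks / total checks.
40 |     availability = proxy_doc["succ"] / proxy_doc["total"]
41 |     # Availability alone is not enough: a proxy that just died keeps a
42 |     # high historical ratio, so the most recent check must also succeed.
43 |     return availability >= min_availability and proxy_doc["last_status"] == 1
44 |
45 | # The dead proxy from the example: 486/503 is still 96.6%, yet it is rejected.
46 | print(is_high_quality({"succ": 486, "total": 503, "last_status": 0}))  # False
47 | ```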
48 |
49 | # High Flexibility
50 |
51 | How do we define "high flexibility"? It is not so easy to pin down.
52 |
53 | Look at it from another angle: as a proxy pool, how can we serve clients better?
54 |
55 | The initial design exposed the pool through a RESTful API.
56 |
57 | But that approach is invasive to client code, and the experience is poor.
58 |
59 | So we can additionally build a dynamic proxy:
60 |
61 | client -> dynamic proxy -> ordinary proxy -> web server
62 |
63 | Inside this dynamic proxy we can filter candidates based on the collected statistics, and feed results back into the proxies' records.
64 |
65 | # Verifying Proxies
66 |
67 | The implementation itself is straightforward; the real issue is scale.
68 |
69 | When the pool holds a very large number of proxies, verification traffic becomes huge, and most of those checks are wasted.
70 |
71 | Borrowing the idea behind the `least recently used` algorithm:
72 |
73 | if a proxy has been unusable most of the time, the probability that it stays unusable is very high.
74 |
75 | But if a proxy has been usable most of the time, whether it stays usable is actually `unknown`.
76 |
77 | So we can skip verifying the proxies that `never work`, and keep verifying the proxies that `keep working`.
78 |
79 | The key question is: how often should we verify?
80 |
81 | "Unusable most of the time" is another way of saying the share of failed checks is large.
82 |
83 | "Usable most of the time" is another way of saying the share of successful checks is large.
84 |
85 | So we can tie the `verification interval` to `successful checks / failed checks`.
86 |
87 | The idea is sound, but implementing it directly is a bit awkward.
88 |
89 | Instead, we can use one extra number to represent a proxy's quality:
90 |
91 | after each check, add 1 point to its score on success and subtract 1 point on failure.
92 |
93 | Finally, multiplying the `proxy's score` by a `constant interval` reduces the number of checks spent on proxies that `never work`,
94 |
95 | while proxies that `keep working` are verified at the plain `constant interval`.
96 |
97 | This eliminates a large share of the wasted verification work; see the sketch below.
98 |
99 | In one sentence: give up on the hopeless proxies, and keep a close eye on the good ones.
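100 |
101 | And a minimal scheduling sketch of that verification rule (again an illustration; `quality` and `next_verify_time` are the field names used in `Src/DB/DbClient.py`, while `BASE_INTERVAL` is an assumed constant rather than a project setting):
102 |
103 | ```python
104 | import time
105 |
106 | BASE_INTERVAL = 30 * 60  # assumed constant interval, in seconds
107 |
108 | def next_verify_time(quality):
109 |     # Non-negative score ("keeps working"): re-check at the constant rate.
110 |     # Negative score: every failure point pushes the next check further out.
111 |     delay = BASE_INTERVAL if quality >= 0 else abs(quality) * BASE_INTERVAL
112 |     return int(time.time()) + delay
113 | ```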
--------------------------------------------------------------------------------
/Docs/images/2019-06-12-22-11-21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-12-22-11-21.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-12-22-13-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-12-22-13-10.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-12-22-17-34.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-12-22-17-34.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-12-22-22-46.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-12-22-22-46.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-15-08-18-36.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-15-08-18-36.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-15-10-35-29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-15-10-35-29.png
--------------------------------------------------------------------------------
/Docs/images/2019-06-15-13-18-47.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Docs/images/2019-06-15-13-18-47.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 J_hao104
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Current Status
3 |
4 | Not maintained (no new features), for the following reasons:
5 |
6 | 1. I have no use case of my own, so what I build may not fit real-world scenarios.
7 | 2. The code structure is messy: the project was forked from another one and the codebase is loosely organized, which is torture for anyone with code OCD.
8 | 3. The use cases are questionable: whether proxied crawling or anything else, none of it really benefits society. Technology is innocent; people are harder to predict.
9 |
10 | In short: no new features will be developed, but bugs will still be fixed. For learning and exchange only.
11 |
12 | # Introduction
13 |
14 | A high-quality, highly flexible open proxy pool service.
15 |
16 | Possibly the `first proxy pool service in the world` with a built-in `smart dynamic proxy`.
17 |
18 | Now that the boast is out, there is no taking it back.
19 |
20 | [ProxyPool Demo](http://proxy.1again.cc:35050/api/v1/proxy/) (I'm just a demo, don't expect me to be too stable!)
21 |
22 | ---
23 |
24 | # Features
25 |
26 | Our goals are `high quality` and `high flexibility`.
27 |
28 | Every feature is built around these two points:
29 |
30 | 1. Every proxy carries a verification `count` and `score`; successful checks / total checks == proxy availability (database view)
31 |
32 |
33 |
34 | 2. Dynamic proxy support (bolded by hand)
35 |
36 | ```
37 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
38 | {
39 | "origin": "183.82.32.56"
40 | }
41 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
42 | {
43 | "origin": "200.149.19.170"
44 | }
45 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
46 | {
47 | "origin": "125.21.43.82"
48 | }
49 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
50 | {
51 | "origin": "110.52.235.124"
52 | }
53 | root@1again:~# curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
54 | {
55 | "origin": "176.74.134.6"
56 | }
57 | ```
58 |
59 | 3. When requesting a proxy you can filter by `https` support, by `type` (transparent or (common) anonymous), and by the proxy's `region`. For example:
60 |
61 | ```
62 | # Get a proxy that supports https
63 | http://proxy.1again.cc:35050/api/v1/proxy/?https=1
64 |
65 | # Get an anonymous proxy
66 | http://proxy.1again.cc:35050/api/v1/proxy/?type=2
67 |
68 | # Get a proxy located in China
69 | http://proxy.1again.cc:35050/api/v1/proxy/?region=中国
70 |
71 | # Get a proxy located outside China
72 | http://proxy.1again.cc:35050/api/v1/proxy/?region=!中国
73 |
74 | # Get an https-capable, anonymous proxy located in China
75 | http://proxy.1again.cc:35050/api/v1/proxy/?https=1&type=2&region=中国
76 | ```
77 |
78 | 4. [Web-based management](http://proxy.1again.cc:35050/admin) username: admin password: admin (mess with it and you're asking for a beating!)
79 |
80 |
81 |
82 | 5. Settings can be changed from the web UI.
83 |
84 |
85 |
86 | 6. Web management of the `sites to fetch proxies from`
87 |
88 |
89 |
90 | 7. Supports `gevent` concurrency mode, and it works great. Don't watch the ads, watch the results!
91 |
92 | ```
93 | 2019-06-13 10:00:26,656 ProxyFetch.py[line:103] INFO fetch [ xicidaili ] proxy finish, total:400, succ:65, fail:0, skip:335, elapsed_time:1s
94 | 2019-06-13 10:00:26,662 ProxyFetch.py[line:103] INFO fetch [ proxylistplus ] proxy finish, total:0, succ:0, fail:0, skip:0, elapsed_time:1s
95 | 2019-06-13 10:00:27,179 ProxyFetch.py[line:103] INFO fetch [ iphai ] proxy finish, total:83, succ:17, fail:0, skip:66, elapsed_time:2s
96 | 2019-06-13 10:00:27,374 ProxyFetch.py[line:103] INFO fetch [ 66ip ] proxy finish, total:0, succ:0, fail:0, skip:0, elapsed_time:2s
97 | 2019-06-13 10:00:32,276 ProxyFetch.py[line:103] INFO fetch [ ip3366 ] proxy finish, total:15, succ:0, fail:0, skip:15, elapsed_time:7s
98 | 2019-06-13 10:00:33,888 ProxyFetch.py[line:103] INFO fetch [ ip181 ] proxy finish, total:0, succ:0, fail:0, skip:0, elapsed_time:8s
99 | 2019-06-13 10:00:34,978 ProxyFetch.py[line:103] INFO fetch [ mimiip ] proxy finish, total:0, succ:0, fail:0, skip:0, elapsed_time:9s
100 | 2019-06-13 10:00:38,182 ProxyFetch.py[line:103] INFO fetch [ proxy-list ] proxy finish, total:28, succ:28, fail:0, skip:0, elapsed_time:13s
101 | 2019-06-13 10:01:36,432 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:636, succ:327, fail:309, elapsed_time:58s
102 | 2019-06-13 10:31:15,800 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:481, succ:299, fail:182, elapsed_time:37s
103 | 2019-06-13 11:01:37,569 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:639, succ:315, fail:324, elapsed_time:59s
104 | 2019-06-13 11:31:54,798 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:977, succ:342, fail:635, elapsed_time:76s
105 | 2019-06-13 12:01:21,659 ProxyVerify.py[line:301] INFO useful_proxy verify proxy finish, total:608, succ:314, fail:294, elapsed_time:43s
106 | ```
107 |
108 | 8. I honestly can't come up with more entries. Think you can do better? Go ahead!
109 |
110 | # Documentation
111 |
112 | [Design document](Docs/Design.md)
113 |
114 | # Status
115 |
116 | The original design goals are basically met; documentation and code cleanup are next.
117 |
118 | # Use Cases
119 |
120 | 1. Mainly crawlers.
121 |
122 | 2. A company that needs an internal proxy pool service for some thoroughly unscrupulous business.
123 |
124 | 3. Individuals who need it for things best not mentioned.
125 |
126 | # Installation / Deployment
127 |
128 | ## Production
129 |
130 | ```shell
131 | # Install Docker
132 | curl -sSL https://get.docker.com | sh
133 |
134 | # start mongo database
135 | docker run -d --name mongo -v /data/mongodb:/data -p 27017:27017 mongo
136 |
137 | # Start proxy_pool container
138 | docker run -d --name proxy_pool --link mongo:proxy_pool_db -p 35050:35050 -p 36050:36050 1again/proxy_pool
139 | ```
140 |
141 | ## Development
142 |
143 | ```shell
144 | # Clone Repo
145 | git clone https://github.com/1again/ProxyPool
146 |
147 | # Enter the directory
148 | cd ProxyPool
149 |
150 | # Install Docker
151 | curl -sSL https://get.docker.com | sh
152 |
153 | # start mongo database
154 | docker run -d --name mongo -v /data/mongodb:/data -p 27017:27017 mongo
155 |
156 | # Start proxy_pool container
157 | docker run -it --rm --link mongo:proxy_pool_db -v $(pwd):/usr/src/app -p 35050:35050 -p 36050:36050 1again/proxy_pool
158 | ```
159 |
160 | # Usage
161 |
162 | A few minutes after startup you will see the fetched proxy IPs; you can view them directly in the web management UI.
163 |
164 | ## DYNAMIC PROXY
165 |
166 | ```shell
167 | curl -x 'your_server_ip:36050' your_access_url
168 |
169 | # for example:
170 | curl -x "proxy.1again.cc:36050" https://httpbin.org/ip
171 | ```
172 |
173 | ## RESTFUL API
174 |
175 | ```python
176 |
177 | API_LIST = {
178 | "/api/v1/proxy/": {
179 | "args": {
180 | "https": {
181 | "value": [1],
182 | "desc": "need https proxy? 1 == true",
183 | "required": False,
184 | },
185 | "region": {
186 | "value": "region name like 中国 or 广州 or 江苏",
187 | "desc": "Get Region Proxy",
188 | "required": False,
189 | },
190 | "type": {
191 | "value": [1,2],
192 | "desc": "clear proxy 1 or (common) anonymous 2",
193 | "required": False,
194 | }
195 | },
196 | "desc": "Get A Random Proxy"
197 | },
198 | "/api/v1/proxies/": {
199 | "args": {
200 | "https": {
201 | "value": [1],
202 | "desc": "need https proxy? 1 == true",
203 | "required": False,
204 | },
205 | "region": {
206 | "value": "region name like 中国 or 广州 or 江苏",
207 | "desc": "Get Region Proxy",
208 | "required": False,
209 | },
210 | "type": {
211 | "value": [1,2],
212 | "desc": "clear proxy 1 or (common) anonymous 2",
213 | "required": False,
214 | }
215 | },
216 | "desc": "Get All Proxy",
217 | },
218 | }
219 |
220 | ```
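221 |
222 | A minimal client sketch for the API above (not part of the project; it assumes the endpoint returns the proxy as plain `host:port` text, so adjust the parsing to whatever your deployment actually returns):
223 |
224 | ```python
225 | import requests
226 |
227 | API = "http://your_server_ip:35050/api/v1/proxy/"
228 |
229 | # Ask for an https-capable, anonymous proxy.
230 | proxy = requests.get(API, params={"https": 1, "type": 2}).text.strip()
231 |
232 | # Route a request through the returned proxy.
233 | proxies = {"http": "http://" + proxy, "https": "http://" + proxy}
234 | print(requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10).text)
235 | ```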
236 |
237 | ## Adding Fetchers
238 |
239 | The project ships with a few fetchers for free proxies by default.
240 |
241 | If you come across a good source of free proxies, you can add your own fetcher for it.
242 |
243 | To add a new proxy fetcher:
244 |
245 | First, add your fetcher class under the `Src/Fetcher/fetchers/` directory.
246 |
247 | The class needs a `run` method that returns proxies in `host:port` form as a generator (yield), for example:
248 |
249 | ```python
250 |
251 | # The file name is arbitrary; matching the middle part of `fetcher_host` is recommended so it is easy to identify
252 | # The class name is fixed: `CustomFetcher`
253 | class CustomFetcher():
254 |     # Used only for identification; it is mapped into the database
255 |     fetcher_host = "www.66ip.cn"
256 |
257 |     def run(self):
258 |         url_list = [
259 |             'http://www.xxx.com/',
260 |         ]
261 |         for url in url_list:
262 |             html_tree = getHtmlTree(url)
263 |             ul_list = html_tree.xpath('//ul[@class="l2"]')
264 |             for ul in ul_list:
265 |                 try:
266 |                     yield ':'.join(ul.xpath('.//li/text()')[0:2])
267 |                 except Exception as e:
268 |                     print(e)
269 | ```
270 |
271 | `ProxyFetchSchedule` fetches proxies at regular intervals; on its next run it will automatically discover and call the method you defined. You can also smoke-test a new fetcher by hand, as sketched below.
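272 |
273 | A quick local smoke test for a new fetcher (a sketch, not project code; `myfetcher` is a placeholder for your fetcher's file name). Run it from the repository root so the `Src` imports resolve:
274 |
275 | ```python
276 | import sys
277 | sys.path.append("Src")
278 |
279 | # Hypothetical module name -- replace it with your own fetcher file.
280 | from Fetcher.fetchers.myfetcher import CustomFetcher
281 |
282 | for proxy in CustomFetcher().run():
283 |     print(proxy)  # each item should look like "host:port"
284 | ```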
285 |
286 | # Contributing
287 |
288 | Thank you for your support; it makes us better!
289 |
290 | For the sake of consistency and clarity, let's agree on a few simple conventions.
291 |
292 | There are two main branches:
293 | develop holds the content of the next version
294 | master holds the current stable version
295 |
296 | 1. Small fixes that do not affect the released version can be made on develop, followed by a pull request.
297 | 2. Sweeping changes that affect previous versions need a new branch, e.g. feature_random_proxy, followed by a pull request.
298 |
299 | I will merge new branches into develop and, after they have run on the demo machine for a while, merge them into master.
300 |
301 | That's all. Thanks!
302 |
303 | # Feedback
304 |
305 | Feel free to report any issue via [Issues](https://github.com/1again/ProxyPool/issues).
306 |
307 | Our goal: no cavities!
308 |
--------------------------------------------------------------------------------
/Src/Config/ConfigManager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | import sys, os
5 | sys.path.append("Src")
6 |
7 | from Util.utilClass import ConfigParse
8 | from pymongo import MongoClient
9 | from Notify.NotifyManager import register_event, NOTIFY_EVENT
10 |
11 | def is_number(s):
12 | result = False
13 | try:
14 | float(s)
15 | result = True
16 | except ValueError:
17 | pass
18 |
19 | if not result:
20 | try:
21 | import unicodedata
22 | unicodedata.numeric(s)
23 | result = True
24 | except (TypeError, ValueError):
25 | pass
26 |
27 | return result
28 |
29 | class BaseConfig(object):
30 | config_name = "Config.ini"
31 | config_dir = "../../Config/"
32 |
33 | def __init__(self):
34 | pwd = os.path.dirname(os.path.realpath(__file__))
35 | relative_path = "{pwd}/{config_dir}".format(pwd=pwd, config_dir=self.config_dir)
36 | self.config_dir = os.path.realpath(relative_path)
37 | self.setting = {}
38 |
39 | self.load_config()
40 |
41 | def load_config(self):
42 | self.config_path = os.path.join(self.config_dir, self.config_name)
43 | self.default_config_path = os.path.join(self.config_dir, "Config.ini.default")
44 |
45 | self.config = self.load_config_from_path()
46 |
47 | self.load_setting()
48 |
49 | def load_config_from_path(self):
50 | config = ConfigParse()
51 | if os.path.exists(self.config_path):
52 | config.read(self.config_path)
53 | else:
54 | config.read(self.default_config_path)
55 |
56 | return config
57 |
58 | def load_setting(self):
59 | for section in self.config.sections():
60 | for item in self.config.items(section):
61 | field = item[0]
62 | value = int(item[1]) if is_number(item[1]) else item[1]
63 | self.setting[field] = value
64 |
65 | class DBConfig(object):
66 | db_name = "proxy"
67 | docs_name = "default"
68 |
69 | def __init__(self):
70 | client = MongoClient(host=base_config.setting.get("db_host"), port=base_config.setting.get("db_port"), username=base_config.setting.get("db_user"), password=base_config.setting.get("db_pass"))
71 |
72 | self.db = client[self.db_name]
73 |
74 | class SettingConfig(DBConfig):
75 | db_name = "proxy"
76 | docs_name = "setting"
77 | default_config = dict(
78 | verify_useful_proxy_concurrency = 100,
79 | verify_useful_proxy_interval = 30,
80 |
81 | fetch_new_proxy_concurrency = 100,
82 | fetch_new_proxy_interval = 30,
83 |
84 | # clean proxy when number is positive
85 | # disable clean proxy when number is -1
86 | hold_useful_proxy_number = -1,
87 | )
88 |
89 | def __init__(self):
90 | super(SettingConfig, self).__init__()
91 |
92 | self.setting = {}
93 | self.load_data_to_db()
94 | self.load_setting_from_db()
95 |
96 | register_event(NOTIFY_EVENT["AFTER_SETTING_CHANGE"], self.dispatch_event)
97 |
98 | def dispatch_event(self, **kwargs):
99 | self.reload_setting_from_db(**kwargs)
100 |
101 | def load_data_to_db(self):
102 | for field, value in self.default_config.items():
103 | query = { "setting_name": field }
104 | if self.db[self.docs_name].find_one(query):
105 | pass
106 | else:
107 | data = dict(
108 | setting_name = field,
109 | setting_value = value,
110 | setting_state = True,
111 | )
112 |
113 | self.db[self.docs_name].insert_one(data)
114 |
115 | def load_setting_from_db(self):
116 | self.reload_setting_from_db()
117 |
118 | def reload_setting_from_db(self, **kwargs):
119 | cursor = self.db.setting.find()
120 | for item in cursor:
121 | if item["setting_state"]:
122 | field = item["setting_name"]
123 | value = item["setting_value"]
124 | value = int(value) if is_number(value) else value
125 | self.setting[field] = value
126 | else:
127 | field = item["setting_name"]
128 | self.setting[field] = None
129 |
130 | class FetcherConfig(DBConfig):
131 | db_name = "proxy"
132 | docs_name = "fetchers"
133 |
134 | def __init__(self):
135 | super(FetcherConfig, self).__init__()
136 |
137 | self.fetcher_list = []
138 | cursor = self.db[self.docs_name].find()
139 | for item in cursor:
140 | if item["status"]:
141 | self.fetcher_list.append(item["name"])
142 |
143 | def update_fetcher_list(self, items):
144 | for item in items:
145 | query = { "name": item }
146 | if self.db[self.docs_name].find_one(query):
147 | pass
148 | else:
149 | data = dict(
150 | name = item,
151 | status = True,
152 | succ=0,
153 | fail=0,
154 | skip=0,
155 | total=0,
156 | )
157 | self.db[self.docs_name].insert_one(data)
158 | self.fetcher_list.append(item)
159 |
160 | def get_fetcher_list(self):
161 | result = self.fetcher_list
162 | return result
163 |
164 | def update_stat(self, name, stat):
165 | query = {
166 | "name": name,
167 | }
168 |
169 | data = {
170 | "$inc": {
171 | "succ": stat["succ"],
172 | "fail": stat["fail"],
173 | "skip": stat["skip"],
174 | "total": stat["total"],
175 | }
176 | }
177 |
178 | self.db[self.docs_name].update(query, data)
179 |
180 | base_config = BaseConfig()
181 | setting_config = SettingConfig()
182 | # fetcher_config = FetcherConfig()
183 |
184 | if __name__ == '__main__':
185 | pass
--------------------------------------------------------------------------------
/Src/Config/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Config/__init__.py
--------------------------------------------------------------------------------
/Src/DB/DbClient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | import os
5 | import sys
6 | import time
7 |
8 | from Config import ConfigManager
9 | from Util.utilClass import Singleton
10 | from DB.MongodbClient import MongodbClient
11 | from Log.LogManager import log
12 | from Manager import ProxyManager
13 |
14 | class DocsModel(object):
15 | docs_name = "test"
16 |
17 | def __init__(self):
18 | db_name = ConfigManager.base_config.setting.get("db_name")
19 | db_host = ConfigManager.base_config.setting.get("db_host")
20 | db_port = ConfigManager.base_config.setting.get("db_port")
21 | db_username = ConfigManager.base_config.setting.get("db_user")
22 | db_password = ConfigManager.base_config.setting.get("db_pass")
23 |
24 | self.mc = MongodbClient(
25 | host=db_host,
26 | port=db_port,
27 | db_name=db_name,
28 | docs_name=self.docs_name,
29 | username=db_username,
30 | password=db_password,
31 | )
32 |
33 | def parse_regin_to_mongo(region_str):
34 | if region_str.startswith('!'):
35 | return { "$nin": [region_str[1:]] }
36 | else:
37 | return { "$in": [region_str] }
38 |
39 | class UsefulProxyDocsModel(DocsModel):
40 | docs_name = "useful_proxy"
41 |
42 | def cleanUsefulProxy(self, **kwargs):
43 | result = 0
44 | hold_number = kwargs.get("hold_number")
45 |
46 | query = {"total": {"$ne": 0}}
47 | total_number = self.mc.count(query)
48 | clean_number = total_number - hold_number
49 |
50 | if clean_number > 0 and hold_number != -1:
51 | operation_list = [
52 | {
53 | "$match": query,
54 | },
55 | {
56 | "$project": { "total": 1, "disable_rate": { "$divide": ["$fail", "$total"] } },
57 | },
58 | {
59 | "$sort": { "disable_rate": -1, "total": -1 },
60 | },
61 | {
62 | "$limit": clean_number,
63 | },
64 | ]
65 |
66 |
67 | items = self.mc.aggregate(operation_list)
68 | result = len(items)
69 | for item in items:
70 | query = {
71 | "_id": item["_id"]
72 | }
73 | self.mc.delete(query)
74 |
75 | return result
76 |
77 | def cleanRawProxy(self, **kwargs):
78 |
79 | query = {
80 | "health": {
81 | "$lt": 1
82 | }
83 | }
84 |
85 | data = self.mc.delete(query)
86 | result = data['n']
87 |
88 | return result
89 |
90 | def getAllValidUsefulProxy(self, **kwargs):
91 | https = kwargs.get("https", None)
92 | region = kwargs.get("region", None)
93 | type_ = kwargs.get("type", None)
94 |
95 | result = []
96 | operation_list = [
97 | {
98 | "$match": { "total": { "$ne": 0 } }
99 | }
100 | ]
101 |
102 | if https:
103 | operation_list[0]["$match"]["https"] = { "$eq": https }
104 |
105 | if type_:
106 | operation_list[0]["$match"]["type"] = { "$eq": type_ }
107 |
108 | if region:
109 | operation_list[0]["$match"]["region_list"] = parse_regin_to_mongo(region)
110 |
111 | log.debug("getAllValidUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(operation_list)))
112 | result = self.mc.aggregate(operation_list)
113 |
114 | return result
115 |
116 | def getHighQualityUsefulProxy(self, **kwargs):
117 | query = { "quality": { "$gt": -1 } }
118 | result= self.mc.find(query)
119 | return result
120 |
121 | def getLowQualityUsefulProxy(self, **kwargs):
122 | query = { "quality": { "$lt": 0 } }
123 | result= self.mc.find(query)
124 | return result
125 |
126 | def getAllUsefulProxy(self, **kwargs):
127 | query = {}
128 | result = self.mc.find(query)
129 | return result
130 |
131 | def checkProxyExists(self, proxy):
132 | query = {"proxy": proxy}
133 | result = self.mc.exists(query)
134 | return result
135 |
136 | def checkUsefulProxyExists(self, proxy):
137 | result = self.checkProxyExists(proxy)
138 | return result
139 |
140 | # TODO: refine function
141 | def getSampleUsefulProxy(self, **kwargs):
142 | https = kwargs.get("https", None)
143 | region = kwargs.get("region", None)
144 | type_ = kwargs.get("type", None)
145 |
146 | result = None
147 | operation_list = [
148 | {
149 | "$match": {
150 | "total": { "$ne": 0},
151 | "last_status": { "$eq": ProxyManager.PROXY_LAST_STATUS["SUCC"] },
152 | }
153 | },
154 | {
155 | "$sample": { "size": 1}
156 | }
157 | ]
158 |
159 | if https:
160 | operation_list[0]["$match"]["https"] = { "$eq": https }
161 |
162 | if type_:
163 | operation_list[0]["$match"]["type"] = { "$eq": type_ }
164 |
165 | if region:
166 | operation_list[0]["$match"]["region_list"] = parse_regin_to_mongo(region)
167 |
168 | log.debug("getSampleUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(operation_list)))
169 | data = self.mc.aggregate(operation_list)
170 | if data:
171 | result = data[0]
172 |
173 | return result
174 |
175 | def getVerifyUsefulProxy(self, now):
176 | query = {
177 | "next_verify_time": {
178 | "$lt": now
179 | }
180 | }
181 | result = self.mc.find(query)
182 | return result
183 |
184 | def getQualityUsefulProxy(self, **kwargs):
185 | https = kwargs.get("https", None)
186 | region = kwargs.get("region", None)
187 | type_ = kwargs.get("type", None)
188 |
189 | result = None
190 | operation_list = [
191 | {
192 | "$match": {
193 | "total": { "$ne": 0 },
194 | }
195 | },
196 | {
197 | "$sort": { "quality": -1, "total": -1 },
198 | },
199 | ]
200 |
201 | if https:
202 | operation_list[0]["$match"]["https"] = { "$eq": https }
203 |
204 | if type_:
205 | operation_list[0]["$match"]["type"] = { "$eq": type_ }
206 |
207 | if region:
208 | operation_list[0]["$match"]["region_list"] = parse_regin_to_mongo(region)
209 |
210 |         log.debug("getQualityUsefulProxy, operation_list:{operation_list}, ".format(operation_list=str(operation_list)))
211 | result = self.mc.aggregate(operation_list)
212 |
213 | return result
214 |
215 | def getProxyNum(self):
216 | result = self.mc.count()
217 | return result
218 |
219 | def saveUsefulProxy(self, data):
220 | self.mc.insert(data)
221 |
222 | def updateUsefulProxy(self, proxy, data):
223 | query = {"proxy": proxy}
224 | self.updateProxy(query, data)
225 |
226 | def deleteUsefulProxy(self, proxy):
227 | query = {"proxy": proxy}
228 | self.mc.delete(query)
229 |
230 | def tickUsefulProxyVaildSucc(self, proxy):
231 | now_time = int(time.time())
232 | query = {"proxy": proxy}
233 |
234 | data = {
235 | "$inc": {
236 | "succ": 1,
237 | "keep_succ": 1,
238 | },
239 | "$set": {
240 | "last_status": ProxyManager.PROXY_LAST_STATUS["SUCC"],
241 | "last_succ_time": now_time
242 | },
243 | }
244 |
245 | item = self.mc.find_one(query)
246 | if item["quality"] < 0:
247 | data["$set"]["quality"] = 1
248 | else:
249 | data["$inc"]["quality"] = 1
250 |
251 | self.updateProxy(query, data)
252 |
253 | def getProxy(self, proxy):
254 | query = {"proxy": proxy}
255 | result = self.mc.find_one(query)
256 | return result
257 |
258 | def updateProxy(self, query, data):
259 | self.mc.upsert(query, data)
260 |
261 | def tickUsefulProxyVaildFail(self, proxy):
262 | query = {"proxy": proxy}
263 | data = {
264 | "$inc": {
265 | "fail": 1,
266 | "quality": -1
267 | },
268 | "$set": {
269 | "last_status": ProxyManager.PROXY_LAST_STATUS["FAIL"],
270 | "keep_succ": 0
271 | },
272 | }
273 | self.updateProxy(query, data)
274 |
275 | def tickUsefulProxyVaildTotal(self, proxy):
276 | query = {"proxy": proxy}
277 | data = {'$inc': {'total': 1}}
278 | self.updateProxy(query, data)
279 |
280 | class RawProxyDocsModel(DocsModel):
281 | docs_name = "raw_proxy"
282 |
283 | def getAll(self):
284 | result = self.mc.find()
285 | return result
286 |
287 | def getAllRawProxy(self, **kwargs):
288 | result = self.getAll()
289 | return result
290 |
291 | def cleanRawProxy(self, **kwargs):
292 |
293 | query = {
294 | "health": {
295 | "$lt": 1
296 | }
297 | }
298 |
299 | data = self.mc.delete(query)
300 | result = data['n']
301 |
302 | return result
303 |
304 | def checkProxyExists(self, proxy):
305 | query = {"proxy": proxy}
306 | result = self.mc.exists(query)
307 | return result
308 |
309 | def checkRawProxyExists(self, proxy):
310 | result = self.checkProxyExists(proxy)
311 | return result
312 |
313 | def getProxyNum(self):
314 | result = self.mc.count()
315 | return result
316 |
317 | def saveRawProxy(self, data):
318 | result = self.mc.insert(data)
319 | return result
320 |
321 | def deleteRawProxy(self, proxy):
322 | query = {"proxy": proxy}
323 | result = self.mc.delete(query)
324 | return result
325 |
326 | def tickRawProxyVaildFail(self, proxy):
327 | query = {"proxy": proxy}
328 | data = {'$inc': {'health': -1}}
329 | self.updateProxy(query, data)
330 |
331 | class DomainCounterDocsModel(DocsModel):
332 | docs_name = "domain_counter"
333 |
334 | def tickDomainRequestState(self, domain, code):
335 | query = {"domain": domain}
336 | data = {'$inc': {code: 1}}
337 | self.mc.upsert(query, data)
338 |
339 | def getDomainCounter(self, domain):
340 | query = {"domain": domain}
341 | result = self.mc.find_one(query)
342 | return result
343 |
344 | class FetchersDocsModel(DocsModel):
345 | docs_name = "fetchers"
346 |
347 | def getAllFetcher(self):
348 | query = {}
349 | result = self.mc.find(query)
350 | return result
351 |
352 | def getExecFetcher(self, now):
353 | query = {"next_fetch_time": {"$lt": now}}
354 | result = self.mc.find(query)
355 | return result
356 |
357 | def getFetcher(self, name):
358 | query = { "name": name }
359 | result = self.mc.find(query)
360 | return result
361 |
362 | def updateFetcher(self, name, data):
363 | query = {"name": name}
364 | self.mc.upsert(query, data)
365 |
366 | if __name__ == "__main__":
367 | pass
368 |
--------------------------------------------------------------------------------
/Src/DB/MongodbClient.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | from pymongo import MongoClient
4 |
5 |
6 | class MongodbClient(object):
7 | db_name = "proxy"
8 |
9 | def __init__(self, host, port, db_name, docs_name, **kwargs):
10 | self.conn = MongoClient(host, port, **kwargs)
11 | self.db = self.conn[db_name]
12 | self.docs = self.db[docs_name]
13 |
14 | def find_one(self, query):
15 | result = self.docs.find_one(query)
16 | return result
17 |
18 | def insert(self, data):
19 | result = self.docs.insert(data)
20 | return result
21 |
22 | def aggregate(self, operation_list):
23 | result = list(self.docs.aggregate(operation_list))
24 | return result
25 |
26 | def delete(self, query):
27 | result = self.docs.remove(query)
28 | return result
29 |
30 | def find(self, query):
31 | result = list(self.docs.find(query))
32 | return result
33 |
34 | def update(self, query, data):
35 | result = self.docs.update(query, data)
36 | return result
37 |
38 | def upsert(self, query, data):
39 | result = self.docs.update(query, data, upsert=True)
40 | return result
41 |
42 | def exists(self, query):
43 | result = False
44 | data = self.find_one(query)
45 | if data:
46 | result = True
47 |
48 | return result
49 |
50 | def count(self, query={}):
51 | result = self.docs.count(query)
52 | return result
53 |
54 | if __name__ == "__main__":
55 | # db = MongodbClient('first', 'localhost', 27017)
56 | # db.put('127.0.0.1:1')
57 | # db2 = MongodbClient('second', 'localhost', 27017)
58 | # db2.put('127.0.0.1:2')
59 | # print(db.pop())
60 | pass
61 |
--------------------------------------------------------------------------------
/Src/DB/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__.py
5 | Description :
6 | Author : JHao
7 | date: 2016/12/2
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/2:
11 | -------------------------------------------------
12 | """
--------------------------------------------------------------------------------
/Src/Fetcher/FetcherManager.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("Src")
3 | import os
4 | import importlib
5 |
6 | from Config import ConfigManager
7 | from Manager import ProxyManager
8 |
9 | SKIP_FILE_LIST = [
10 | "__init__.py",
11 | "__pycache__",
12 | ]
13 |
14 | def init():
15 | file_names = os.listdir("Src/Fetcher/fetchers")
16 | for file_name in file_names:
17 | if file_name in SKIP_FILE_LIST:
18 | pass
19 | else:
20 | fetcher_name = os.path.splitext(file_name)[0]
21 | fetcher_class = getFetcherClass(fetcher_name)
22 | fetcher_host = fetcher_class.fetcher_host
23 |
24 | item = ProxyManager.proxy_manager.getFetcher(fetcher_name)
25 | if item:
26 | pass
27 | else:
28 | saveDefaultFetcher(fetcher_name, fetcher_host)
29 |
30 | return True
31 |
32 | def saveDefaultFetcher(name, host):
33 | data = dict(
34 | name = name,
35 | host = host,
36 | status = True,
37 | succ=0,
38 | fail=0,
39 | skip=0,
40 | total=0,
41 | interval=30,
42 | next_fetch_time=0,
43 | )
44 | ProxyManager.proxy_manager.updateFetcher(name, data)
45 |
46 | def getFetcherClass(name):
47 | module_name = "Fetcher.fetchers.%s" % (name)
48 | module = importlib.import_module(module_name)
49 | result = getattr(module, "CustomFetcher")
50 | return result
51 |
52 | init()
53 |
54 | if __name__ == '__main__':
55 | pass
--------------------------------------------------------------------------------
/Src/Fetcher/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 |
5 | class Fetcher():
6 |
7 | fetcher_host = "cn-proxy.com"
8 |
9 | def __init__(self):
10 | split_list = self.fetcher_host.split('.')
11 | split_length = len(split_list)
12 | if split_length == 4:
13 | name = split_list[-3]
14 | else:
15 | name = split_list[-2]
16 |
17 | self.fetcher_name = name
18 |
19 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/66ip.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 | from Fetcher import Fetcher
10 |
11 |
12 | class CustomFetcher(Fetcher):
13 |
14 | fetcher_host = "www.66ip.cn"
15 |
16 | def run(self):
17 | area = 33
18 | page = 1
19 | for area_index in range(1, area + 1):
20 | for i in range(1, page + 1):
21 | url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i)
22 | html_tree = getHtmlTree(url)
23 | tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]")
24 | if len(tr_list) == 0:
25 | continue
26 | for tr in tr_list:
27 | yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0]
28 | break
29 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Fetcher/fetchers/__init__.py
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/cn-proxy.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "cn-proxy.com"
14 |
15 | def run(self):
16 |
17 | urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
18 | request = WebRequest()
19 | for url in urls:
20 | r = request.get(url)
21 |             proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>', r.text)
22 | for proxy in proxies:
23 | yield ':'.join(proxy)
24 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/coderbusy.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "proxy.coderbusy.com"
14 |
15 | def run(self):
16 |
17 | urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
18 | request = WebRequest()
19 | for url in urls:
20 | r = request.get(url)
21 | proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text)
22 | for proxy in proxies:
23 | yield ':'.join(proxy)
24 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/data5u.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.data5u.com"
14 |
15 | def run(self):
16 |
17 | url_list = [
18 | 'http://www.data5u.com/',
19 | 'http://www.data5u.com/free/gngn/index.shtml',
20 | 'http://www.data5u.com/free/gnpt/index.shtml'
21 | ]
22 | for url in url_list:
23 | html_tree = getHtmlTree(url)
24 | ul_list = html_tree.xpath('//ul[@class="l2"]')
25 | for ul in ul_list:
26 | try:
27 | yield ':'.join(ul.xpath('.//li/text()')[0:2])
28 | except Exception as e:
29 | print(e)
30 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/goubanjia.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.goubanjia.com"
14 |
15 | def run(self):
16 |
17 | url = "http://www.goubanjia.com/"
18 | tree = getHtmlTree(url)
19 | proxy_list = tree.xpath('//td[@class="ip"]')
20 |         # This site hides decoy digits in the page; scraping can pick up extra digits or '.' characters,
21 |         # so those elements have to be filtered out.
22 | xpath_str = """.//*[not(contains(@style, 'display: none'))
23 | and not(contains(@style, 'display:none'))
24 | and not(contains(@class, 'port'))
25 | ]/text()
26 | """
27 | for each_proxy in proxy_list:
28 | try:
29 |                 # The ':' sits bare under the td, everything else inside div/span/p; extract the ip first, then the port.
30 | ip_addr = ''.join(each_proxy.xpath(xpath_str))
31 | port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0]
32 | yield '{}:{}'.format(ip_addr, port)
33 | except Exception as e:
34 | pass
35 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/ip181.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.ip181.com"
14 |
15 | def run(self):
16 |
17 | url = 'http://www.ip181.com/'
18 | html_tree = getHtmlTree(url)
19 | try:
20 | tr_list = html_tree.xpath('//tr')[1:]
21 | for tr in tr_list:
22 | yield ':'.join(tr.xpath('./td/text()')[0:2])
23 | except Exception as e:
24 | pass
25 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/ip3366.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.ip3366.net"
14 |
15 | def run(self):
16 |
17 | urls = ['http://www.ip3366.net/free/']
18 | request = WebRequest()
19 | for url in urls:
20 | r = request.get(url)
21 |             proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
22 | for proxy in proxies:
23 | yield ":".join(proxy)
24 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/iphai.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.iphai.com"
14 |
15 | def run(self):
16 |
17 | urls = [
18 | 'http://www.iphai.com/free/ng',
19 | 'http://www.iphai.com/free/np',
20 | 'http://www.iphai.com/free/wg',
21 | 'http://www.iphai.com/free/wp'
22 | ]
23 | request = WebRequest()
24 | for url in urls:
25 | r = request.get(url)
26 |             proxies = re.findall(r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>[\s\S]*?<td>\s*?(\d+)\s*?</td>',
27 |                                  r.text)
28 | for proxy in proxies:
29 | yield ":".join(proxy)
30 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/jiangxianli.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "ip.jiangxianli.com"
14 |
15 | def run(self):
16 | page_count = 8
17 | for i in range(1, page_count + 1):
18 | url = 'http://ip.jiangxianli.com/?page={}'.format(i)
19 | html_tree = getHtmlTree(url)
20 | tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr")
21 | if len(tr_list) == 0:
22 | continue
23 | for tr in tr_list:
24 | yield tr.xpath("./td[2]/text()")[0] + ":" + tr.xpath("./td[3]/text()")[0]
25 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/kuaidaili.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.kuaidaili.com"
14 |
15 | def run(self):
16 |
17 | url_list = [
18 | 'https://www.kuaidaili.com/free/inha/{page}/',
19 | 'https://www.kuaidaili.com/free/intr/{page}/'
20 | ]
21 | for url in url_list:
22 | for page in range(1, 5):
23 | page_url = url.format(page=page)
24 | tree = getHtmlTree(page_url)
25 | proxy_list = tree.xpath('.//table//tr')
26 | for tr in proxy_list[1:]:
27 | yield ':'.join(tr.xpath('./td/text()')[0:2])
28 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/mimiip.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.mimiip.com"
14 |
15 | def run(self):
16 |
17 |         url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)]  # domestic elite (high anonymity)
18 |         url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)]  # domestic anonymous
19 |         url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)]  # domestic transparent
20 | url_list = url_gngao + url_gnpu + url_gntou
21 |
22 | request = WebRequest()
23 | for url in url_list:
24 | r = request.get(url)
25 | if r.status_code != 200:
26 | break
27 |             proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>', r.text)
28 | for proxy in proxies:
29 | yield ':'.join(proxy)
30 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/proxy-list.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "proxy-list.org"
14 |
15 | def run(self):
16 |
17 | urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
18 | request = WebRequest()
19 | import base64
20 | for url in urls:
21 | r = request.get(url)
22 | if r.status_code != 200:
23 | break
24 | proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
25 | for proxy in proxies:
26 | yield base64.b64decode(proxy).decode()
27 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/proxylistplus.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "list.proxylistplus.com"
14 |
15 | def run(self):
16 |
17 | urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
18 | request = WebRequest()
19 | for url in urls:
20 | r = request.get(url)
21 |             proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
22 | for proxy in proxies:
23 | yield ':'.join(proxy)
24 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/xdaili.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.xdaili.cn"
14 |
15 | def run(self):
16 |
17 | url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
18 | request = WebRequest()
19 | try:
20 | res = request.get(url).json()
21 | for row in res['RESULT']['rows']:
22 | yield '{}:{}'.format(row['ip'], row['port'])
23 | except Exception as e:
24 | pass
25 |
--------------------------------------------------------------------------------
/Src/Fetcher/fetchers/xicidaili.py:
--------------------------------------------------------------------------------
1 |
2 | # -*- coding: utf-8 -*-
3 | # !/usr/bin/env python
4 |
5 | import re
6 |
7 | from Util.WebRequest import WebRequest
8 | from Util.utilFunction import getHtmlTree
9 |
10 |
11 | class CustomFetcher():
12 |
13 | fetcher_host = "www.xicidaili.com"
14 |
15 | def run(self):
16 | page_count = 2
17 |
18 | url_list = [
19 |             'http://www.xicidaili.com/nn/',  # elite (high anonymity)
20 |             'http://www.xicidaili.com/nt/',  # transparent
21 | ]
22 | for each_url in url_list:
23 | for i in range(1, page_count + 1):
24 | page_url = each_url + str(i)
25 | tree = getHtmlTree(page_url)
26 | proxy_list = tree.xpath('.//table[@id="ip_list"]//tr[position()>1]')
27 | for proxy in proxy_list:
28 | try:
29 | yield ':'.join(proxy.xpath('./td/text()')[0:2])
30 | except Exception as e:
31 | pass
32 |
--------------------------------------------------------------------------------
/Src/Forward/ForwardManager.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("Src")
3 |
4 | from Forward.base import HTTP, Proxy
5 |
6 | from Manager import ProxyManager
7 | from Config import ConfigManager
8 |
9 | class ForwardProxy(Proxy):
10 |
11 | def _get_host_and_port(self):
12 | https = None
13 | if self.request.method == b'CONNECT':
14 | https = ProxyManager.PROXY_HTTPS["ENABLE"]
15 |
16 | domain = self.request.url.netloc
17 | if isinstance(domain, bytes):
18 | domain = domain.decode("utf8")
19 |
20 | ProxyManager.proxy_manager.tickDomainRequestState(domain, "total")
21 | counter = ProxyManager.proxy_manager.getDomainCounter(domain)
22 | count = counter.get("total")
23 | item = ProxyManager.proxy_manager.getQualityUsefulProxy(https=https, count=count, domain=domain)
24 | proxy = item.get("proxy")
25 | address = proxy.split(":")
26 | return address
27 |
28 | def before_process_response(self):
29 | domain = self.request.url.netloc
30 | if isinstance(domain, bytes):
31 | domain = domain.decode("utf8")
32 |
33 | if isinstance(self.response.code, bytes):
34 | status_code = "status_code_%s" % self.response.code.decode()
35 | else:
36 | status_code = "status_code_%s" % self.response.code
37 |
38 | ProxyManager.proxy_manager.tickDomainRequestState(domain, status_code)
39 |
40 | class ForwardHttp(HTTP):
41 |
42 | def __init__(self):
43 | bind_host = ConfigManager.base_config.setting.get("forward_bind_host")
44 | bind_port = ConfigManager.base_config.setting.get("forward_bind_port")
45 |
46 | super(ForwardHttp, self).__init__(hostname=bind_host, port=bind_port)
47 |
48 | def handle(self, client):
49 |
50 | fp = ForwardProxy(client,
51 | auth_code=self.auth_code,
52 | server_recvbuf_size=self.server_recvbuf_size,
53 | client_recvbuf_size=self.client_recvbuf_size,
54 | )
55 |
56 | fp.daemon = True
57 | fp.start()
58 |
59 |
60 | if __name__ == '__main__':
61 | fh = ForwardHttp()
62 | fh.run()
--------------------------------------------------------------------------------
/Src/Forward/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Forward/__init__.py
--------------------------------------------------------------------------------
/Src/Forward/base.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | """
4 | proxy.py
5 | ~~~~~~~~
6 |
7 | HTTP Proxy Server in Python.
8 |
9 | :copyright: (c) 2013-2018 by Abhinav Singh.
10 | :license: BSD, see LICENSE for more details.
11 | """
12 | import os
13 | import sys
14 | import errno
15 | import base64
16 | import socket
17 | import select
18 | import logging
19 | import argparse
20 | import datetime
21 | import threading
22 | from collections import namedtuple
23 |
24 | if os.name != 'nt':
25 | import resource
26 |
27 | VERSION = (0, 3)
28 | __version__ = '.'.join(map(str, VERSION[0:2]))
29 | __description__ = 'HTTP Proxy Server in Python'
30 | __author__ = 'Abhinav Singh'
31 | __author_email__ = 'mailsforabhinav@gmail.com'
32 | __homepage__ = 'https://github.com/abhinavsingh/proxy.py'
33 | __download_url__ = '%s/archive/master.zip' % __homepage__
34 | __license__ = 'BSD'
35 |
36 | logger = logging.getLogger(__name__)
37 |
38 | PY3 = sys.version_info[0] == 3
39 |
40 | if PY3: # pragma: no cover
41 | text_type = str
42 | binary_type = bytes
43 | from urllib import parse as urlparse
44 | else: # pragma: no cover
45 | text_type = unicode
46 | binary_type = str
47 | import urlparse
48 |
49 |
50 | def text_(s, encoding='utf-8', errors='strict'): # pragma: no cover
51 | """Utility to ensure text-like usability.
52 |
53 | If ``s`` is an instance of ``binary_type``, return
54 | ``s.decode(encoding, errors)``, otherwise return ``s``"""
55 | if isinstance(s, binary_type):
56 | return s.decode(encoding, errors)
57 | return s
58 |
59 |
60 | def bytes_(s, encoding='utf-8', errors='strict'): # pragma: no cover
61 | """Utility to ensure binary-like usability.
62 |
63 | If ``s`` is an instance of ``text_type``, return
64 | ``s.encode(encoding, errors)``, otherwise return ``s``"""
65 | if isinstance(s, text_type):
66 | return s.encode(encoding, errors)
67 | return s
68 |
69 |
70 | version = bytes_(__version__)
71 | CRLF, COLON, SP = b'\r\n', b':', b' '
72 | PROXY_AGENT_HEADER = b'Proxy-agent: proxy.py v' + version
73 |
74 | PROXY_TUNNEL_ESTABLISHED_RESPONSE_PKT = CRLF.join([
75 | b'HTTP/1.1 200 Connection established',
76 | PROXY_AGENT_HEADER,
77 | CRLF
78 | ])
79 |
80 | BAD_GATEWAY_RESPONSE_PKT = CRLF.join([
81 | b'HTTP/1.1 502 Bad Gateway',
82 | PROXY_AGENT_HEADER,
83 | b'Content-Length: 11',
84 | b'Connection: close',
85 | CRLF
86 | ]) + b'Bad Gateway'
87 |
88 | PROXY_AUTHENTICATION_REQUIRED_RESPONSE_PKT = CRLF.join([
89 | b'HTTP/1.1 407 Proxy Authentication Required',
90 | PROXY_AGENT_HEADER,
91 | b'Content-Length: 29',
92 | b'Connection: close',
93 | CRLF
94 | ]) + b'Proxy Authentication Required'
95 |
96 |
97 | class ChunkParser(object):
98 | """HTTP chunked encoding response parser."""
99 |
100 | states = namedtuple('ChunkParserStates', (
101 | 'WAITING_FOR_SIZE',
102 | 'WAITING_FOR_DATA',
103 | 'COMPLETE'
104 | ))(1, 2, 3)
105 |
106 | def __init__(self):
107 | self.state = ChunkParser.states.WAITING_FOR_SIZE
108 | self.body = b'' # Parsed chunks
109 | self.chunk = b'' # Partial chunk received
110 | self.size = None # Expected size of next following chunk
111 |
112 | def parse(self, data):
113 |         more = len(data) > 0
114 | while more:
115 | more, data = self.process(data)
116 |
117 | def process(self, data):
118 | if self.state == ChunkParser.states.WAITING_FOR_SIZE:
119 | # Consume prior chunk in buffer
120 | # in case chunk size without CRLF was received
121 | data = self.chunk + data
122 | self.chunk = b''
123 | # Extract following chunk data size
124 | line, data = HttpParser.split(data)
125 | if not line: # CRLF not received
126 | self.chunk = data
127 | data = b''
128 | else:
129 | self.size = int(line, 16)
130 | self.state = ChunkParser.states.WAITING_FOR_DATA
131 | elif self.state == ChunkParser.states.WAITING_FOR_DATA:
132 | remaining = self.size - len(self.chunk)
133 | self.chunk += data[:remaining]
134 | data = data[remaining:]
135 | if len(self.chunk) == self.size:
136 | data = data[len(CRLF):]
137 | self.body += self.chunk
138 | if self.size == 0:
139 | self.state = ChunkParser.states.COMPLETE
140 | else:
141 | self.state = ChunkParser.states.WAITING_FOR_SIZE
142 | self.chunk = b''
143 | self.size = None
144 | return len(data) > 0, data
145 |
146 |
147 | class HttpParser(object):
148 | """HTTP request/response parser."""
149 |
150 | states = namedtuple('HttpParserStates', (
151 | 'INITIALIZED',
152 | 'LINE_RCVD',
153 | 'RCVING_HEADERS',
154 | 'HEADERS_COMPLETE',
155 | 'RCVING_BODY',
156 | 'COMPLETE'))(1, 2, 3, 4, 5, 6)
157 |
158 | types = namedtuple('HttpParserTypes', (
159 | 'REQUEST_PARSER',
160 | 'RESPONSE_PARSER'
161 | ))(1, 2)
162 |
163 | def __init__(self, parser_type):
164 | assert parser_type in (HttpParser.types.REQUEST_PARSER, HttpParser.types.RESPONSE_PARSER)
165 | self.type = parser_type
166 | self.state = HttpParser.states.INITIALIZED
167 |
168 | self.raw = b''
169 | self.buffer = b''
170 |
171 | self.headers = dict()
172 | self.body = None
173 |
174 | self.method = None
175 | self.url = None
176 | self.code = None
177 | self.reason = None
178 | self.version = None
179 |
180 | self.chunk_parser = None
181 |
182 | def is_chunked_encoded_response(self):
183 | return self.type == HttpParser.types.RESPONSE_PARSER and \
184 | b'transfer-encoding' in self.headers and \
185 | self.headers[b'transfer-encoding'][1].lower() == b'chunked'
186 |
187 | def parse(self, data):
188 | self.raw += data
189 | data = self.buffer + data
190 | self.buffer = b''
191 |
192 |         more = len(data) > 0
193 | while more:
194 | more, data = self.process(data)
195 | self.buffer = data
196 |
197 | def process(self, data):
198 | if self.state in (HttpParser.states.HEADERS_COMPLETE,
199 | HttpParser.states.RCVING_BODY,
200 | HttpParser.states.COMPLETE) and \
201 | (self.method == b'POST' or self.type == HttpParser.types.RESPONSE_PARSER):
202 | if not self.body:
203 | self.body = b''
204 |
205 | if b'content-length' in self.headers:
206 | self.state = HttpParser.states.RCVING_BODY
207 | self.body += data
208 | if len(self.body) >= int(self.headers[b'content-length'][1]):
209 | self.state = HttpParser.states.COMPLETE
210 | elif self.is_chunked_encoded_response():
211 | if not self.chunk_parser:
212 | self.chunk_parser = ChunkParser()
213 | self.chunk_parser.parse(data)
214 | if self.chunk_parser.state == ChunkParser.states.COMPLETE:
215 | self.body = self.chunk_parser.body
216 | self.state = HttpParser.states.COMPLETE
217 |
218 | return False, b''
219 |
220 | line, data = HttpParser.split(data)
221 | if line is False:
222 | return line, data
223 |
224 | if self.state == HttpParser.states.INITIALIZED:
225 | self.process_line(line)
226 | elif self.state in (HttpParser.states.LINE_RCVD, HttpParser.states.RCVING_HEADERS):
227 | self.process_header(line)
228 |
229 | # When connect request is received without a following host header
230 | # See `TestHttpParser.test_connect_request_without_host_header_request_parse` for details
231 | if self.state == HttpParser.states.LINE_RCVD and \
232 | self.type == HttpParser.types.REQUEST_PARSER and \
233 | self.method == b'CONNECT' and \
234 | data == CRLF:
235 | self.state = HttpParser.states.COMPLETE
236 |
237 | # When raw request has ended with \r\n\r\n and no more http headers are expected
238 | # See `TestHttpParser.test_request_parse_without_content_length` and
239 | # `TestHttpParser.test_response_parse_without_content_length` for details
240 | elif self.state == HttpParser.states.HEADERS_COMPLETE and \
241 | self.type == HttpParser.types.REQUEST_PARSER and \
242 | self.method != b'POST' and \
243 | self.raw.endswith(CRLF * 2):
244 | self.state = HttpParser.states.COMPLETE
245 | elif self.state == HttpParser.states.HEADERS_COMPLETE and \
246 | self.type == HttpParser.types.REQUEST_PARSER and \
247 | self.method == b'POST' and \
248 | (b'content-length' not in self.headers or
249 | (b'content-length' in self.headers and
250 | int(self.headers[b'content-length'][1]) == 0)) and \
251 | self.raw.endswith(CRLF * 2):
252 | self.state = HttpParser.states.COMPLETE
253 |
254 | return len(data) > 0, data
255 |
256 | def process_line(self, data):
257 | line = data.split(SP)
258 | if self.type == HttpParser.types.REQUEST_PARSER:
259 | self.method = line[0].upper()
260 | self.url = urlparse.urlsplit(line[1])
261 | self.version = line[2]
262 | else:
263 | self.version = line[0]
264 | self.code = line[1]
265 | self.reason = b' '.join(line[2:])
266 | self.state = HttpParser.states.LINE_RCVD
267 |
268 | def process_header(self, data):
269 | if len(data) == 0:
270 | if self.state == HttpParser.states.RCVING_HEADERS:
271 | self.state = HttpParser.states.HEADERS_COMPLETE
272 | elif self.state == HttpParser.states.LINE_RCVD:
273 | self.state = HttpParser.states.RCVING_HEADERS
274 | else:
275 | self.state = HttpParser.states.RCVING_HEADERS
276 | parts = data.split(COLON)
277 | key = parts[0].strip()
278 | value = COLON.join(parts[1:]).strip()
279 | self.headers[key.lower()] = (key, value)
280 |
281 | def build_url(self):
282 | if not self.url:
283 | return b'/None'
284 |
285 | url = self.url.path
286 | if url == b'':
287 | url = b'/'
288 | if not self.url.query == b'':
289 | url += b'?' + self.url.query
290 | if not self.url.fragment == b'':
291 | url += b'#' + self.url.fragment
292 | return url
293 |
294 | def build(self, del_headers=None, add_headers=None):
295 | req = b' '.join([self.method, self.build_url(), self.version])
296 | req += CRLF
297 |
298 | if not del_headers:
299 | del_headers = []
300 | for k in self.headers:
301 | if k not in del_headers:
302 | req += self.build_header(self.headers[k][0], self.headers[k][1]) + CRLF
303 |
304 | if not add_headers:
305 | add_headers = []
306 | for k in add_headers:
307 | req += self.build_header(k[0], k[1]) + CRLF
308 |
309 | req += CRLF
310 | if self.body:
311 | req += self.body
312 |
313 | return req
314 |
315 | @staticmethod
316 | def build_header(k, v):
317 | return k + b': ' + v
318 |
319 | @staticmethod
320 | def split(data):
321 | pos = data.find(CRLF)
322 | if pos == -1:
323 | return False, data
324 | line = data[:pos]
325 | data = data[pos + len(CRLF):]
326 | return line, data
327 |
328 |
329 | class Connection(object):
330 | """TCP server/client connection abstraction."""
331 |
332 | def __init__(self, what):
333 | self.conn = None
334 | self.buffer = b''
335 | self.closed = False
336 | self.what = what # server or client
337 |
338 | def send(self, data):
339 | # TODO: Gracefully handle BrokenPipeError exceptions
340 | return self.conn.send(data)
341 |
342 | def recv(self, bufsiz=8192):
343 | try:
344 | data = self.conn.recv(bufsiz)
345 | if len(data) == 0:
346 | logger.debug('rcvd 0 bytes from %s' % self.what)
347 | return None
348 | logger.debug('rcvd %d bytes from %s' % (len(data), self.what))
349 | return data
350 | except Exception as e:
351 |             if getattr(e, 'errno', None) == errno.ECONNRESET:  # not every exception carries errno
352 | logger.debug('%r' % e)
353 | else:
354 | logger.exception(
355 | 'Exception while receiving from connection %s %r with reason %r' % (self.what, self.conn, e))
356 | return None
357 |
358 | def close(self):
359 | self.conn.close()
360 | self.closed = True
361 |
362 | def buffer_size(self):
363 | return len(self.buffer)
364 |
365 | def has_buffer(self):
366 | return self.buffer_size() > 0
367 |
368 | def queue(self, data):
369 | self.buffer += data
370 |
371 | def flush(self):
372 | sent = self.send(self.buffer)
373 | self.buffer = self.buffer[sent:]
374 | logger.debug('flushed %d bytes to %s' % (sent, self.what))
375 |
376 |
377 | class Server(Connection):
378 | """Establish connection to destination server."""
379 |
380 | def __init__(self, host, port):
381 | super(Server, self).__init__(b'server')
382 | self.addr = (host, int(port))
383 |
384 | def __del__(self):
385 | if self.conn:
386 | self.close()
387 |
388 | def connect(self):
389 | self.conn = socket.create_connection((self.addr[0], self.addr[1]))
390 |
391 |
392 | class Client(Connection):
393 | """Accepted client connection."""
394 |
395 | def __init__(self, conn, addr):
396 | super(Client, self).__init__(b'client')
397 | self.conn = conn
398 | self.addr = addr
399 |
400 |
401 | class ProxyError(Exception):
402 | pass
403 |
404 |
405 | class ProxyConnectionFailed(ProxyError):
406 |
407 | def __init__(self, host, port, reason):
408 | self.host = host
409 | self.port = port
410 | self.reason = reason
411 |
412 | def __str__(self):
413 |         return '<ProxyConnectionFailed - %s:%s - %s>' % (self.host, self.port, self.reason)
414 |
415 |
416 | class ProxyAuthenticationFailed(ProxyError):
417 | pass
418 |
419 |
420 | class Proxy(threading.Thread):
421 | """HTTP proxy implementation.
422 |
423 | Accepts `Client` connection object and act as a proxy between client and server.
424 | """
425 |
426 | def __init__(self, client, auth_code=None, server_recvbuf_size=8192, client_recvbuf_size=8192):
427 | super(Proxy, self).__init__()
428 |
429 | self.start_time = self._now()
430 | self.last_activity = self.start_time
431 |
432 | self.auth_code = auth_code
433 | self.client = client
434 | self.client_recvbuf_size = client_recvbuf_size
435 | self.server = None
436 | self.server_recvbuf_size = server_recvbuf_size
437 |
438 | self.request = HttpParser(HttpParser.types.REQUEST_PARSER)
439 | self.response = HttpParser(HttpParser.types.RESPONSE_PARSER)
440 |
441 | @staticmethod
442 | def _now():
443 | return datetime.datetime.utcnow()
444 |
445 | def _inactive_for(self):
446 | return (self._now() - self.last_activity).seconds
447 |
448 | def _is_inactive(self):
449 | return self._inactive_for() > 30
450 |
451 | def _get_host_and_port(self):
452 | if self.request.method == b'CONNECT':
453 | host, port = self.request.url.path.split(COLON)
454 | elif self.request.url:
455 | host, port = self.request.url.hostname, self.request.url.port if self.request.url.port else 80
456 | else:
457 | raise Exception('Invalid request\n%s' % self.request.raw)
458 |
459 | return host, port
460 |
461 | def _process_request(self, data):
462 | # once we have connection to the server
463 | # we don't parse the http request packets
464 | # any further, instead just pipe incoming
465 | # data from client to server
466 | if self.server and not self.server.closed:
467 | self.server.queue(data)
468 | return
469 |
470 | # parse http request
471 | self.request.parse(data)
472 |
473 | # once http request parser has reached the state complete
474 | # we attempt to establish connection to destination server
475 | if self.request.state == HttpParser.states.COMPLETE:
476 | logger.debug('request parser is in state complete')
477 |
478 | if self.auth_code:
479 | if b'proxy-authorization' not in self.request.headers or \
480 | self.request.headers[b'proxy-authorization'][1] != self.auth_code:
481 | raise ProxyAuthenticationFailed()
482 |
483 | host, port = self._get_host_and_port()
484 |
485 | self.server = Server(host, port)
486 | try:
487 | logger.debug('connecting to server %s:%s' % (host, port))
488 | self.server.connect()
489 | logger.debug('connected to server %s:%s' % (host, port))
490 | except Exception as e: # TimeoutError, socket.gaierror
491 | self.server.closed = True
492 | raise ProxyConnectionFailed(host, port, repr(e))
493 |
494 | # for http connect methods (https requests)
495 | # queue appropriate response for client
496 | # notifying about established connection
497 | # if self.request.method == b'CONNECT':
498 | self.server.queue(data)
499 | # for usual http requests, re-build request packet
500 | # and queue for the server with appropriate headers
501 | # else:
502 | # self.server.queue(self.request.build(
503 | # del_headers=[b'proxy-authorization', b'proxy-connection', b'connection', b'keep-alive'],
504 | # # add_headers=[(b'Via', b'1.1 proxy.py v%s' % version), (b'Connection', b'Close')]
505 | # ))
506 |
507 | def _process_response(self, data):
508 | # parse incoming response packet
509 | # only for non-https requests
510 | if not self.request.method == b'CONNECT':
511 | self.response.parse(data)
512 |
513 | self.before_process_response()
514 |
515 | # queue data for client
516 | self.client.queue(data)
517 |
518 | def _access_log(self):
519 | host, port = self.server.addr if self.server else (None, None)
520 | if self.request.method == b'CONNECT':
521 | logger.info(
522 | '%s:%s - %s %s:%s' % (self.client.addr[0], self.client.addr[1], self.request.method, host, port))
523 | elif self.request.method:
524 | logger.info('%s:%s - %s %s:%s%s - %s %s - %s bytes' % (
525 | self.client.addr[0], self.client.addr[1], self.request.method, host, port, self.request.build_url(),
526 | self.response.code, self.response.reason, len(self.response.raw)))
527 |
528 | def _get_waitable_lists(self):
529 | rlist, wlist, xlist = [self.client.conn], [], []
530 | if self.client.has_buffer():
531 | wlist.append(self.client.conn)
532 | if self.server and not self.server.closed:
533 | rlist.append(self.server.conn)
534 | if self.server and not self.server.closed and self.server.has_buffer():
535 | wlist.append(self.server.conn)
536 | return rlist, wlist, xlist
537 |
538 | def _process_wlist(self, w):
539 | if self.client.conn in w:
540 | logger.debug('client is ready for writes, flushing client buffer')
541 | self.client.flush()
542 |
543 | if self.server and not self.server.closed and self.server.conn in w:
544 | logger.debug('server is ready for writes, flushing server buffer')
545 | self.server.flush()
546 |
547 | def _process_rlist(self, r):
548 | """Returns True if connection to client must be closed."""
549 | if self.client.conn in r:
550 | logger.debug('client is ready for reads, reading')
551 | data = self.client.recv(self.client_recvbuf_size)
552 | self.last_activity = self._now()
553 |
554 | if not data:
555 | logger.debug('client closed connection, breaking')
556 | return True
557 |
558 | try:
559 | self._process_request(data)
560 | except (ProxyAuthenticationFailed, ProxyConnectionFailed) as e:
561 | logger.exception(e)
562 | self.client.queue(Proxy._get_response_pkt_by_exception(e))
563 | self.client.flush()
564 | return True
565 |
566 | if self.server and not self.server.closed and self.server.conn in r:
567 | logger.debug('server is ready for reads, reading')
568 | data = self.server.recv(self.server_recvbuf_size)
569 | self.last_activity = self._now()
570 |
571 | if not data:
572 | logger.debug('server closed connection')
573 | self.server.close()
574 | else:
575 | self._process_response(data)
576 |
577 | return False
578 |
579 | def _process(self):
580 | while True:
581 | rlist, wlist, xlist = self._get_waitable_lists()
582 | r, w, x = select.select(rlist, wlist, xlist, 1)
583 |
584 | self._process_wlist(w)
585 | if self._process_rlist(r):
586 | break
587 |
588 | if self.client.buffer_size() == 0:
589 | if self.response.state == HttpParser.states.COMPLETE:
590 | logger.debug('client buffer is empty and response state is complete, breaking')
591 | break
592 |
593 | if self._is_inactive():
594 | logger.debug('client buffer is empty and maximum inactivity has reached, breaking')
595 | break
596 |
597 | @staticmethod
598 | def _get_response_pkt_by_exception(e):
599 | if e.__class__.__name__ == 'ProxyAuthenticationFailed':
600 | return PROXY_AUTHENTICATION_REQUIRED_RESPONSE_PKT
601 | if e.__class__.__name__ == 'ProxyConnectionFailed':
602 | return BAD_GATEWAY_RESPONSE_PKT
603 |
604 | def run(self):
605 | logger.debug('Proxying connection %r' % self.client.conn)
606 | try:
607 | self._process()
608 | except KeyboardInterrupt:
609 | pass
610 | except Exception as e:
611 | logger.exception('Exception while handling connection %r with reason %r' % (self.client.conn, e))
612 | finally:
613 | logger.debug(
614 | 'closing client connection with pending client buffer size %d bytes' % self.client.buffer_size())
615 | self.client.close()
616 | if self.server:
617 | logger.debug(
618 | 'closed client connection with pending server buffer size %d bytes' % self.server.buffer_size())
619 | self._access_log()
620 | logger.debug('Closing proxy for connection %r at address %r' % (self.client.conn, self.client.addr))
621 |
622 | def before_process_response(self):
623 | pass
624 |
625 | class TCP(object):
626 | """TCP server implementation.
627 |
628 | Subclass MUST implement `handle` method. It accepts an instance of accepted `Client` connection.
629 | """
630 |
631 | def __init__(self, hostname='127.0.0.1', port=8899, backlog=100):
632 | self.hostname = hostname
633 | self.port = port
634 | self.backlog = backlog
635 | self.socket = None
636 |
637 | def handle(self, client):
638 | raise NotImplementedError()
639 |
640 | def run(self):
641 | try:
642 | logger.info('Starting server on port %d' % self.port)
643 | self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
644 | self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
645 | self.socket.bind((self.hostname, self.port))
646 | self.socket.listen(self.backlog)
647 | while True:
648 | conn, addr = self.socket.accept()
649 | client = Client(conn, addr)
650 | self.handle(client)
651 | except Exception as e:
652 | logger.exception('Exception while running the server %r' % e)
653 | finally:
654 | logger.info('Closing server socket')
655 | self.socket.close()
656 |
657 |
658 | class HTTP(TCP):
659 | """HTTP proxy server implementation.
660 |
661 |     Spawns a new thread to proxy each accepted client connection.
662 | """
663 |
664 | def __init__(self, hostname='127.0.0.1', port=8899, backlog=100,
665 | auth_code=None, server_recvbuf_size=8192, client_recvbuf_size=8192):
666 | super(HTTP, self).__init__(hostname, port, backlog)
667 | self.auth_code = auth_code
668 | self.client_recvbuf_size = client_recvbuf_size
669 | self.server_recvbuf_size = server_recvbuf_size
670 |
671 | def handle(self, client):
672 | proxy = Proxy(client,
673 | auth_code=self.auth_code,
674 | server_recvbuf_size=self.server_recvbuf_size,
675 | client_recvbuf_size=self.client_recvbuf_size)
676 | proxy.daemon = True
677 | proxy.start()
678 |
679 |
680 | def set_open_file_limit(soft_limit):
681 |     """Configure the open file descriptor soft limit on supported OSes."""
682 | if os.name != 'nt': # resource module not available on Windows OS
683 | curr_soft_limit, curr_hard_limit = resource.getrlimit(resource.RLIMIT_NOFILE)
684 | if curr_soft_limit < soft_limit < curr_hard_limit:
685 | resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, curr_hard_limit))
686 | logger.info('Open file descriptor soft limit set to %d' % soft_limit)
687 |
688 |
689 | def main():
690 | parser = argparse.ArgumentParser(
691 | description='proxy.py v%s' % __version__,
692 | epilog='Having difficulty using proxy.py? Report at: %s/issues/new' % __homepage__
693 | )
694 |
695 | parser.add_argument('--hostname', default='127.0.0.1', help='Default: 127.0.0.1')
696 | parser.add_argument('--port', default='8899', help='Default: 8899')
697 | parser.add_argument('--backlog', default='100', help='Default: 100. '
698 | 'Maximum number of pending connections to proxy server')
699 | parser.add_argument('--basic-auth', default=None, help='Default: No authentication. '
700 | 'Specify colon separated user:password '
701 | 'to enable basic authentication.')
702 | parser.add_argument('--server-recvbuf-size', default='8192', help='Default: 8 KB. '
703 | 'Maximum amount of data received from the '
704 | 'server in a single recv() operation. Bump this '
705 | 'value for faster downloads at the expense of '
706 | 'increased RAM.')
707 | parser.add_argument('--client-recvbuf-size', default='8192', help='Default: 8 KB. '
708 | 'Maximum amount of data received from the '
709 | 'client in a single recv() operation. Bump this '
710 | 'value for faster uploads at the expense of '
711 | 'increased RAM.')
712 | parser.add_argument('--open-file-limit', default='1024', help='Default: 1024. '
713 | 'Maximum number of files (TCP connections) '
714 | 'that proxy.py can open concurrently.')
715 | parser.add_argument('--log-level', default='INFO', help='DEBUG, INFO (default), WARNING, ERROR, CRITICAL')
716 | args = parser.parse_args()
717 |
718 | logging.basicConfig(level=getattr(logging, args.log_level),
719 | format='%(asctime)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s')
720 |
721 | try:
722 | set_open_file_limit(int(args.open_file_limit))
723 |
724 | auth_code = None
725 | if args.basic_auth:
726 | auth_code = b'Basic %s' % base64.b64encode(bytes_(args.basic_auth))
727 |
728 | proxy = HTTP(hostname=args.hostname,
729 | port=int(args.port),
730 | backlog=int(args.backlog),
731 | auth_code=auth_code,
732 | server_recvbuf_size=int(args.server_recvbuf_size),
733 | client_recvbuf_size=int(args.client_recvbuf_size))
734 | proxy.run()
735 | except KeyboardInterrupt:
736 | pass
737 |
738 |
739 | if __name__ == '__main__':
740 | main()
741 |
--------------------------------------------------------------------------------
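
A minimal sketch of driving the HttpParser above by hand, handy for seeing its state machine without opening a socket (the request bytes are made up for illustration, and Src is assumed to be on sys.path):

    from Forward.base import HttpParser, CRLF

    parser = HttpParser(HttpParser.types.REQUEST_PARSER)
    raw = CRLF.join([
        b'GET http://example.com/ HTTP/1.1',
        b'Host: example.com',
        CRLF,
    ])
    parser.parse(raw)

    assert parser.state == HttpParser.states.COMPLETE
    print(parser.method, parser.build_url())  # b'GET' b'/'
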
/Src/Log/LogHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 |
5 | import logging
6 |
7 | from logging.handlers import TimedRotatingFileHandler
8 | from Config import ConfigManager
9 |
10 | LOG_LEVEL = {
11 | "CRITICAL": 50,
12 | "FATAL": 50,
13 | "ERROR": 40,
14 | "WARNING": 30,
15 | "WARN": 30,
16 | "INFO": 20,
17 | "DEBUG": 10,
18 | "NOTSET": 0,
19 | }
20 |
21 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
22 | ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir, os.pardir)
23 | LOG_PATH = os.path.join(ROOT_PATH, 'logs')
24 | if not os.path.exists(LOG_PATH):
25 | os.mkdir(LOG_PATH)
26 |
27 | class LogHandler(logging.Logger):
28 |
29 | def __init__(self, level=None, stream=True, file=True):
30 | self.name = "ProxyPool"
31 | if level:
32 | self.level = level
33 | else:
34 | self.level = LOG_LEVEL.get(ConfigManager.base_config.setting.get("log_level"), LOG_LEVEL["INFO"])
35 |
36 | super(LogHandler, self).__init__(self.name, level=self.level)
37 | if stream:
38 | self.__setStreamHandler__()
39 | if file:
40 | self.__setFileHandler__()
41 |
42 | def __setFileHandler__(self, level=None):
43 | file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name))
44 |         # Set up log rotation: files go to the logs directory, one file per day, kept for 15 days
45 | file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15)
46 | file_handler.suffix = '%Y%m%d.log'
47 | if not level:
48 | file_handler.setLevel(self.level)
49 | else:
50 | file_handler.setLevel(level)
51 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
52 |
53 | file_handler.setFormatter(formatter)
54 | self.file_handler = file_handler
55 | self.addHandler(file_handler)
56 |
57 | def __setStreamHandler__(self, level=None):
58 | stream_handler = logging.StreamHandler()
59 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
60 | stream_handler.setFormatter(formatter)
61 | if not level:
62 | stream_handler.setLevel(self.level)
63 | else:
64 | stream_handler.setLevel(level)
65 | self.addHandler(stream_handler)
66 |
67 | def resetName(self, name):
68 | self.name = name
69 | self.removeHandler(self.file_handler)
70 | self.__setFileHandler__()
71 |
72 |
73 | if __name__ == '__main__':
74 | log = LogHandler()
75 | log.info('this is a test msg')
76 |
--------------------------------------------------------------------------------
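
A short sketch of what the handler above produces, assuming the log level comes from Config.ini as in the defaults:

    from Log.LogHandler import LogHandler

    log = LogHandler()           # logs to stderr and to logs/ProxyPool.log
    log.warning("pool is low")   # daily rotation yields files like ProxyPool.log.20190615.log
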
/Src/Log/LogManager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from Log.LogHandler import LogHandler
3 |
4 | log = LogHandler()
5 |
6 | def init():
7 | pass
8 |
9 | if __name__ == '__main__':
10 | log.info('this is a test msg')
11 |
--------------------------------------------------------------------------------
/Src/Log/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Log/__init__.py
--------------------------------------------------------------------------------
/Src/Manager/ProxyClean.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | import sys
5 | sys.path.append("Src")
6 | import time
7 | import threading
8 |
9 | from Manager.ProxyManager import proxy_manager
10 | from Log.LogManager import log
11 | from Config import ConfigManager
12 |
13 | try:
14 |     from Queue import Queue  # py2
15 | except ImportError:
16 |     from queue import Queue  # py3
17 |
18 | # This multithreaded implementation is flawed: it cannot later be scaled out to separate processes.
19 | # The classmethod initQueue must be called before the threads start.
20 | class ProxyClean(threading.Thread):
21 | def __init__(self, **kwargs):
22 | super(ProxyClean, self).__init__(**kwargs)
23 |
24 | class ProxyCleanUseful(ProxyClean):
25 |
26 | def run(self):
27 | hold_number = ConfigManager.setting_config.setting.get("hold_useful_proxy_number")
28 | total_number = proxy_manager.getUsefulProxyNumber()
29 | clean_number = proxy_manager.cleanUsefulProxy(hold_number=hold_number)
30 |
31 |
32 | log.info("clean useful, total_number:{total_number}, clean_number:{clean_number}, hold_number:{hold_number}".format(total_number=total_number, clean_number=clean_number, hold_number=hold_number))
33 |
34 | class ProxyCleanRaw(ProxyClean):
35 |
36 | def run(self):
37 | total_number = proxy_manager.getRawProxyNumber()
38 | clean_number = proxy_manager.cleanRawProxy()
39 | remain_number = total_number - clean_number
40 |
41 | log.info("clean raw_proxy, total_number:{total_number}, clean_number:{clean_number}, remain_number:{remain_number}".format(total_number=total_number, clean_number=clean_number, remain_number=remain_number))
42 |
43 | if __name__ == "__main__":
44 | t1 = ProxyCleanUseful()
45 | t1.daemon = True
46 | t1.start()
47 |
48 | t2 = ProxyCleanRaw()
49 | t2.daemon = True
50 | t2.start()
51 |
52 | t1.join()
53 | t2.join()
--------------------------------------------------------------------------------
/Src/Manager/ProxyFetch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | from gevent import monkey, pool
5 | monkey.patch_all()
6 |
7 | import sys
8 | sys.path.append("Src")
9 | import time
10 | import threading
11 | import gevent
12 |
13 | from Manager import ProxyManager
14 | # from ProxyGetter.getFreeProxy import GetFreeProxy
15 | from Fetcher import FetcherManager
16 | from Log.LogManager import log
17 | from Config import ConfigManager
18 | from Util.utilFunction import verifyProxyFormat
19 |
20 | try:
21 |     from Queue import Queue  # py2
22 | except ImportError:
23 |     from queue import Queue  # py3
24 |
25 | # This multithreaded implementation is flawed: it cannot later be scaled out to separate machines.
26 | # The classmethod initQueue must be called first.
27 | class ProxyFetch(object):
28 | queue = Queue()
29 |
30 | @classmethod
31 | def initQueue(cls):
32 | fetchers = ProxyManager.proxy_manager.getExecFetcher()
33 | for fetcher in fetchers:
34 | cls.queue.put(fetcher)
35 |
36 | def start(self):
37 | concurrency = ConfigManager.setting_config.setting.get("fetch_new_proxy_concurrency")
38 | task_pool = pool.Pool(concurrency)
39 |
40 | queue_size = self.queue.qsize()
41 | if queue_size > 0:
42 | greenlet_list = []
43 | for _ in range(queue_size):
44 | greenlet_list.append(task_pool.spawn(self.fetch))
45 |
46 | gevent.joinall(greenlet_list)
47 | else:
48 |             log.info("no fetcher is due to run now, skip!")
49 |
50 | def fetch(self):
51 | start_time = time.time()
52 | total = 0
53 | succ = 0
54 | fail = 0
55 | skip = 0
56 |
57 | fetcher = self.queue.get()
58 | name = fetcher["name"]
59 |
60 | fetcher_class = FetcherManager.getFetcherClass(name)
61 | log.debug("fetch [{name}] proxy start".format(name=name))
62 | try:
63 | f = fetcher_class()
64 | for proxy in f.run():
65 | proxy = proxy.strip()
66 | if proxy and verifyProxyFormat(proxy) and \
67 | not ProxyManager.proxy_manager.checkUsefulProxyExists(proxy):
68 |
69 | ProxyManager.proxy_manager.saveUsefulProxy(proxy)
70 | succ = succ + 1
71 | log.debug("fetch [{name}] proxy {proxy} succ".format(name=name, proxy=proxy))
72 | else:
73 | skip = skip + 1
74 | log.debug("fetch [{name}] proxy {proxy} skip".format(name=name, proxy=proxy))
75 |
76 | total = total + 1
77 | except Exception as e:
78 | log.error("fetch [{name}] proxy fail: {error}".format(name=name, error=e))
79 | fail = fail + 1
80 |
81 | self.queue.task_done()
82 |
83 | now = int(time.time())
84 | elapsed_time = int(now - start_time)
85 |
86 | next_fetch_time = now + (fetcher["interval"] * 60)
87 |
88 | data = {
89 | "$inc": {
90 | "succ": succ,
91 | "fail": fail,
92 | "skip": skip,
93 | "total": total,
94 | },
95 | "$set": {
96 | "next_fetch_time": next_fetch_time,
97 | }
98 | }
99 |
100 | ProxyManager.proxy_manager.updateFetcher(name, data)
101 | log.info("fetch [{name:^15}] proxy finish, \
102 | total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s". \
103 | format(name=name, total=total, succ=succ, fail=fail, skip=skip, elapsed_time=elapsed_time))
104 |
105 | def run(self):
106 | while self.queue.qsize():
107 | self.fetch()
108 |
109 |
110 |
111 | if __name__ == "__main__":
112 | ProxyFetch.initQueue()
113 | t = ProxyFetch()
114 | t.start()
--------------------------------------------------------------------------------
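
The scheduling in ProxyFetch.start boils down to a bounded gevent pool draining a queue, one item per greenlet. A self-contained sketch of that pattern, with a sleep standing in for the real network fetch:

    from gevent import monkey, pool
    monkey.patch_all()

    import gevent

    try:
        from Queue import Queue  # py2
    except ImportError:
        from queue import Queue  # py3

    tasks = Queue()
    for n in range(5):
        tasks.put(n)

    def worker():
        tasks.get()          # each greenlet consumes exactly one queue item
        gevent.sleep(0.1)    # stand-in for the fetcher's network I/O
        tasks.task_done()

    task_pool = pool.Pool(2)  # at most two greenlets run concurrently
    greenlets = [task_pool.spawn(worker) for _ in range(tasks.qsize())]
    gevent.joinall(greenlets)
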
/Src/Manager/ProxyManager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | import random
5 |
6 | import datx
7 | import time
8 |
9 | from Util import EnvUtil
10 | from DB.DbClient import UsefulProxyDocsModel, RawProxyDocsModel, DomainCounterDocsModel, FetchersDocsModel
11 | from Config import ConfigManager
12 | from Util.utilFunction import verifyProxyFormat
13 | from ProxyGetter.getFreeProxy import GetFreeProxy
14 | from Log.LogManager import log
15 |
16 | PROXY_LAST_STATUS = {
17 | "UNKNOWN": 0,
18 | "SUCC": 1,
19 | "FAIL": 2,
20 | }
21 |
22 | PROXY_TYPE = {
23 | "UNKNOWN": 0,
24 | "CLEAR": 1,
25 | "ANONYMOUS": 2,
26 | "DYNAMIC": 3,
27 | }
28 |
29 | PROXY_HTTPS = {
30 | "UNKNOWN": 0,
31 | "ENABLE": 1,
32 | "DISABLE": 2,
33 | }
34 |
35 | IP_DATA_PATH = "Data/17monipdb.datx"
36 |
37 | class ProxyManager(object):
38 |
39 | def __init__(self):
40 | self.useful_proxy = UsefulProxyDocsModel()
41 | self.raw_proxy = RawProxyDocsModel()
42 | self.domain_counter = DomainCounterDocsModel()
43 | self.fetchers = FetchersDocsModel()
44 | self.datx = datx.City(IP_DATA_PATH)
45 |
46 | self.quality_useful_proxy_list = []
47 | self.quality_domain_index = {}
48 |
49 | def cleanUsefulProxy(self, **kwargs):
50 | result = self.useful_proxy.cleanUsefulProxy(**kwargs)
51 | return result
52 |
53 | def cleanRawProxy(self, **kwargs):
54 | result = self.raw_proxy.cleanRawProxy(**kwargs)
55 | return result
56 |
57 | def getAllValidUsefulProxy(self, **kwargs):
58 | result = self.useful_proxy.getAllValidUsefulProxy(**kwargs)
59 | return result
60 |
61 | def getAllUsefulProxy(self, **kwargs):
62 | result = self.useful_proxy.getAllUsefulProxy(**kwargs)
63 | return result
64 |
65 | def getVerifyUsefulProxy(self):
66 | now = int(time.time())
67 | result = self.useful_proxy.getVerifyUsefulProxy(now)
68 | return result
69 |
70 |     def getLowQualityUsefulProxy(self, **kwargs):
71 |         result = self.useful_proxy.getLowQualityUsefulProxy(**kwargs)
72 |         return result
73 | 
74 |     def getHighQualityUsefulProxy(self, **kwargs):
75 |         result = self.useful_proxy.getHighQualityUsefulProxy(**kwargs)
76 |         return result
77 |
78 | def getAllRawProxy(self):
79 | result = self.raw_proxy.getAllRawProxy()
80 | return result
81 |
82 | def checkRawProxyExists(self, proxy):
83 | result = self.raw_proxy.checkRawProxyExists(proxy)
84 | return result
85 |
86 | def checkUsefulProxyExists(self, proxy):
87 | result = self.useful_proxy.checkUsefulProxyExists(proxy)
88 | return result
89 |
90 | def getSampleUsefulProxy(self, **kwargs):
91 | result = self.useful_proxy.getSampleUsefulProxy(**kwargs)
92 | return result
93 |
94 | def getQualityUsefulProxy(self, **kwargs):
95 | count = kwargs.get("count", 1)
96 | domain = kwargs.get("domain", None)
97 |
98 | index = self.quality_domain_index.get(domain, 0)
99 |
100 | if index == 0:
101 | self.quality_useful_proxy_list = self.useful_proxy.getQualityUsefulProxy(**kwargs)
102 |
103 | index = (count-1) % len(self.quality_useful_proxy_list)
104 | self.quality_domain_index[domain] = index+1
105 |
106 | result = self.quality_useful_proxy_list[index]
107 | return result
108 |
109 | def deleteRawProxy(self, proxy):
110 | self.raw_proxy.deleteRawProxy(proxy)
111 |
112 | def saveRawProxy(self, proxy):
113 | data = {
114 | "proxy": proxy,
115 | "health": ConfigManager.setting_config.setting.get("init_raw_proxy_health")
116 | }
117 | self.raw_proxy.saveRawProxy(data)
118 |
119 | def getProxyRegion(self, ip):
120 | data = self.datx.find(ip)
121 | region_list = data[:3]
122 | result = []
123 | for item in region_list:
124 | if item and item not in result:
125 | result.append(item)
126 |
127 | return result
128 |
129 | def saveUsefulProxy(self, proxy):
130 | ip = proxy.split(":")[0]
131 | region_list = self.getProxyRegion(ip)
132 |
133 | data = {
134 | "proxy": proxy,
135 | "succ": 0,
136 | "keep_succ": 0,
137 | "fail": 0,
138 | "total": 0,
139 | "quality": 0,
140 | "https": PROXY_HTTPS["UNKNOWN"],
141 | "type": PROXY_TYPE["UNKNOWN"],
142 | "region_list": region_list,
143 | "last_status": PROXY_LAST_STATUS["UNKNOWN"],
144 | "last_succ_time": 0,
145 | "next_verify_time": 0,
146 |
147 | }
148 |
149 | self.useful_proxy.saveUsefulProxy(data)
150 |
151 | def updateUsefulProxy(self, item, info):
152 | data = {
153 | "$set": {}
154 | }
155 |
156 | if item.get("type") == PROXY_TYPE["UNKNOWN"]:
157 | data["$set"]["type"] = info["type"]
158 |
159 | if item.get("https") == PROXY_HTTPS["UNKNOWN"]:
160 | data["$set"]["https"] = info["https"]
161 |
162 | if len(data["$set"]) > 0:
163 | self.useful_proxy.updateUsefulProxy(item["proxy"], data)
164 |
165 | def deleteUsefulProxy(self, proxy):
166 | self.useful_proxy.deleteUsefulProxy(proxy)
167 |
168 | def tickUsefulProxyVaildSucc(self, proxy):
169 | self.useful_proxy.tickUsefulProxyVaildSucc(proxy)
170 |
171 | def tickUsefulProxyVaildFail(self, proxy):
172 | self.useful_proxy.tickUsefulProxyVaildFail(proxy)
173 |
174 | def tickUsefulProxyVaildTotal(self, proxy):
175 | self.useful_proxy.tickUsefulProxyVaildTotal(proxy)
176 |
177 | def updateUsefulProxyNextVerifyTime(self, proxy, start_time=None):
178 |
179 | item = self.getProxy(proxy)
180 | multiple = abs(item["quality"])
181 | if item["quality"] > 0:
182 | multiple = 0
183 |
184 | start_time = start_time if start_time else int(time.time())
185 | interval = ConfigManager.setting_config.setting.get("verify_useful_proxy_interval")
186 | next_verify_time = start_time + (multiple * interval * 60)
187 |
188 | query = {
189 | "proxy": proxy
190 | }
191 | data = {
192 | "$set": {
193 | "next_verify_time": next_verify_time
194 | }
195 | }
196 | self.useful_proxy.updateProxy(query, data)
197 |
198 | def tickRawProxyVaildFail(self, proxy):
199 | self.raw_proxy.tickRawProxyVaildFail(proxy)
200 |
201 | def getProxy(self, proxy):
202 | result = self.useful_proxy.getProxy(proxy)
203 | return result
204 |
205 | def getProxyNumber(self):
206 | total_raw_proxy = self.getRawProxyNumber()
207 | total_useful_queue = self.getUsefulProxyNumber()
208 | result = {'raw_proxy': total_raw_proxy, 'useful_proxy': total_useful_queue}
209 | return result
210 |
211 | def getRawProxyNumber(self):
212 | result = self.raw_proxy.getProxyNum()
213 | return result
214 |
215 | def getUsefulProxyNumber(self):
216 | result = self.useful_proxy.getProxyNum()
217 | return result
218 |
219 | def tickDomainRequestState(self, domain, code):
220 |
221 | self.domain_counter.tickDomainRequestState(domain, code)
222 |
223 | def getDomainCounter(self, domain):
224 | result = self.domain_counter.getDomainCounter(domain)
225 | return result
226 |
227 |
228 | def getAllFetcher(self):
229 | result = self.fetchers.getAllFetcher()
230 | return result
231 |
232 | def getExecFetcher(self):
233 | now = int(time.time())
234 | result = self.fetchers.getExecFetcher(now)
235 | return result
236 |
237 | def getFetcher(self, name):
238 | result = self.fetchers.getFetcher(name)
239 | return result
240 |
241 |
242 | def updateFetcher(self, name, data):
243 | self.fetchers.updateFetcher(name, data)
244 |
245 | proxy_manager = ProxyManager()
246 |
247 | if __name__ == '__main__':
248 | # proxy_manager.refresh()
249 | pass
250 |
--------------------------------------------------------------------------------
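
The data documents passed to updateFetcher above use MongoDB update-operator syntax ($inc, $set). A hypothetical pymongo equivalent, assuming a fetchers collection keyed by name (the database and collection names here are guesses, not taken from DbClient):

    from pymongo import MongoClient

    fetchers = MongoClient()["proxy_pool"]["fetchers"]  # assumed names
    fetchers.update_one(
        {"name": "kuaidaili"},
        {
            "$inc": {"succ": 3, "fail": 0, "skip": 1, "total": 4},
            "$set": {"next_fetch_time": 1560600000},
        },
    )
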
/Src/Manager/ProxyVerify.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | from gevent import monkey, pool
5 | monkey.patch_all()
6 |
7 | import sys
8 | sys.path.append("Src")
9 |
10 | import threading
11 | import requests
12 | import re
13 | import time
14 | import gevent
15 |
16 | from Manager import ProxyManager
17 | from Log.LogManager import log
18 | from Config import ConfigManager
19 |
20 | try:
21 |     from Queue import Queue  # py2
22 | except ImportError:
23 |     from queue import Queue  # py3
24 |
25 | class ProxyVerify(object):
26 |
27 |     # https is only checked after http succeeds; might some proxies support only https?
28 | def getProxyInfo(self, proxy):
29 | info = {}
30 |
31 | data = proxy.split(':')
32 | info["ip"] = data[0]
33 | info["port"] = data[1]
34 | info["address"] = proxy
35 |
36 | proxies = {
37 | "http": proxy,
38 | "https": proxy,
39 | }
40 | http_url = "http://httpbin.org/ip"
41 | https_url = "https://httpbin.org/ip"
42 |
43 | result = False
44 |
45 | info["https"] = ProxyManager.PROXY_HTTPS["UNKNOWN"]
46 | info["type"] = ProxyManager.PROXY_TYPE["UNKNOWN"]
47 | # http verify
48 | try:
49 | r = requests.get(http_url, proxies=proxies, timeout=10, verify=False)
50 | data = r.json()
51 | ip_string = data["origin"]
52 | ip_list = ip_string.split(", ")
53 |
54 | status_result = r.status_code == 200
55 | content_result = "origin" in data
56 | if status_result and content_result:
57 | result = True
58 |
59 | if len(ip_list) > 1:
60 | info["type"] = ProxyManager.PROXY_TYPE["CLEAR"]
61 | else:
62 | info["type"] = ProxyManager.PROXY_TYPE["ANONYMOUS"]
63 |
64 | except Exception as e:
65 | log.debug("proxy:[{proxy}] http verify fail, error:{error}".format(proxy=proxy, error=e))
66 | result = False
67 |
68 | if result:
69 |
70 | # https verify
71 | try:
72 | r = requests.get(https_url, proxies=proxies, timeout=10, verify=False)
73 | status_result = r.status_code == 200
74 |                 content_result = "origin" in r.json()  # parse this response; the data above came from the http check
75 | if status_result and content_result:
76 | info["https"] = ProxyManager.PROXY_HTTPS["ENABLE"]
77 |
78 | except Exception as e:
79 | log.debug("proxy [{proxy}] https verify fail, error:{error}".format(proxy=proxy, error=e))
80 | info["https"] = ProxyManager.PROXY_HTTPS["DISABLE"]
81 |
82 | return info
83 |
84 | def defaultVerifyProxy(self, proxy):
85 | result = None
86 |
87 | if isinstance(proxy, bytes):
88 | proxy = proxy.decode('utf8')
89 |
90 | proxies = {
91 | "http": proxy,
92 | }
93 | http_url = "http://httpbin.org/ip"
94 |
95 | try:
96 | r = requests.get(http_url, proxies=proxies, timeout=10, verify=False)
97 | data = r.json()
98 |
99 | status_result = r.status_code == 200
100 | content_result = "origin" in data
101 | if status_result and content_result:
102 | result = True
103 |
104 | except Exception as e:
105 | log.debug("proxy:{proxy} http verify proxy fail, error:{error}".format(proxy=proxy, error=e))
106 | result = False
107 |
108 | return result
109 |
110 | def customVerifyProxy(self, proxy):
111 | result = None
112 |
113 | if isinstance(proxy, bytes):
114 | proxy = proxy.decode('utf8')
115 |
116 | proxies = {
117 | "http": proxy,
118 | "https": proxy,
119 | }
120 | verify_url = ConfigManager.setting_config.setting.get("custom_verify_url")
121 |
122 | try:
123 | content_result = True
124 | r = requests.get(verify_url, proxies=proxies, timeout=10, verify=False)
125 | pattern = ConfigManager.setting_config.setting.get("custom_verify_content")
126 | if pattern:
127 | content = r.content.decode('utf-8')
128 | search_result = re.search(pattern, content)
129 |                     content_result = search_result is not None
130 |
131 | status_result = r.status_code == 200
132 | if status_result and content_result:
133 | result = True
134 |
135 | except Exception as e:
136 | log.debug("proxy:{proxy} http verify proxy fail, error:{error}".format(proxy=proxy, error=e))
137 | result = False
138 |
139 | return result
140 |
141 | def verify(self):
142 | pass
143 |
144 | def run(self):
145 | while self.queue.qsize():
146 | self.verify()
147 |
148 | # This multithreaded implementation is flawed: it cannot later be scaled out to separate machines.
149 | # The classmethod initQueue must be called first.
150 | class ProxyVerifyRaw(ProxyVerify):
151 | queue = Queue()
152 | useful_proxies = {}
153 |
154 | @classmethod
155 | def initQueue(cls):
156 | items = ProxyManager.proxy_manager.getAllRawProxy()
157 | for item in items:
158 | cls.queue.put(item)
159 |
160 | items = ProxyManager.proxy_manager.getAllUsefulProxy()
161 | data = { item["proxy"]: 1 for item in items }
162 | cls.useful_proxies = data
163 |
164 | def verify(self):
165 | raw_proxy_item = self.queue.get()
166 | raw_proxy = raw_proxy_item.get("proxy")
167 | if isinstance(raw_proxy, bytes):
168 | raw_proxy = raw_proxy.decode('utf8')
169 |
170 | if raw_proxy not in self.useful_proxies:
171 | if ConfigManager.setting_config.setting.get("custom_verify_url"):
172 | verify_result = self.customVerifyProxy(raw_proxy)
173 | else:
174 | verify_result = self.defaultVerifyProxy(raw_proxy)
175 |
176 | if verify_result:
177 | ProxyManager.proxy_manager.saveUsefulProxy(raw_proxy)
178 | ProxyManager.proxy_manager.deleteRawProxy(raw_proxy)
179 | self.useful_proxies[raw_proxy] = True
180 |
181 | self.stat["succ"] = self.stat["succ"] + 1
182 | log.debug("raw_proxy:{raw_proxy} verify succ".format(raw_proxy=raw_proxy))
183 | else:
184 | ProxyManager.proxy_manager.tickRawProxyVaildFail(raw_proxy)
185 |
186 | self.stat["fail"] = self.stat["fail"] + 1
187 | log.debug("raw_proxy:{raw_proxy} verify fail".format(raw_proxy=raw_proxy))
188 | else:
189 | ProxyManager.proxy_manager.deleteRawProxy(raw_proxy)
190 |
191 | self.stat["skip"] = self.stat["skip"] + 1
192 | log.debug("raw_proxy:{raw_proxy} verify repetition".format(raw_proxy=raw_proxy))
193 |
194 | self.queue.task_done()
195 | self.stat["total"] = self.stat["total"] + 1
196 |
197 | def start(self):
198 |
199 | start_time = time.time()
200 | log.debug("raw_proxy proxy verify start")
201 |
202 | self.stat = dict(
203 | total = 0,
204 | succ = 0,
205 | fail = 0,
206 | skip = 0,
207 | )
208 |
209 | concurrency = ConfigManager.setting_config.setting.get("verify_raw_proxy_concurrency")
210 | queue_size = self.queue.qsize()
211 | if concurrency > queue_size:
212 | spawn_num = queue_size
213 | else:
214 | spawn_num = concurrency
215 |
216 | greenlet_list = []
217 | for _ in range(spawn_num):
218 | greenlet_list.append(gevent.spawn(self.run))
219 |
220 | gevent.joinall(greenlet_list)
221 |
222 | end_time = time.time()
223 | elapsed_time = int(end_time - start_time)
224 | log.info("raw_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, skip:{skip}, elapsed_time:{elapsed_time}s".format(total=self.stat["total"], succ=self.stat["succ"], fail=self.stat["fail"], skip=self.stat["skip"], elapsed_time=elapsed_time))
225 |
226 | # This multithreaded implementation is flawed: it cannot later be scaled out to separate machines.
227 | # The classmethod initQueue must be called first.
228 | class ProxyVerifyUseful(ProxyVerify):
229 | queue = Queue()
230 |
231 | @classmethod
232 | def initQueue(cls):
233 | proxies = ProxyManager.proxy_manager.getVerifyUsefulProxy()
234 | for proxy in proxies:
235 | cls.queue.put(proxy)
236 |
237 | def checkProxyInfo(self, item):
238 | result = False
239 | if item.get("type") == ProxyManager.PROXY_TYPE["UNKNOWN"] or item.get("type") == None:
240 | result = True
241 |
242 | if item.get("https") == ProxyManager.PROXY_HTTPS["UNKNOWN"] or item.get("https") == None:
243 | result = True
244 |
245 | return result
246 |
247 | def updateUsefulProxy(self, item):
248 | proxy = item.get("proxy")
249 | info = self.getProxyInfo(proxy)
250 | ProxyManager.proxy_manager.updateUsefulProxy(item, info)
251 |
252 | def verify(self):
253 | item = self.queue.get()
254 | proxy = item.get("proxy")
255 |
256 | if ConfigManager.setting_config.setting.get("custom_verify_url"):
257 | verify_result = self.customVerifyProxy(proxy)
258 | else:
259 | verify_result = self.defaultVerifyProxy(proxy)
260 |
261 | if verify_result:
262 | if self.checkProxyInfo(item):
263 | self.updateUsefulProxy(item)
264 |
265 | ProxyManager.proxy_manager.tickUsefulProxyVaildSucc(proxy)
266 | self.stat["succ"] = self.stat["succ"] + 1
267 | log.debug("useful_proxy:{proxy} verify succ".format(proxy=proxy))
268 | else:
269 | ProxyManager.proxy_manager.tickUsefulProxyVaildFail(proxy)
270 | self.stat["fail"] = self.stat["fail"] + 1
271 | log.debug("useful_proxy:{proxy} verify fail".format(proxy=proxy))
272 |
273 | self.queue.task_done()
274 | ProxyManager.proxy_manager.tickUsefulProxyVaildTotal(proxy)
275 | ProxyManager.proxy_manager.updateUsefulProxyNextVerifyTime(proxy, self.start_time)
276 | self.stat["total"] = self.stat["total"] + 1
277 |
278 | def start(self):
279 |
280 | start_time = time.time()
281 | self.start_time = int(start_time)
282 |
283 | log.debug("useful_proxy proxy verify start")
284 |
285 | self.stat = dict(
286 | total = 0,
287 | succ = 0,
288 | fail = 0,
289 | )
290 |
291 | concurrency = ConfigManager.setting_config.setting.get("verify_useful_proxy_concurrency")
292 | task_pool = pool.Pool(concurrency)
293 |
294 | queue_size = self.queue.qsize()
295 | greenlet_list = []
296 | for _ in range(queue_size):
297 | greenlet_list.append(task_pool.spawn(self.verify))
298 |
299 | gevent.joinall(greenlet_list)
300 |
301 | end_time = time.time()
302 | elapsed_time = int(end_time - start_time)
303 | log.info('useful_proxy verify proxy finish, total:{total}, succ:{succ}, fail:{fail}, elapsed_time:{elapsed_time}s'
304 | .format(total=self.stat["total"], succ=self.stat["succ"], fail=self.stat["fail"], elapsed_time=elapsed_time))
305 |
306 | if __name__ == "__main__":
307 | ProxyVerifyRaw.initQueue()
308 | t = ProxyVerifyRaw()
309 | t.start()
310 |
311 | ProxyVerifyUseful.initQueue()
312 | t = ProxyVerifyUseful()
313 | t.start()
--------------------------------------------------------------------------------
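
The type detection in getProxyInfo leans on httpbin echoing every hop in the origin field; a condensed sketch of just that check:

    import requests

    proxy = "1.2.3.4:8080"  # placeholder address
    data = requests.get("http://httpbin.org/ip",
                        proxies={"http": "http://" + proxy}, timeout=10).json()

    # "origin" reads "client_ip, proxy_ip" when the proxy leaks the caller's
    # address (transparent); a single IP means the proxy is anonymous.
    is_transparent = len(data["origin"].split(", ")) > 1
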
/Src/Manager/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Manager/__init__.py
--------------------------------------------------------------------------------
/Src/Notify/NotifyManager.py:
--------------------------------------------------------------------------------
1 |
2 | NOTIFY_LIST = [
3 | "AFTER_SETTING_CHANGE"
4 | ]
5 |
6 | NOTIFY_HANDLER = {}
7 |
8 | NOTIFY_EVENT = {}
9 | for name in NOTIFY_LIST:
10 | NOTIFY_EVENT[name] = name
11 | NOTIFY_HANDLER[name] = []
12 |
13 | def register_event(name, handler):
14 | handler_list = NOTIFY_HANDLER[name]
15 | handler_list.append(handler)
16 |
17 | def dispatch_event(name, **kwargs):
18 |     if name in NOTIFY_HANDLER:
19 |         handler_list = NOTIFY_HANDLER[name]
20 |         for handler in handler_list:
21 |             try:
22 |                 handler(**kwargs)
23 |             except Exception:
24 |                 # swallow handler errors so one failing handler
25 |                 # does not prevent the remaining handlers from running
26 |                 pass
27 |
--------------------------------------------------------------------------------
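
A usage sketch for the registry above: register a handler for the one built-in event, then dispatch it with keyword arguments.

    from Notify import NotifyManager

    def on_setting_change(**kwargs):
        print("settings changed:", kwargs)

    NotifyManager.register_event(
        NotifyManager.NOTIFY_EVENT["AFTER_SETTING_CHANGE"], on_setting_change)
    NotifyManager.dispatch_event("AFTER_SETTING_CHANGE", key="log_level", value="DEBUG")
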
/Src/Notify/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Notify/__init__.py
--------------------------------------------------------------------------------
/Src/ProxyGetter/CheckProxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 | from getFreeProxy import GetFreeProxy
5 | from Util.utilFunction import verifyProxyFormat
6 |
7 |
8 | from Log.LogHandler import LogHandler
9 | 
10 | log = LogHandler(file=False)  # LogHandler takes (level, stream, file); it has no name parameter
11 |
12 |
13 | class CheckProxy(object):
14 |
15 | @staticmethod
16 | def checkAllGetProxyFunc():
17 | """
18 |         Check how every proxy-fetching function in getFreeProxy runs
19 | Returns:
20 | None
21 | """
22 | import inspect
23 | member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction)
24 | proxy_count_dict = dict()
25 | for func_name, func in member_list:
26 |             log.info(u"start running {}".format(func_name))
27 | try:
28 | proxy_list = [_ for _ in func() if verifyProxyFormat(_)]
29 | proxy_count_dict[func_name] = len(proxy_list)
30 | except Exception as e:
31 |                 log.info(u"proxy fetch function {} failed to run!".format(func_name))
32 |                 log.error(str(e))
33 |         log.info(u"all functions finished " + "***" * 5)
34 |         for func_name, func in member_list:
35 |             log.info(u"function {n}, proxies fetched: {c}".format(n=func_name, c=proxy_count_dict.get(func_name, 0)))
36 |
37 | @staticmethod
38 | def checkGetProxyFunc(func):
39 | """
40 |         Check how a specified proxy-fetching function from getFreeProxy runs
41 |         Args:
42 |             func: a callable method defined on GetFreeProxy
43 |
44 | Returns:
45 | None
46 | """
47 | func_name = getattr(func, '__name__', "None")
48 | log.info("start running func: {}".format(func_name))
49 | count = 0
50 | for proxy in func():
51 | if verifyProxyFormat(proxy):
52 | log.info("fetch proxy: {}".format(proxy))
53 | count += 1
54 | log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count))
55 |
56 |
57 | if __name__ == '__main__':
58 | CheckProxy.checkAllGetProxyFunc()
59 | CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxyFirst)
60 |
--------------------------------------------------------------------------------
/Src/ProxyGetter/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 |    File Name:   __init__.py
5 | Description :
6 | Author : JHao
7 | date: 2016/11/25
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/11/25:
11 | -------------------------------------------------
12 | """
--------------------------------------------------------------------------------
/Src/ProxyGetter/getFreeProxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | import re
5 | import sys
6 | import requests
7 |
8 | from Util.WebRequest import WebRequest
9 | from Util.utilFunction import getHtmlTree
10 | from Util.utilFunction import verifyProxyFormat
11 |
12 | # for debug to disable insecureWarning
13 | requests.packages.urllib3.disable_warnings()
14 |
15 | class GetFreeProxy(object):
16 |
17 | def __init__(self):
18 | pass
19 |
20 | @staticmethod
21 | def freeProxyFirst(page=10):
22 | url_list = [
23 | 'http://www.data5u.com/',
24 | 'http://www.data5u.com/free/gngn/index.shtml',
25 | 'http://www.data5u.com/free/gnpt/index.shtml'
26 | ]
27 | for url in url_list:
28 | html_tree = getHtmlTree(url)
29 | ul_list = html_tree.xpath('//ul[@class="l2"]')
30 | for ul in ul_list:
31 | try:
32 | yield ':'.join(ul.xpath('.//li/text()')[0:2])
33 | except Exception as e:
34 | print(e)
35 |
36 | @staticmethod
37 | def freeProxySecond(area=33, page=1):
38 | area = 33 if area > 33 else area
39 | for area_index in range(1, area + 1):
40 | for i in range(1, page + 1):
41 | url = "http://www.66ip.cn/areaindex_{}/{}.html".format(area_index, i)
42 | html_tree = getHtmlTree(url)
43 | tr_list = html_tree.xpath("//*[@id='footer']/div/table/tr[position()>1]")
44 | if len(tr_list) == 0:
45 | continue
46 | for tr in tr_list:
47 | yield tr.xpath("./td[1]/text()")[0] + ":" + tr.xpath("./td[2]/text()")[0]
48 | break
49 |
50 | @staticmethod
51 | def freeProxyThird(days=1):
52 | url = 'http://www.ip181.com/'
53 | html_tree = getHtmlTree(url)
54 | try:
55 | tr_list = html_tree.xpath('//tr')[1:]
56 | for tr in tr_list:
57 | yield ':'.join(tr.xpath('./td/text()')[0:2])
58 | except Exception as e:
59 | pass
60 |
61 | @staticmethod
62 | def freeProxyFourth(page_count=2):
63 | url_list = [
64 |             'http://www.xicidaili.com/nn/',  # high anonymity
65 |             'http://www.xicidaili.com/nt/',  # transparent
66 | ]
67 | for each_url in url_list:
68 | for i in range(1, page_count + 1):
69 | page_url = each_url + str(i)
70 | tree = getHtmlTree(page_url)
71 | proxy_list = tree.xpath('.//table[@id="ip_list"]//tr[position()>1]')
72 | for proxy in proxy_list:
73 | try:
74 | yield ':'.join(proxy.xpath('./td/text()')[0:2])
75 | except Exception as e:
76 | pass
77 |
78 | @staticmethod
79 | def freeProxyFifth():
80 | url = "http://www.goubanjia.com/"
81 | tree = getHtmlTree(url)
82 | proxy_list = tree.xpath('//td[@class="ip"]')
83 |         # the site injects hidden decoy digits, so stray digits or '.' characters get scraped;
84 |         # the xpath below filters that content out
85 | xpath_str = """.//*[not(contains(@style, 'display: none'))
86 | and not(contains(@style, 'display:none'))
87 | and not(contains(@class, 'port'))
88 | ]/text()
89 | """
90 | for each_proxy in proxy_list:
91 | try:
92 |                 # the ':' sits bare under the td while the rest is inside div/span/p, so extract the ip first, then the port
93 | ip_addr = ''.join(each_proxy.xpath(xpath_str))
94 | port = each_proxy.xpath(".//span[contains(@class, 'port')]/text()")[0]
95 | yield '{}:{}'.format(ip_addr, port)
96 | except Exception as e:
97 | pass
98 |
99 | @staticmethod
100 | def freeProxySixth():
101 | url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
102 | request = WebRequest()
103 | try:
104 | res = request.get(url).json()
105 | for row in res['RESULT']['rows']:
106 | yield '{}:{}'.format(row['ip'], row['port'])
107 | except Exception as e:
108 | pass
109 |
110 | @staticmethod
111 | def freeProxySeventh():
112 | url_list = [
113 | 'https://www.kuaidaili.com/free/inha/{page}/',
114 | 'https://www.kuaidaili.com/free/intr/{page}/'
115 | ]
116 | for url in url_list:
117 | for page in range(1, 5):
118 | page_url = url.format(page=page)
119 | tree = getHtmlTree(page_url)
120 | proxy_list = tree.xpath('.//table//tr')
121 | for tr in proxy_list[1:]:
122 | yield ':'.join(tr.xpath('./td/text()')[0:2])
123 |
124 | @staticmethod
125 | def freeProxyEight():
126 |         url_gngao = ['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)]  # domestic high-anonymity
127 |         url_gnpu = ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)]  # domestic ordinary-anonymity
128 |         url_gntou = ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)]  # domestic transparent
129 | url_list = url_gngao + url_gnpu + url_gntou
130 |
131 | request = WebRequest()
132 | for url in url_list:
133 | r = request.get(url)
134 |             proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>', r.text)
135 | for proxy in proxies:
136 | yield ':'.join(proxy)
137 |
138 | @staticmethod
139 | def freeProxyNinth():
140 | urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
141 | request = WebRequest()
142 | for url in urls:
143 | r = request.get(url)
144 | proxies = re.findall('data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)', r.text)
145 | for proxy in proxies:
146 | yield ':'.join(proxy)
147 |
148 | @staticmethod
149 | def freeProxyTen():
150 | urls = [
151 | "http://www.ip3366.net/free/?stype=1",
152 | "http://www.ip3366.net/free/?stype=2",
153 | "http://www.ip3366.net/free/?stype=3",
154 | "http://www.ip3366.net/free/?stype=4",
155 | ]
156 | request = WebRequest()
157 | for url in urls:
158 | r = request.get(url)
159 | proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
160 | for proxy in proxies:
161 | yield ":".join(proxy)
162 |
163 | @staticmethod
164 | def freeProxyEleven():
165 | urls = [
166 | 'http://www.iphai.com/free/ng',
167 | 'http://www.iphai.com/free/np',
168 | 'http://www.iphai.com/free/wg',
169 | 'http://www.iphai.com/free/wp'
170 | ]
171 | request = WebRequest()
172 | for url in urls:
173 | r = request.get(url)
174 | proxies = re.findall(r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>[\s\S]*?<td>\s*?(\d+)\s*?</td>',
175 | r.text)
176 | for proxy in proxies:
177 | yield ":".join(proxy)
178 |
179 | @staticmethod
180 | def freeProxyTwelve(page_count=8):
181 | for i in range(1, page_count + 1):
182 | url = 'http://ip.jiangxianli.com/?page={}'.format(i)
183 | html_tree = getHtmlTree(url)
184 | tr_list = html_tree.xpath("/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr")
185 | if len(tr_list) == 0:
186 | continue
187 | for tr in tr_list:
188 | yield tr.xpath("./td[2]/text()")[0] + ":" + tr.xpath("./td[3]/text()")[0]
189 |
190 | @staticmethod
191 | def freeProxyWallFirst():
192 | urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
193 | request = WebRequest()
194 | for url in urls:
195 | r = request.get(url)
196 | proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>', r.text)
197 | for proxy in proxies:
198 | yield ':'.join(proxy)
199 |
200 | @staticmethod
201 | def freeProxyWallSecond():
202 | urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
203 | request = WebRequest()
204 | import base64
205 | for url in urls:
206 | r = request.get(url)
207 | proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
208 | for proxy in proxies:
209 | yield base64.b64decode(proxy).decode()
210 |
211 | @staticmethod
212 | def freeProxyWallThird():
213 | urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
214 | request = WebRequest()
215 | for url in urls:
216 | r = request.get(url)
217 | proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
218 | for proxy in proxies:
219 | yield ':'.join(proxy)
220 |
221 |
222 | if __name__ == '__main__':
223 | pass
224 |
--------------------------------------------------------------------------------
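
Note: each GetFreeProxy method above is a generator yielding "ip:port" strings. A minimal consumption sketch, assuming it runs from the repository root so that "Src" is importable; the fetcher choice here is arbitrary:

    import sys
    sys.path.append("Src")

    from ProxyGetter.getFreeProxy import GetFreeProxy
    from Util.utilFunction import verifyProxyFormat

    # pull one page from a single source and drop malformed entries
    for proxy in GetFreeProxy.freeProxyTwelve(page_count=1):
        if verifyProxyFormat(proxy):
            print(proxy)
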
/Src/Run/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Run/__init__.py
--------------------------------------------------------------------------------
/Src/Run/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from gevent import monkey
4 | monkey.patch_all()
5 |
6 | import sys
7 | sys.path.append("Src")
8 |
9 | import time
10 | import signal
11 | from threading import Thread
12 |
13 | from Log import LogManager
14 | from Web import WebManager
15 | from Forward.ForwardManager import ForwardHttp
16 | from Manager.ProxyFetch import ProxyFetch
17 |
18 | from Schedule.ProxyVerifySchedule import ProxyVerifySchedule
19 | from Schedule.ProxyFetchSchedule import ProxyFetchSchedule
20 |
21 | TASK_LIST = {
22 | "ProxyVerifySchedule": ProxyVerifySchedule,
23 | "ProxyFetchSchedule": ProxyFetchSchedule,
24 | "ForwardHttp": ForwardHttp,
25 | }
26 |
27 | def show_time():
28 | date = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
29 | content = "{newline}{symbol} ProxyPool Start, date:{date} {symbol}{newline}".format(newline="\n", symbol="-"*50, date=date)
30 | print(content)
31 |
32 | def start_fetch():
33 | ProxyFetch.initQueue()
34 | t = ProxyFetch()
35 | t.start()
36 |
37 | def start_task():
38 | start_fetch()
39 |
40 | task_list = []
41 | for name in TASK_LIST.keys():
42 | task = TASK_LIST[name]()
43 | t = Thread(target=task.run, name=name)
44 | task_list.append(t)
45 |
46 | for t in task_list:
47 | t.daemon = True
48 | t.start()
49 |
50 | def stop_handler(signum, frame):
51 | print('Received Signal [%s], Stop Program' % signum)
52 | sys.exit()
53 |
54 | def register_signal():
55 | signal.signal(signal.SIGINT, stop_handler)
56 |
57 |
58 | def main(test=False):
59 | show_time()
60 | register_signal()
61 |
62 | LogManager.init()
63 |
64 | start_task()
65 |
66 | WebManager.run()
67 |
68 | if __name__ == '__main__':
69 | main()
--------------------------------------------------------------------------------
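
Note: gevent's monkey.patch_all() must run before any module that imports socket or threading, which is why it sits at the very top of main.py. A typical launch from the repository root (assuming a reachable MongoDB configured as in the settings) would be:

    python Src/Run/main.py
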
/Src/Schedule/ProxyCleanSchedule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | import sys
5 | sys.path.append("Src")
6 | import time
7 | import threading
8 | import datetime
9 |
10 | from Schedule.ProxySchedule import ProxySchedule
11 | from Manager.ProxyClean import ProxyCleanRaw, ProxyCleanUseful
12 |
13 | from Log.LogManager import log
14 | from Config import ConfigManager
15 |
16 | class ProxyCleanSchedule(ProxySchedule):
17 | rightnow = True
18 |
19 | def __init__(self, **kwargs):
20 | super(ProxyCleanSchedule, self).__init__(**kwargs)
21 | self.task_handler_hash = {
22 | "clean_raw_proxy_interval": self.clean_raw_proxy,
23 | "clean_useful_proxy_interval": self.clean_useful_proxy,
24 | }
25 |
26 | def clean_raw_proxy(self):
27 | t = ProxyCleanRaw()
28 | t.daemon = True
29 | t.start()
30 | t.join()
31 |
32 | def clean_useful_proxy(self):
33 | t = ProxyCleanUseful()
34 | t.daemon = True
35 | t.start()
36 | t.join()
37 |
38 | if __name__ == '__main__':
39 | sch = ProxyCleanSchedule()
40 | sch.run()
--------------------------------------------------------------------------------
/Src/Schedule/ProxyFetchSchedule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | from gevent import monkey
5 | monkey.patch_all()
6 |
7 | import sys
8 | sys.path.append("Src")
9 | import time
10 | import threading
11 | import datetime
12 |
13 | from Manager.ProxyFetch import ProxyFetch
14 | from Manager import ProxyManager
15 | from Schedule.ProxySchedule import ProxySchedule
16 |
17 | from Log.LogManager import log
18 | from Config import ConfigManager
19 |
20 | class ProxyFetchSchedule(ProxySchedule):
21 | rightnow = False
22 |
23 | def __init__(self, **kwargs):
24 | super(ProxyFetchSchedule, self).__init__(**kwargs)
25 | self.task_handler_hash = {
26 | "fetch_new_proxy_interval": self.fetchNewProxy,
27 | }
28 |
29 | def checkFetchNewProxy(self):
30 |
31 | total_number = ProxyManager.proxy_manager.getUsefulProxyNumber()
32 | hold_number = ConfigManager.setting_config.setting.get("hold_useful_proxy_number")
33 | if total_number < hold_number or hold_number == -1:
34 | log.debug("fetch new proxy start, exist raw_proxy total_number:{total_number}, hold_number:{hold_number}".format(total_number=total_number, hold_number=hold_number))
35 | result = True
36 | else:
37 | log.debug("fetch new proxy skip, exist raw_proxy total_number:{total_number}, hold_number:{hold_number}".format(total_number=total_number, hold_number=hold_number))
38 | result = False
39 |
40 | return result
41 |
42 | def fetchNewProxy(self):
43 | if self.checkFetchNewProxy():
44 | ProxyFetch.initQueue()
45 | t = ProxyFetch()
46 | t.start()
47 |
48 | if __name__ == '__main__':
49 | sch = ProxyFetchSchedule()
50 | sch.run()
51 |
--------------------------------------------------------------------------------
/Src/Schedule/ProxySchedule.py:
--------------------------------------------------------------------------------
1 | from apscheduler.schedulers.blocking import BlockingScheduler
2 | from Config import ConfigManager
3 | from Notify import NotifyManager
4 | from Log.LogManager import log
5 |
6 | import logging
7 | import datetime
8 |
9 | DISPATCH_EVENT_LIST = [
10 | "fetch_new_proxy_interval",
11 | "verify_raw_proxy_interval",
12 | "verify_useful_proxy_interval",
13 | "clean_raw_proxy_interval",
14 | "clean_useful_proxy_interval",
15 | ]
16 |
17 | SCHEDULE_LOG_PATH = "logs/schedule.log"
18 |
19 | logger = logging.getLogger()
20 | file_handler = logging.FileHandler(SCHEDULE_LOG_PATH)
21 | logger.addHandler(file_handler)
22 | logger.setLevel(logging.INFO)
23 |
24 | class ProxySchedule(BlockingScheduler):
25 | def __init__(self, **kwargs):
26 | super(ProxySchedule, self).__init__(logger=logger, **kwargs)
27 | self.task_handler_hash = {}
28 |
29 | NotifyManager.register_event(NotifyManager.NOTIFY_EVENT["AFTER_SETTING_CHANGE"], self.dispatch_event)
30 |
31 | def dispatch_event(self, **kwargs):
32 | event_name = kwargs.get("event_name")
33 | event_data = kwargs.get("event_data")
34 |
35 | if event_name in DISPATCH_EVENT_LIST:
36 | self.update_job_interval(**event_data)
37 |
38 |
39 | def update_job_interval(self, **kwargs):
40 | job_name = kwargs.get("job_name")
41 |
42 | value = ConfigManager.setting_config.setting.get(job_name)
43 | trigger_args = { "minutes": value }
44 | trigger='interval'
45 | job = self._update_job(job_name, trigger, **trigger_args)
46 | log.info("update_job_interval: {job_name}, {job}".format(job_name=job_name, job=job))
47 | return job
48 |
49 | def _update_job(self, job_name, trigger, **trigger_args):
50 | return self.reschedule_job(job_name, trigger=trigger, **trigger_args)
51 |
52 | def run(self):
53 | now = datetime.datetime.now()
54 | for name, handler in self.task_handler_hash.items():
55 | value = ConfigManager.setting_config.setting.get(name)
56 | if self.rightnow:
57 | next_run_time=now
58 | else:
59 | next_run_time=None
60 |
61 | self.add_job(handler, "interval", id=name, minutes=value, next_run_time=next_run_time)
62 |
63 | self.start()
--------------------------------------------------------------------------------
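
Note: a standalone sketch of the APScheduler pattern ProxySchedule wraps: interval jobs registered under a stable id, then rescheduled in place the way _update_job does. The job name and intervals here are made-up values for illustration:

    from apscheduler.schedulers.blocking import BlockingScheduler

    def tick():
        print("job fired")

    sch = BlockingScheduler()
    # register under a stable id so the job can be found again later
    sch.add_job(tick, "interval", id="demo_interval", minutes=5)
    # change the interval without dropping the job, as update_job_interval does
    sch.reschedule_job("demo_interval", trigger="interval", minutes=1)
    sch.start()
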
/Src/Schedule/ProxyVerifySchedule.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 | from gevent import monkey
4 | monkey.patch_all()
5 |
6 | import sys
7 | sys.path.append("Src")
8 | import time
9 |
10 | from Manager.ProxyVerify import ProxyVerifyRaw, ProxyVerifyUseful
11 | from Schedule.ProxySchedule import ProxySchedule
12 |
13 | from Log.LogManager import log
14 | from Config import ConfigManager
15 |
16 | class ProxyVerifySchedule(ProxySchedule):
17 | rightnow = True
18 |
19 | def __init__(self, **kwargs):
20 | super(ProxyVerifySchedule, self).__init__(**kwargs)
21 | self.task_handler_hash = {
22 | "verify_useful_proxy_interval": self.verifyUsefulProxy,
23 | }
24 |
25 | def verifyUsefulProxy(self):
26 | ProxyVerifyUseful.initQueue()
27 | t = ProxyVerifyUseful()
28 | t.start()
29 |
30 | if __name__ == '__main__':
31 | sch = ProxyVerifySchedule()
32 | sch.run()
33 |
--------------------------------------------------------------------------------
/Src/Schedule/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/Src/Util/EnvUtil.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 |
5 | PY3 = sys.version_info >= (3,)
--------------------------------------------------------------------------------
/Src/Util/GetConfig.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | import os
5 | from Util.utilClass import ConfigParse
6 | from Util.utilClass import LazyProperty
7 |
8 |
9 | class GetConfig(object):
10 | """
11 | to get config from config.ini
12 | """
13 |
14 | def __init__(self):
15 | self.pwd = os.path.split(os.path.realpath(__file__))[0]
16 | config_dir = os.path.split(self.pwd)[0]
17 | self.config_path = os.path.join(config_dir, 'Config.ini')
18 | if not os.path.isfile(self.config_path):
19 | self.config_path = os.path.join(config_dir, 'Config.ini.default')
20 |
21 | self.config_file = ConfigParse()
22 | self.config_file.read(self.config_path)
23 |
24 | @LazyProperty
25 | def db_type(self):
26 | return self.config_file.get('DB', 'type')
27 |
28 | @LazyProperty
29 | def db_name(self):
30 | return self.config_file.get('DB', 'name')
31 |
32 | @LazyProperty
33 | def db_host(self):
34 | return self.config_file.get('DB', 'host')
35 |
36 | @LazyProperty
37 | def db_port(self):
38 | return int(self.config_file.get('DB', 'port'))
39 |
40 | @LazyProperty
41 | def db_password(self):
42 | try:
43 | password = self.config_file.get('DB', 'password')
44 | except Exception:
45 | password = None
46 | return password
47 |
48 | @LazyProperty
49 | def db_username(self):
50 | try:
51 | username = self.config_file.get('DB', 'username')
52 | except Exception:
53 | username = None
54 | return username
55 |
56 | @LazyProperty
57 | def log_level(self):
58 | try:
59 | log_level = self.config_file.get('LOG', 'level')
60 | except Exception:
61 | log_level = None
62 | return log_level
63 |
64 | @LazyProperty
65 | def proxy_getter_functions(self):
66 | return self.config_file.options('ProxyGetter')
67 |
68 | @LazyProperty
69 | def host_ip(self):
70 | return self.config_file.get('API','ip')
71 |
72 | @LazyProperty
73 | def host_port(self):
74 | return int(self.config_file.get('API', 'port'))
75 |
76 | @LazyProperty
77 | def processes(self):
78 | return int(self.config_file.get('API', 'processes'))
79 |
80 | config = GetConfig()
81 |
82 | if __name__ == '__main__':
83 | gg = GetConfig()
84 | print(gg.db_type)
85 | print(gg.db_name)
86 | print(gg.db_host)
87 | print(gg.db_port)
88 | print(gg.proxy_getter_functions)
89 | print(gg.host_ip)
90 | print(gg.host_port)
91 | print(gg.processes)
92 |
--------------------------------------------------------------------------------
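
Note: a sketch of the Config.ini sections implied by the getters above (DB, API, LOG, ProxyGetter); every value below is a placeholder for illustration, not a shipped default:

    [DB]
    type = MONGODB
    name = proxy
    host = 127.0.0.1
    port = 27017

    [API]
    ip = 0.0.0.0
    port = 5010
    processes = 1

    [LOG]
    level = DEBUG

    [ProxyGetter]
    freeProxyFirst = 1
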
/Src/Util/WebRequest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from requests.models import Response
4 | import requests
5 | import random
6 | import time
7 |
8 |
9 | class WebRequest(object):
10 | def __init__(self, *args, **kwargs):
11 | pass
12 |
13 | @property
14 | def user_agent(self):
15 | """
16 | return a User-Agent at random
17 | :return:
18 | """
19 | ua_list = [
20 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
21 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
22 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
23 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
24 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
25 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
26 | 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
27 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
28 | ]
29 | return random.choice(ua_list)
30 |
31 | @property
32 | def header(self):
33 | """
34 | basic header
35 | :return:
36 | """
37 | return {'User-Agent': self.user_agent,
38 | 'Accept': '*/*',
39 | 'Connection': 'keep-alive',
40 | 'Accept-Language': 'zh-CN,zh;q=0.8'}
41 |
42 | def get(self, url, header=None, retry_time=1, timeout=10,
43 | *args, **kwargs):
44 |
45 | headers = self.header
46 | if header and isinstance(header, dict):
47 | headers.update(header)
48 |
49 | # retry_time used to be accepted but ignored; honor it now
50 | for _ in range(max(retry_time, 1)):
51 | try:
52 | return requests.get(url, headers=headers, timeout=timeout, **kwargs)
53 | except Exception:
54 | # print("request url error", url)
55 | continue
56 |
57 | # every attempt failed: return a synthetic 504 response instead of raising
58 | resp = Response()
59 | resp.status_code = 504
60 | return resp
--------------------------------------------------------------------------------
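
Note: a short usage sketch of WebRequest; get() never raises on network errors, it returns a synthesized Response with status_code 504 instead, so callers only check the status (the URL is a placeholder):

    from Util.WebRequest import WebRequest

    resp = WebRequest().get("http://example.com/", retry_time=3, timeout=5)
    if resp.status_code == 200:
        print(len(resp.content))
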
/Src/Util/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__.py.py
5 | Description :
6 | Author : JHao
7 | date: 2016/11/25
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/11/25:
11 | -------------------------------------------------
12 | """
--------------------------------------------------------------------------------
/Src/Util/utilClass.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | class LazyProperty(object):
5 | """
6 | LazyProperty
7 | explain: http://www.spiderpy.cn/blog/5/
8 | """
9 |
10 | def __init__(self, func):
11 | self.func = func
12 |
13 | def __get__(self, instance, owner):
14 | if instance is None:
15 | return self
16 | else:
17 | value = self.func(instance)
18 | setattr(instance, self.func.__name__, value)
19 | return value
20 |
21 |
22 | try:
23 | from configparser import ConfigParser # py3
24 | except:
25 | from ConfigParser import ConfigParser # py2
26 |
27 |
28 | class ConfigParse(ConfigParser):
29 | """
30 | rewrite ConfigParser to keep option names case-sensitive (supports upper-case options)
31 | """
32 |
33 | def __init__(self):
34 | ConfigParser.__init__(self)
35 |
36 | def optionxform(self, optionstr):
37 | return optionstr
38 |
39 |
40 | class Singleton(type):
41 | """
42 | Singleton Metaclass
43 | """
44 |
45 | _inst = {}
46 |
47 | def __call__(cls, *args, **kwargs):
48 | if cls not in cls._inst:
49 | cls._inst[cls] = super(Singleton, cls).__call__(*args, **kwargs)
50 | return cls._inst[cls]
51 |
--------------------------------------------------------------------------------
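
Note: a minimal sketch of how the two helpers above behave; Demo and Conn are hypothetical classes, not part of the project:

    from Util.utilClass import LazyProperty, Singleton

    class Demo(object):
        @LazyProperty
        def value(self):
            print("computed once")
            return 42

    d = Demo()
    print(d.value)  # prints "computed once", then 42
    print(d.value)  # prints 42 only: the result is now a plain instance attribute

    class Conn(object, metaclass=Singleton):  # py3 syntax; py2 would set __metaclass__
        pass

    assert Conn() is Conn()  # every call returns the same instance
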
/Src/Util/utilFunction.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 | """
4 | -------------------------------------------------
5 | File Name: utilFunction.py
6 | Description : tool function
7 | Author : JHao
8 | date: 2016/11/25
9 | -------------------------------------------------
10 | Change Activity:
11 | 2016/11/25: added robustCrawl, verifyProxy, getHtmlTree
12 | -------------------------------------------------
13 | """
14 | import requests
15 | import time
16 | import re
17 | from lxml import etree
18 |
19 | from Util.WebRequest import WebRequest
20 |
21 | # noinspection PyPep8Naming
22 | def robustCrawl(func):
23 | def decorate(*args, **kwargs):
24 | try:
25 | return func(*args, **kwargs)
26 | except Exception as e:
27 | pass
28 | # logger.info(u"sorry, crawl failed. reason:")
29 | # logger.info(e)
30 |
31 | return decorate
32 |
33 |
34 | # noinspection PyPep8Naming
35 | def verifyProxyFormat(proxy):
36 | """
37 | check proxy format (ip:port)
38 | :param proxy:
39 | :return:
40 | """
41 | import re
42 | verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}"
43 | _proxy = re.findall(verify_regex, proxy)
44 | return True if len(_proxy) == 1 and _proxy[0] == proxy else False
45 |
46 |
47 | # noinspection PyPep8Naming
48 | def getHtmlTree(url, **kwargs):
49 | """
50 | fetch the url and return an lxml html tree
51 | :param url:
52 | :param kwargs:
53 | :return:
54 | """
55 |
56 | header = {
57 | 'Connection': 'keep-alive',
58 | 'Cache-Control': 'max-age=0',
59 | 'Upgrade-Insecure-Requests': '1',
60 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
61 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
62 | 'Accept-Encoding': 'gzip, deflate, sdch',
63 | 'Accept-Language': 'zh-CN,zh;q=0.8',
64 | }
65 | # TODO: fetch a proxy from the pool and make this request through it
66 | wr = WebRequest()
67 |
68 | # delay 2s per request
69 | # time.sleep(2)
70 |
71 | html = wr.get(url=url, header=header).content
72 | try:
73 | result = etree.HTML(html)
74 | except Exception as e:
75 | # print("getHtmlTree error: ", url, e)
76 | result = etree.HTML("")
77 |
78 | return result
79 |
80 |
81 | def tcpConnect(proxy):
82 | """
83 | TCP three-way handshake (port connectivity check)
84 | :param proxy:
85 | :return:
86 | """
87 | from socket import socket, AF_INET, SOCK_STREAM
88 | s = socket(AF_INET, SOCK_STREAM)
89 | ip, port = proxy.split(':')
90 | result = s.connect_ex((ip, int(port)))
91 | return True if result == 0 else False
92 |
93 |
94 | # TODO: this logic is probably flawed, but not certain
95 | # https is only checked when http works; could there be proxies that only serve https?
96 | def validUsefulProxy(proxy):
97 | """
98 | check whether the proxy is usable
99 | :param proxy:
100 | :return:
101 | """
102 | if isinstance(proxy, bytes):
103 | proxy = proxy.decode('utf8')
104 | proxies = {
105 | "http": proxy,
106 | "https": proxy,
107 | }
108 | http_url = "http://httpbin.org/ip"
109 | https_url = "https://httpbin.org/ip"
110 |
111 | http_result = False
112 | https_result = False
113 |
114 | # http valid
115 | try:
116 | r = requests.get(http_url, proxies=proxies, timeout=10, verify=False)
117 |
118 | content = r.content
119 | if isinstance(content, bytes):
120 | content = content.decode('utf8')
121 |
122 | status_result = r.status_code == 200
123 | content_result = re.search("\"origin\"", content) != None
124 | if status_result and content_result:
125 | http_result = True
126 |
127 | except Exception as e:
128 | # print(str(e))
129 | http_result = False
130 |
131 | if http_result:
132 |
133 | # https valid
134 | try:
135 | r = requests.get(https_url, proxies=proxies, timeout=10, verify=False)
136 |
137 | content = r.content
138 | if isinstance(content, bytes):
139 | content = content.decode('utf8')
140 |
141 | status_right = r.status_code == 200
142 | content_right = re.search("\"origin\"", content) != None
143 | if status_right and content_right:
144 | https_result = True
145 |
146 | except Exception as e:
147 | # print(str(e))
148 | https_result = False
149 |
150 | return (http_result, https_result)
--------------------------------------------------------------------------------
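
Note: a sketch of how the helpers above compose, assuming "Src" is on sys.path as in the tests; the address is a placeholder, and validUsefulProxy performs real requests against httpbin.org:

    from Util.utilFunction import verifyProxyFormat, validUsefulProxy

    proxy = "127.0.0.1:8080"  # placeholder
    if verifyProxyFormat(proxy):
        http_ok, https_ok = validUsefulProxy(proxy)
        print("http:", http_ok, "https:", https_ok)
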
/Src/Version/VersionManger.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("Src")
3 |
4 | from pymongo import MongoClient
5 | from Config import ConfigManager
6 |
7 | import importlib
8 |
9 | mc = MongoClient(ConfigManager.base_config.setting.get("db_host"), ConfigManager.base_config.setting.get("db_port"))
10 |
11 | VERSION_FILE_PATH = "version"
12 | version_list = []
13 |
14 | def init():
15 | with open(VERSION_FILE_PATH) as f:
16 | items = f.readlines()
17 | for item in items:
18 | version_list.append(item.strip())
19 |
20 | def get_last_version():
21 | result = version_list[0]
22 | return result
23 |
24 | def update_version(cur_version):
25 | index = version_list.index(cur_version)
26 | while index:
27 | index = index - 1
28 | next_version = version_list[index]
29 | version_name = next_version.replace(".", "_")
30 | last_version = get_last_version()
31 |
32 | module_name = "version.version_{version_name}".format(version_name=version_name)
33 |
34 | try:
35 | module = importlib.import_module(module_name)
36 | result = module.run(mc, last_version, next_version, cur_version)
37 | except Exception:
38 | result = False
39 |
40 | query = {"setting_name": "version"}
41 | data = {
42 | "$set": {
43 | "setting_value": next_version
44 | }
45 | }
46 | mc.proxy.setting.update(query, data)
47 |
48 | cur_version = next_version
49 |
50 | def run():
51 | item = mc.proxy.setting.find_one({"setting_name": "version"})
52 | if item:
53 | cur_version = item["setting_value"]
54 | update_version(cur_version)
55 | else:
56 | last_version = get_last_version()
57 | data = {
58 | "setting_name": "version",
59 | "setting_value": last_version,
60 | "setting_value": True,
61 | }
62 | mc.proxy.setting.insert(data)
63 |
64 |
65 | mc.close()
66 |
67 | if __name__ == '__main__':
68 | init()
69 | run()
--------------------------------------------------------------------------------
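
Note: the plain-text "version" file (bottom of this dump) lists versions newest-first, and each version x.y.z maps to a module Src/Version/version/version_x_y_z.py exposing run(mc, last_version, next_version, cur_version). A hypothetical migration module, for illustration only:

    # hypothetical Src/Version/version/version_1_0_1.py
    def run(mc, last_version, next_version, cur_version):
        # e.g. backfill a field on every stored proxy document
        mc.proxy.useful_proxy.update_many({}, {"$set": {"quality": 0}})
        return True
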
/Src/Version/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Version/__init__.py
--------------------------------------------------------------------------------
/Src/Version/version/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Version/version/__init__.py
--------------------------------------------------------------------------------
/Src/Version/version/version_1_0_0.py:
--------------------------------------------------------------------------------
1 |
2 | # just example
3 | def run(mc, last_version, update_version, cur_version):
4 | print("nothing to do", last_version, update_version, cur_version, __file__)
5 |
6 | if __name__ == '__main__':
7 | pass
--------------------------------------------------------------------------------
/Src/Web/WebManager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 |
4 | # base import
5 | from gevent import monkey
6 | monkey.patch_all()
7 |
8 | import time
9 | import math
10 | import os
11 | import sys
12 | sys.path.append("Src/")
13 |
14 | import logging
15 | from flask import Flask
16 | from gevent.pywsgi import WSGIServer
17 | from Config import ConfigManager
18 |
19 | ACCESS_LOG_PATH = "logs/app_access.log"
20 |
21 | app = Flask(__name__)
22 | app.config['JSON_AS_ASCII'] = False
23 | app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True
24 |
25 |
26 | logger = logging.getLogger()
27 | def init_log():
28 | file_handler = logging.FileHandler(ACCESS_LOG_PATH)
29 | logger.addHandler(file_handler)
30 | logger.setLevel(logging.INFO)
31 |
32 | return logger
33 |
34 | def init_config():
35 | app.config.from_pyfile('config.py')
36 | app.config["MONGODB_SETTINGS"] = {
37 | 'db': ConfigManager.base_config.setting.get("db_name"),
38 | 'host': ConfigManager.base_config.setting.get("db_host"),
39 | 'port': ConfigManager.base_config.setting.get("db_port"),
40 | 'username': ConfigManager.base_config.setting.get("db_user"),
41 | 'password': ConfigManager.base_config.setting.get("db_pass"),
42 | }
43 |
44 | def init_app():
45 | init_config()
46 | init_log()
47 |
48 | def start_app():
49 | from Web.admin import admin
50 | from Web.api import api
51 |
52 | admin.init_app(app)
53 | api.init_app(app)
54 |
55 | http_server = WSGIServer((ConfigManager.base_config.setting.get("web_bind_host"), ConfigManager.base_config.setting.get("web_bind_port")), app, log=logger, error_log=logger)
56 | http_server.serve_forever()
57 |
58 | def run():
59 | init_app()
60 | start_app()
61 |
62 |
63 | if __name__ == '__main__':
64 | run()
--------------------------------------------------------------------------------
/Src/Web/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ozhiwei/SmartProxyPool/f38a21857c9f5e3511bc65792a493cf45f6c491e/Src/Web/__init__.py
--------------------------------------------------------------------------------
/Src/Web/admin/__init__.py:
--------------------------------------------------------------------------------
1 | from . import admin
--------------------------------------------------------------------------------
/Src/Web/admin/admin.py:
--------------------------------------------------------------------------------
1 | from .views import ProxyView, SettingView, FetcherView, ProxyPoolAdminIndexView
2 | from .model import ProxyModel, SettingModel, FetcherModel
3 |
4 | from flask_mongoengine import MongoEngine
5 |
6 | from flask_security import Security, MongoEngineUserDatastore, \
7 | UserMixin, RoleMixin, login_required, current_user, forms
8 |
9 | from flask_mongoengine import MongoEngine
10 |
11 | from .model import User, Role
12 | from .forms import LoginForm
13 |
14 | from flask_security.utils import hash_password
15 | from flask import url_for
16 |
17 | from flask_admin import AdminIndexView, helpers, expose
18 |
19 | import flask_admin
20 |
21 | def init_base_data(*args):
22 | create_roles(*args)
23 | create_admin_user(*args)
24 |
25 | def create_roles(user_datastore, app):
26 | with app.app_context():
27 | if not user_datastore.find_role(role='user'):
28 | user_datastore.create_role(name='user')
29 | if not user_datastore.find_role(role='superuser'):
30 | user_datastore.create_role(name='superuser')
31 |
32 | def create_admin_user(user_datastore, app):
33 | with app.app_context():
34 | if user_datastore.get_user("admin"):
35 | pass
36 | else:
37 | user_role = user_datastore.find_role(role='user')
38 | super_user_role = user_datastore.find_role(role='superuser')
39 | user_datastore.create_user(name='admin', email='admin', password=hash_password('admin'), roles=[user_role, super_user_role])
40 |
41 | def init_security(user_datastore, app, admin):
42 | security = Security(app, user_datastore, login_form=LoginForm)
43 |
44 | @security.context_processor
45 | def security_context_processor():
46 | return dict(
47 | admin_base_template=admin.base_template,
48 | admin_view=admin.index_view,
49 | h=helpers,
50 | get_url=url_for
51 | )
52 |
53 | def init_app(app):
54 | admin = flask_admin.Admin(app=app, name='ProxyPool Admin', base_template="admin/master_base.html", index_view=ProxyPoolAdminIndexView(), template_mode='bootstrap3')
55 | admin.add_view(ProxyView(ProxyModel))
56 | admin.add_view(SettingView(SettingModel))
57 | # admin.add_view(ProxyPoolView(ProxyPoolModel))
58 | admin.add_view(FetcherView(FetcherModel))
59 |
60 | db = MongoEngine()
61 | db.init_app(app)
62 |
63 | user_datastore = MongoEngineUserDatastore(db, User, Role)
64 | init_security(user_datastore, app, admin)
65 |
66 | init_base_data(user_datastore, app)
67 |
68 |
--------------------------------------------------------------------------------
/Src/Web/admin/forms.py:
--------------------------------------------------------------------------------
1 | from flask_security import Security, MongoEngineUserDatastore, \
2 | UserMixin, RoleMixin, login_required, current_user, forms
3 |
4 | from wtforms import fields, validators
5 |
6 | class LoginForm(forms.LoginForm):
7 | name = fields.StringField()
8 | email = fields.StringField(label="Name or Email", validators=[validators.required()])
9 |
--------------------------------------------------------------------------------
/Src/Web/admin/model.py:
--------------------------------------------------------------------------------
1 | import mongoengine
2 | from flask_mongoengine import Document
3 | from flask_security import UserMixin, RoleMixin
4 |
5 | class ProxyModel(Document):
6 | meta = {'collection': 'useful_proxy'}
7 |
8 | proxy = mongoengine.StringField(required=True, max_length=40)
9 | last_status = mongoengine.IntField(default=0)
10 | last_succ_time = mongoengine.IntField(default=0)
11 | next_verify_time = mongoengine.IntField(default=0)
12 | succ = mongoengine.IntField(default=0)
13 | fail = mongoengine.IntField(default=0)
14 | total = mongoengine.IntField(default=0)
15 | keep_succ = mongoengine.IntField(default=0)
16 | quality = mongoengine.IntField(default=0)
17 | type = mongoengine.IntField(default=0)
18 | https = mongoengine.BooleanField(default=False)
19 | region_list = mongoengine.ListField(mongoengine.StringField(max_length=20))
20 |
21 | def __unicode__(self):
22 | return self.proxy  # ProxyModel has no "name" field
23 |
24 | class SettingModel(Document):
25 | meta = {'collection': 'setting'}
26 |
27 | setting_name = mongoengine.StringField(required=True, unique=True, max_length=40)
28 | setting_value = mongoengine.StringField(required=True, max_length=40)
29 | setting_state = mongoengine.BooleanField(default=True)
30 |
31 | class FetcherModel(Document):
32 | meta = {'collection': 'fetchers'}
33 |
34 | name = mongoengine.StringField(required=True, unique=True, max_length=40)
35 | host = mongoengine.StringField(required=True, unique=True, max_length=40)
36 | total = mongoengine.IntField(default=0)
37 | succ = mongoengine.IntField(default=0)
38 | fail = mongoengine.IntField(default=0)
39 | skip = mongoengine.IntField(default=0)
40 | interval = mongoengine.IntField(default=0)
41 | next_fetch_time = mongoengine.IntField(default=0)
42 | # fetch_time = mongoengine.DateTimeField()
43 | status = mongoengine.BooleanField(default=True)
44 | # fetch_desc = mongoengine.StringField(max_length=40)
45 |
46 |
47 | class ProxyPoolModel(Document):
48 | meta = {'collection': 'proxy_pool'}
49 |
50 | token = mongoengine.StringField(required=True, max_length=40)
51 | filter_name = mongoengine.StringField(required=True, max_length=40)
52 | verifier_name = mongoengine.StringField(required=True, max_length=40)
53 |
54 |
55 | class Role(Document, RoleMixin):
56 | name = mongoengine.StringField(max_length=80, unique=True)
57 | description = mongoengine.StringField(max_length=255)
58 |
59 | class User(Document, UserMixin):
60 | email = mongoengine.StringField(max_length=255)
61 | name = mongoengine.StringField(max_length=255)
62 | password = mongoengine.StringField(max_length=255)
63 | active = mongoengine.BooleanField(default=True)
64 | confirmed_at = mongoengine.DateTimeField()
65 | roles = mongoengine.ListField(mongoengine.ReferenceField(Role), default=[])
--------------------------------------------------------------------------------
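
Note: a sketch of querying these documents directly with mongoengine, assuming a connection has been initialized (WebManager does this via MongoEngine.init_app); the filter and limit are arbitrary:

    from Web.admin.model import ProxyModel

    # https-capable proxies, best quality first, mirroring the admin list view
    for doc in ProxyModel.objects(https=True).order_by("-quality")[:10]:
        print(doc.proxy, doc.quality)
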
/Src/Web/admin/views.py:
--------------------------------------------------------------------------------
1 | import math
2 | import time
3 | from flask import request
4 | from flask_security import current_user
5 |
6 | import flask_admin
7 | from flask import Flask, jsonify, url_for, redirect, render_template, request
8 | from flask_admin.contrib.mongoengine import ModelView
9 | from flask_admin import expose
10 |
11 | from Notify.NotifyManager import dispatch_event, NOTIFY_EVENT
12 |
13 | # project import
14 | from Config import ConfigManager
15 | from Manager.ProxyManager import proxy_manager
16 |
17 | CUSTOM_COLUMN_FORMAT = {
18 | "type" : [
19 | "未知",
20 | "透明",
21 | "匿名",
22 | ],
23 | "https" : [
24 | "未知",
25 | "开启",
26 | "关闭",
27 | ],
28 | "last_status": [
29 | "未知",
30 | "成功",
31 | "失败"
32 | ]
33 | }
34 |
35 | def ElapseTimeFormat(all_time):
36 | day = 24*60*60
37 | hour = 60*60
38 | min = 60
39 | if all_time <60:
40 | return "%d sec"%math.ceil(all_time)
41 | elif all_time > day:
42 | days = divmod(all_time,day)
43 | return "%d days"%(int(days[0]))
44 | elif all_time > hour:
45 | hours = divmod(all_time,hour)
46 | return '%d hours'%(int(hours[0]))
47 | else:
48 | mins = divmod(all_time,min)
49 | return "%d mins"%(int(mins[0]))
50 |
51 | def LastSuccTimeFormat(last_time):
52 | if last_time:
53 | result = ElapseTimeFormat(int(time.time() - last_time))
54 | else:
55 | result = 0
56 |
57 | return result
58 |
59 | def TimeStampFormat(timeStamp):
60 | time_object = time.localtime(timeStamp)
61 | result = time.strftime("%m-%d %H:%M", time_object)
62 | return result
63 |
64 | def PercentFormat(cur, total):
65 | if total == 0:
66 | percent = 0
67 | else:
68 | percent = float(cur / total * 100)
69 | result = "%d(%.2f%%)" % (cur, percent)
70 |
71 | return result
72 |
73 | class ProxyView(ModelView):
74 | name = "ProxyPool"
75 |
76 | column_list = ("proxy", "succ", "total", "keep_succ", "quality", "type", "https",
77 | "last_status", "last_succ_time", "next_verify_time", "region_list")
78 | can_create = False
79 | column_default_sort = ("quality", True)
80 | column_formatters = dict(
81 | type=lambda v, c, m, p: CUSTOM_COLUMN_FORMAT[p][m.type],
82 | https=lambda v, c, m, p: CUSTOM_COLUMN_FORMAT[p][m.https],
83 | last_status=lambda v, c, m, p: CUSTOM_COLUMN_FORMAT[p][m.last_status],
84 | last_succ_time=lambda v, c, m, p: LastSuccTimeFormat(m.last_succ_time),
85 | next_verify_time=lambda v, c, m, p: TimeStampFormat(m.next_verify_time),
86 | succ=lambda v, c, m, p: PercentFormat(m.succ, m.total),
87 | )
88 |
89 | def is_accessible(self):
90 | if not current_user.is_active or not current_user.is_authenticated:
91 | return False
92 |
93 | if current_user.has_role('superuser'):
94 | return True
95 |
96 | return False
97 |
98 | def _handle_view(self, name, **kwargs):
99 | if current_user.is_authenticated:
100 | pass
101 | else:
102 | return redirect(url_for('security.login', next=request.url))
103 |
104 | class SettingView(ModelView):
105 | name="Setting"
106 |
107 | can_create = False
108 | can_delete = False
109 | can_view_details = True
110 | column_searchable_list = ['setting_name']
111 | column_editable_list = [ "setting_value", "setting_state"]
112 |
113 | def is_accessible(self):
114 | result = False
115 | if not current_user.is_active or not current_user.is_authenticated:
116 | result = False
117 |
118 | if current_user.has_role('superuser'):
119 | result = True
120 |
121 | return result
122 |
123 | def _handle_view(self, name, **kwargs):
124 | if current_user.is_authenticated:
125 | pass
126 | else:
127 | return redirect(url_for('security.login', next=request.url))
128 |
129 | def after_model_change(self, form, model, is_created):
130 | kwargs = dict(
131 | event_name=model.setting_name,
132 | event_data=dict(
133 | job_name=model.setting_name  # ProxySchedule.update_job_interval reads "job_name"
134 | )
135 | )
136 | dispatch_event(NOTIFY_EVENT["AFTER_SETTING_CHANGE"], **kwargs)
137 |
138 | class FetcherView(ModelView):
139 | name="Fetchers"
140 |
141 | column_list = ("name", "host", "succ", "fail", "skip", "total", "status", "interval", "next_fetch_time")
142 | can_create = False
143 | can_delete = False
144 | can_view_details = True
145 | column_default_sort = ("succ", True)
146 | column_searchable_list = ['name']
147 | column_editable_list = [ "status", "interval"]
148 | column_formatters = dict(
149 | succ=lambda v, c, m, p: PercentFormat(m.succ, m.total),
150 | fail=lambda v, c, m, p: PercentFormat(m.fail, m.total),
151 | skip=lambda v, c, m, p: PercentFormat(m.skip, m.total),
152 | next_fetch_time=lambda v, c, m, p: TimeStampFormat(m.next_fetch_time),
153 | )
154 |
155 | def is_accessible(self):
156 | result = False
157 | if not current_user.is_active or not current_user.is_authenticated:
158 | result = False
159 |
160 | if current_user.has_role('superuser'):
161 | result = True
162 |
163 | return result
164 |
165 | def _handle_view(self, name, **kwargs):
166 | if current_user.is_authenticated:
167 | pass
168 | else:
169 | return redirect(url_for('security.login', next=request.url))
170 |
171 | class ProxyPoolAdminIndexView(flask_admin.AdminIndexView):
172 |
173 | @expose()
174 | def index(self):
175 | if not current_user.is_authenticated:
176 | return redirect(url_for('security.login'))
177 | return super(ProxyPoolAdminIndexView, self).index()
178 |
--------------------------------------------------------------------------------
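
Note: a quick worked example of the formatter helpers above (Python 3 division; the inputs are made up):

    >>> PercentFormat(3, 12)
    '3(25.00%)'
    >>> ElapseTimeFormat(90)
    '1 mins'
    >>> LastSuccTimeFormat(0)
    0
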
/Src/Web/api/__init__.py:
--------------------------------------------------------------------------------
1 | from . import api
--------------------------------------------------------------------------------
/Src/Web/api/api.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, jsonify, url_for, redirect, render_template, request
2 | from flask_restful import reqparse, abort, Api, Resource
3 |
4 | from Manager.ProxyManager import proxy_manager
5 | from Log.LogManager import log
6 |
7 | API_LIST = {
8 | "/api/v1/proxy/": {
9 | "args": {
10 | "token": {
11 | "value": "random string + random number",
12 | "desc": "Avoid Get Repetition Proxy",
13 | "required": False,
14 | },
15 | "https": {
16 | "value": [1],
17 | "desc": "need https proxy? 1 == true",
18 | "required": False,
19 | },
20 | "region": {
21 | "value": "region name like 中国 or 广州 or 江苏",
22 | "desc": "Get Region Proxy",
23 | "required": False,
24 | },
25 | "type": {
26 | "value": [1,2],
27 | "desc": "clear proxy 1 or (common) anonymous 2",
28 | "required": False,
29 | }
30 | },
31 | "desc": "Get A Random Proxy"
32 | },
33 | "/api/v1/proxies/": {
34 | "args": {
35 | "https": {
36 | "value": [1],
37 | "desc": "need https proxy? 1 == true",
38 | "required": False,
39 | },
40 | "region": {
41 | "value": "region name like 中国 or 广州 or 江苏",
42 | "desc": "Get Region Proxy",
43 | "required": False,
44 | },
45 | "type": {
46 | "value": [1,2],
47 | "desc": "clear proxy 1 or (common) anonymous 2",
48 | "required": False,
49 | }
50 | },
51 | "desc": "Get All Proxy",
52 | }
53 | }
54 |
55 | class ApiList(Resource):
56 | def get(self):
57 | result = jsonify(API_LIST)
58 |
59 | return result
60 |
61 | class Proxy(Resource):
62 | def __init__(self, **kwargs):
63 | super(Proxy, self).__init__(**kwargs)
64 |
65 | parser = reqparse.RequestParser()
66 | parser.add_argument('https', type=int, choices=[1], location='args')
67 | parser.add_argument('type', type=int, choices=[1,2], location='args')
68 | parser.add_argument('region', type=str, location='args')
69 | parser.add_argument('token', type=str, location='args')
70 | self.args = parser.parse_args()
71 |
72 | def get(self):
73 | result = {
74 | "data": {}
75 | }
76 |
77 | options = {
78 | "https": self.args.get('https'),
79 | "type": self.args.get('type'),
80 | "region": self.args.get('region'),
81 | }
82 | log.debug("receive params: {}".format(options))
83 |
84 | item = proxy_manager.getSampleUsefulProxy(**options)
85 | if item:
86 | del item["_id"]
87 |
88 | result["data"] = item
89 |
90 | return result
91 |
92 |
93 | class Proxies(Resource):
94 | def __init__(self, **kwargs):
95 | super(Proxies, self).__init__(**kwargs)
96 |
97 | parser = reqparse.RequestParser()
98 | parser.add_argument('https', type=int, choices=[1], location='args')
99 | parser.add_argument('type', type=int, choices=[1,2], location='args')
100 | parser.add_argument('region', type=str, location='args')
101 | self.args = parser.parse_args()
102 |
103 | def get(self):
104 | result = {
105 | "data": []
106 | }
107 |
108 | options = {
109 | "https": bool(self.args.get('https')),
110 | "type": self.args.get('type'),
111 | "region": self.args.get('region'),
112 | }
113 |
114 | data = proxy_manager.getAllUsefulProxy(**options)
115 |
116 | for item in data:
117 | del item["_id"]
118 |
119 | result["data"] = data
120 |
121 | return result
122 |
123 | def init_api(app):
124 | @app.errorhandler(404)
125 | def miss(e):
126 | data = [
127 | {"result": "not found"},
128 | {"status_code": 404},
129 | {"github": "https://github.com/1again/ProxyPool"},
130 | {"api_list": API_LIST},
131 | ]
132 | result = jsonify(data)
133 | return result, 404
134 |
135 |
136 | def init_app(app):
137 | app.config.update(RESTFUL_JSON=dict(ensure_ascii=False))
138 | init_api(app)
139 |
140 | api = Api(app)
141 | api.add_resource(Proxies, '/api/v1/proxies/')
142 | api.add_resource(Proxy, '/api/v1/proxy/')
143 | api.add_resource(ApiList, '/api/v1/')
144 |
--------------------------------------------------------------------------------
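
Note: a client-side sketch of the endpoints registered above; host and port are placeholders (the real bind address comes from the web_bind_host/web_bind_port settings):

    import requests

    base = "http://127.0.0.1:5000"

    # one random useful proxy, https-capable only
    item = requests.get(base + "/api/v1/proxy/", params={"https": 1}).json()
    print(item["data"])

    # all useful proxies in one region
    items = requests.get(base + "/api/v1/proxies/", params={"region": "中国"}).json()
    print(len(items["data"]))
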
/Src/Web/config.py:
--------------------------------------------------------------------------------
1 | # Create a dummy secret key so we can use sessions
2 | SECRET_KEY = '1234567890'
3 |
4 | # Flask-Security config
5 | SECURITY_URL_PREFIX = "/admin"
6 | SECURITY_PASSWORD_HASH = "pbkdf2_sha256"
7 | SECURITY_PASSWORD_SALT = "ATGUOHAELKiubahiughaerGOJAEGj"
8 |
9 | SECURITY_USER_IDENTITY_ATTRIBUTES = ["name"]
10 |
11 | # Flask-Security URLs, overridden because they don't put a / at the end
12 | SECURITY_LOGIN_URL = "/login/"
13 | SECURITY_LOGOUT_URL = "/logout/"
14 | SECURITY_REGISTER_URL = "/register/"
15 |
16 | SECURITY_POST_LOGIN_VIEW = "/admin/"
17 | SECURITY_POST_LOGOUT_VIEW = "/admin/"
18 | SECURITY_POST_REGISTER_VIEW = "/admin/"
19 |
20 | # Flask-Security features
21 | SECURITY_REGISTERABLE = True
22 | SECURITY_SEND_REGISTER_EMAIL = False
23 | SQLALCHEMY_TRACK_MODIFICATIONS = False
--------------------------------------------------------------------------------
/Src/Web/templates/admin/index.html:
--------------------------------------------------------------------------------
1 | {% extends 'admin/master.html' %}
2 | {% block body %}
3 | {{ super() }}
4 |
5 |
6 |
7 |
ProxyPool-Admin
8 |
9 | Nothing to tell you, Have a good day!
10 |
11 |
12 |
13 |
14 | {% endblock body %}
15 |
--------------------------------------------------------------------------------
/Src/Web/templates/admin/master_base.html:
--------------------------------------------------------------------------------
1 | {% extends 'admin/base.html' %}
2 |
3 | {% block access_control %}
4 | {% if current_user.is_authenticated %}
5 |
17 | {% endif %}
18 | {% endblock %}
19 |
--------------------------------------------------------------------------------
/Src/Web/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/Src/Web/templates/security/_macros.html:
--------------------------------------------------------------------------------
1 | {% macro render_field_with_errors(field) %}
2 |
3 |
13 | {% endmacro %}
14 |
15 | {% macro render_field(field) %}
16 | {{ field(class_='form-control', **kwargs)|safe }}
17 | {% endmacro %}
18 |
19 | {% macro render_checkbox_field(field) -%}
20 |
27 | {%- endmacro %}
--------------------------------------------------------------------------------
/Src/Web/templates/security/_menu.html:
--------------------------------------------------------------------------------
1 | {% if security.registerable or security.recoverable or security.confirmable %}
2 | <h2>Menu</h2>
3 | <ul>
4 | <li><a href="{{ url_for_security('login') }}">Login</a></li>
5 | {% if security.registerable %}
6 | <li><a href="{{ url_for_security('register') }}">Register</a></li>
7 | {% endif %}
8 | {% if security.recoverable %}
9 | <li><a href="{{ url_for_security('forgot_password') }}">Forgot password</a></li>
10 | {% endif %}
11 | {% if security.confirmable %}
12 | <li><a href="{{ url_for_security('send_confirmation') }}">Confirm account</a></li>
13 | {% endif %}
14 | </ul>
15 | {% endif %}
16 |
--------------------------------------------------------------------------------
/Src/Web/templates/security/_messages.html:
--------------------------------------------------------------------------------
1 | {%- with messages = get_flashed_messages(with_categories=true) -%}
2 | {% if messages %}
3 | <ul class="flash-messages">
4 | {% for category, message in messages %}
5 | <li class="{{ category }}">{{ message }}</li>
6 | {% endfor %}
7 | </ul>
8 | {% endif %}
9 | {%- endwith %}
--------------------------------------------------------------------------------
/Src/Web/templates/security/login_user.html:
--------------------------------------------------------------------------------
1 | {% extends 'admin/master.html' %}
2 | {% from "security/_macros.html" import render_field, render_field_with_errors, render_checkbox_field %}
3 | {% include "security/_messages.html" %}
4 | {% block body %}
5 | {{ super() }}
6 |
24 | {% endblock body %}
--------------------------------------------------------------------------------
/Src/Web/templates/security/register_user.html:
--------------------------------------------------------------------------------
1 | {% extends 'admin/master.html' %}
2 | {% from "security/_macros.html" import render_field_with_errors, render_field %}
3 | {% include "security/_messages.html" %}
4 | {% block body %}
5 | {{ super() }}
6 |
7 |
8 |
Register
9 |
10 |
19 |
Already signed up? Please log in.
20 |
21 |
22 |
23 | {% endblock body %}
--------------------------------------------------------------------------------
/Test/.pytest_cache/v/cache/lastfailed:
--------------------------------------------------------------------------------
1 | {
2 | "testGetFreeProxy.py::testGetFreeProxy": true
3 | }
--------------------------------------------------------------------------------
/Test/.pytest_cache/v/cache/nodeids:
--------------------------------------------------------------------------------
1 | [
2 | "testGetFreeProxy.py::testGetFreeProxy"
3 | ]
--------------------------------------------------------------------------------
/Test/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
--------------------------------------------------------------------------------
/Test/testGetConfig.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from Util.GetConfig import GetConfig
4 |
5 |
6 | # noinspection PyPep8Naming
7 | def testGetConfig():
8 | """
9 | test class GetConfig in Util/GetConfig
10 | :return:
11 | """
12 | gg = GetConfig()
13 | print(gg.db_type)
14 | print(gg.db_name)
15 | print(gg.db_host)
16 | print(gg.db_port)
17 | assert isinstance(gg.proxy_getter_functions, list)
18 | print(gg.proxy_getter_functions)
19 |
20 | if __name__ == '__main__':
21 | testGetConfig()
22 |
--------------------------------------------------------------------------------
/Test/testGetFreeProxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 | import sys
5 | import requests
6 |
7 |
8 | try:
9 | from importlib import reload # py3: never actually used, only keeps the syntax checker quiet
10 | except:
11 | reload(sys)
12 | sys.setdefaultencoding('utf-8')
13 |
14 | sys.path.append('..')
15 | from ProxyGetter.getFreeProxy import GetFreeProxy
16 | from Util.GetConfig import GetConfig
17 |
18 |
19 | # noinspection PyPep8Naming
20 | def testGetFreeProxy():
21 | """
22 | test class GetFreeProxy in ProxyGetter/GetFreeProxy
23 | :return:
24 | """
25 | gc = GetConfig()
26 | proxy_getter_functions = gc.proxy_getter_functions
27 | for proxyGetter in proxy_getter_functions:
28 | proxy_count = 0
29 | for proxy in getattr(GetFreeProxy, proxyGetter.strip())():
30 | if proxy:
31 | print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy,proxy_count=proxy_count))
32 | proxy_count += 1
33 | #assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter)
34 |
35 |
36 | if __name__ == '__main__':
37 | testGetFreeProxy()
38 |
--------------------------------------------------------------------------------
/Test/testLogHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from Log.LogHandler import LogHandler
4 |
5 |
6 | # noinspection PyPep8Naming
7 | def testLogHandler():
8 | """
9 | test function LogHandler in Log/LogHandler
10 | :return:
11 | """
12 | log = LogHandler('test')
13 | log.info('this is a log from test')
14 |
15 | log.resetName(name='test1')
16 | log.info('this is a log from test1')
17 |
18 | log.resetName(name='test2')
19 | log.info('this is a log from test2')
20 |
21 |
22 | if __name__ == '__main__':
23 | testLogHandler()
24 |
--------------------------------------------------------------------------------
/Test/testWebRequest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from Util.WebRequest import WebRequest
4 |
5 |
6 | # noinspection PyPep8Naming
7 | def testWebRequest():
8 | """
9 | test class WebRequest in Util/WebRequest.py
10 | :return:
11 | """
12 | wr = WebRequest()
13 | request_object = wr.get('https://www.baidu.com/')
14 | assert request_object.status_code == 200
15 |
16 |
17 | if __name__ == '__main__':
18 | testWebRequest()
19 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-time-machine
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | APScheduler==3.2.0
2 | Werkzeug==0.15.3
3 | Flask==1.0.2
4 | requests==2.20.0
5 | lxml==4.3.3
6 | gevent==1.4.0
7 | Flask-RESTful==0.3.6
8 |
9 | ipip-datx==0.4.0
10 | pymongo==3.7.2
11 |
12 | flask-mongoengine==0.8.2
13 | Flask-Admin==1.5.3
14 | Flask-Security==3.0.0
15 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 | sys.path.append("Src")
5 |
6 | from Run.main import main
7 |
8 | main(test=True)
--------------------------------------------------------------------------------
/version:
--------------------------------------------------------------------------------
1 | 0.1.0
--------------------------------------------------------------------------------