├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── __init__.py
├── config.py
├── crawler
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
    │   ├── __init__.py
    │   ├── proxy
    │       ├── __init__.py
    │       ├── basespider.py
    │       ├── data5u.py
    │       ├── freeproxylists.py
    │       ├── gatherproxy.py
    │       ├── hidemy.py
    │       ├── ip181.py
    │       ├── kuaidaili.py
    │       ├── proxydb.py
    │       ├── proxylistplus.py
    │       ├── sixsixip.py
    │       ├── usproxy.py
    │       └── xicidaili.py
    │   └── validator
    │       ├── __init__.py
    │       ├── amazoncn.py
    │       ├── anjuke.py
    │       ├── assetstore.py
    │       ├── baidu.py
    │       ├── bbs.py
    │       ├── boss.py
    │       ├── douban.py
    │       ├── gather.py
    │       ├── httpbin.py
    │       ├── jd.py
    │       ├── lagou.py
    │       ├── liepin.py
    │       ├── steam.py
    │       ├── validator.py
    │       └── zhilian.py
├── db.sql
├── ipproxytool.py
├── proxy.py
├── requirements.txt
├── run_crawl_proxy.py
├── run_server.py
├── run_spider.py
├── run_validator.py
├── run_validator_async.py
├── scrapy.cfg
├── server
    ├── __init__.py
    └── dataserver.py
├── sql
    ├── __init__.py
    ├── mongodb.py
    ├── mysql.py
    ├── sql_base.py
    └── sql_manager.py
├── utils.py
└── weixin.png


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | 
 27 | # PyInstaller
 28 | #  Usually these files are written by a python script from a template
 29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 30 | *.manifest
 31 | *.spec
 32 | 
 33 | # Installer logs
 34 | pip-log.txt
 35 | pip-delete-this-directory.txt
 36 | 
 37 | # Unit test / coverage reports
 38 | htmlcov/
 39 | .tox/
 40 | .coverage
 41 | .coverage.*
 42 | .cache
 43 | nosetests.xml
 44 | coverage.xml
 45 | *,cover
 46 | .hypothesis/
 47 | 
 48 | # Translations
 49 | *.mo
 50 | *.pot
 51 | 
 52 | # Django stuff:
 53 | *.log
 54 | local_settings.py
 55 | 
 56 | # Flask stuff:
 57 | instance/
 58 | .webassets-cache
 59 | 
 60 | # Scrapy stuff:
 61 | .scrapy
 62 | 
 63 | # Sphinx documentation
 64 | docs/_build/
 65 | 
 66 | # PyBuilder
 67 | target/
 68 | 
 69 | # IPython Notebook
 70 | .ipynb_checkpoints
 71 | 
 72 | # pyenv
 73 | .python-version
 74 | 
 75 | # celery beat schedule file
 76 | celerybeat-schedule
 77 | 
 78 | # dotenv
 79 | .env
 80 | 
 81 | # virtualenv
 82 | venv/
 83 | ENV/
 84 | 
 85 | # Spyder project settings
 86 | .spyderproject
 87 | 
 88 | # Rope project settings
 89 | .ropeproject
 90 | 
 91 | # custom file
 92 | *.json
 93 | *.idea
 94 | *.DS_Store
 95 | *.pyc
 96 | test*
 97 | headers.py
 98 | 
 99 | # custom dir
100 | log/
101 | 
102 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/mrjogo/scrapy 
2 | ENV PATH /usr/local/bin:$PATH
3 | ENV PATH /home:$PATH
4 | ADD . /home
5 | WORKDIR /home
6 | RUN pip install -i https://mirrors.aliyun.com/pypi/simple -r requirements.txt
7 | CMD python ipproxytool.py


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # IPProxyTool
  2 | 使用 scrapy 爬虫抓取代理网站，获取大量的免费代理 ip。过滤出所有可用的 ip，存入数据库以备使用。
  3 | 可以访问我的个人站点，查看我的更多有趣项目 [西瓜](http://xigua233.com/)
  4 | 
  5 | 感谢 [youngjeff](https://github.com/youngjeff) 和我一起维护该项目
  6 | 
  7 | ## 运行环境
  8 | 安装 python3  and mysql 数据库
  9 | 
 10 | cryptography模块安装环境:
 11 | ```
 12 | sudo yum install gcc libffi-devel python-devel openssl-devel
 13 | ```
 14 | 
 15 | 
 16 | ```
 17 | $ pip install -r requirements.txt
 18 | ```
 19 | 
 20 | 
 21 | 
 22 | ## 下载使用
 23 | 将项目克隆到本地
 24 | 
 25 | ```
 26 | $ git clone https://github.com/awolfly9/IPProxyTool.git
 27 | ```
 28 | 
 29 | 进入工程目录
 30 | 
 31 | ```
 32 | $ cd IPProxyTool
 33 | ```
 34 | 修改 mysql 数据库配置 [config.py](https://github.com/awolfly9/IPProxyTool/blob/master/config.py) 中 database_config 的用户名和密码为数据库的用户名和密码
 35 | 
 36 | ```
 37 | $ vim config.py
 38 | ---------------
 39 | 
 40 | database_config = {
 41 | 	'host': 'localhost',
 42 | 	'port': 3306,
 43 | 	'user': 'root',
 44 | 	'password': '123456',
 45 | 	'charset': 'utf8',
 46 | }
 47 | ```
 48 | 
 49 | MYSQL: 导入数据表结构
 50 | ```
 51 | $ mysql> create database ipproxy;
 52 | Query OK, 1 row affected (0.00 sec)
 53 | $ mysql> use ipproxy;
 54 | Database changed
 55 | $ mysql> source '/你的项目目录/db.sql'
 56 | 
 57 | ```
 58 | 
 59 | 
 60 | 运行启动脚本 ipproxytool.py 也可以分别运行抓取，验证，服务器接口脚本，运行方法参考项目说明
 61 | 
 62 | ```
 63 | $ python ipproxytool.py 
 64 | ```
 65 | 
 66 | 新增异步验证方式，运行方法如下
 67 | 
 68 | ```
 69 | $ python ipproxytool.py async
 70 | ```
 71 | <br>
 72 | 
 73 | ## 项目说明
 74 | #### 抓取代理网站
 75 | 所有抓取代理网站的代码都在 [proxy](https://github.com/awolfly9/IPProxyTool/tree/master/ipproxytool/spiders/proxy)<br/>
 76 | ##### 扩展抓取其他的代理网站
 77 | 1.在 proxy 目录下新建脚本并继承自 BaseSpider <br/>
 78 | 2.设置 name、urls、headers<br/>
 79 | 3.重写 parse_page 方法，提取代理数据<br/>
 80 | 4.将数据存入数据库 具体可以参考 [ip181](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/ip181.py)                 [kuaidaili](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/kuaidaili.py)<br/>
 81 | 5.如果需要抓取特别复杂的代理网站，可以参考[peuland](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/peuland.py)<br/>
 82 | 
 83 | ##### 修改 run_crawl_proxy.py 导入抓取库，添加到抓取队列
 84 | 
 85 | 可以单独运行 run_crawl_proxy.py 脚本开始抓取代理网站
 86 | 
 87 | ```
 88 | $ python run_crawl_proxy.py
 89 | ```
 90 | 
 91 | #### 验证代理 ip 是否有效
 92 | 目前验证方式：<br>
 93 | 1.从上一步抓取并存储的数据库中取出所有的代理 IP <br>
 94 | 2.利用取出的代理 IP 去请求 [httpbin](http://httpbin.org/get?show_env=1)<br>
 95 | 3.根据请求结果判断出代理 IP 的有效性，是否支持 HTTPS 以及匿名度，并存储到表 httpbin 中<br>
 96 | 4.从 httpbin 表中取出代理去访问目标网站，例如 [豆瓣](https://www.douban.com/)<br>
 97 | 5.如果请求在合适的时间返回成功的数据，则认为这个代理 IP 有效。并且存入相应的表中<br>
 98 | 
 99 | 一个目标网站对应一个脚本，所有验证代理 ip 的代码都在 [validator](https://github.com/awolfly9/IPProxyTool/tree/master/ipproxytool/spiders/validator)
100 | ##### 扩展验证其他网站
101 | 1.在 validator 目录下新建脚本并继承 Validator <br>
102 | 2.设置 name、timeout、urls、headers <br>
103 | 3.然后调用 init 方法,可以参考 [baidu](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/baidu.py) [douban](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/douban.py)<br>
104 | 4.如果需要特别复杂的验证方式，可以参考 [assetstore](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/assetstore.py)<br>
105 | ##### 修改 run_validator.py 导入验证库，添加到验证队列
106 | 可以单独运行 run_validator.py 开始验证代理ip的有效性
107 | 
108 | ```
109 | $ python run_validator.py
110 | ```
111 | 
112 | ### 获取代理 ip 数据服务器接口
113 | 在 config.py 中修改启动服务器端口配置 data_port，默认为 8000
114 | 启动服务器
115 | 
116 | ```
117 | $ python run_server.py
118 | ```
119 | 
120 | 服务器提供接口
121 | #### 获取
122 | <http://127.0.0.1:8000/select?name=httpbin&anonymity=1&https=yes&order=id&sort=desc&count=100>
123 | 
124 | 参数
125 | 
126 | | Name    | Type   | Description   | must |
127 | | ----    | ----   | ----          | ---- |
128 | | name    | str    | 数据库名称      | 是   |
129 | | anonymity | int  | 1:高匿 2:匿名 3:透明 | 否 |
130 | | https     | str  | https:yes http:no  | 否 |
131 | | order     | str  | table 字段  | 否 |
132 | | sort      | str | asc 升序，desc 降序 | 否 |
133 | | count | int | 获取代理数量，默认 100 | 否 |
134 | 
135 | 
136 | 
137 | 
138 | #### 删除
139 | <http://127.0.0.1:8000/delete?name=httpbin&ip=27.197.144.181>
140 | 
141 | 参数
142 | 
143 | | Name | Type | Description | 是否必须|
144 | | ----| ---- | ---- | --- |
145 | | name | str | 数据库名称 |  是 |
146 | | ip | str | 需要删除的 ip | 是 |
147 | 
148 | #### 插入
149 | <http://127.0.0.1:8000/insert?name=httpbin&ip=555.22.22.55&port=335&country=%E4%B8%AD%E5%9B%BD&anonymity=1&https=yes&speed=5&source=100>
150 | 
151 | 参数
152 | 
153 | | Name | Type | Description | 是否必须|
154 | | ----| ---- | ---- | ----|
155 | | name | str | 数据库名称 |是 |
156 | | ip | str | ip 地址 | 是|
157 | | port | str | 端口 |是|
158 | | country | str | 国家 |否|
159 | | anonymity | int | 1:高匿,2:匿名,3:透明  |否|
160 | | https | str | yes:https,no:http |否|
161 | | speed | float | 访问速度 |否|
162 | | source | str | ip 来源 |否|
163 | 
164 | 
165 | ## TODO
166 | * 添加多数据库支持
167 |   * mysql
168 |   * redis TODO...
169 |   * sqlite TODO...
170 | * 添加抓取更多免费代理网站，目前支持的抓取的免费代理 IP 站点，目前有一些国外的站点连接不稳定
171 |   * (国外) <http://www.freeproxylists.net/>
172 |   * (国外) <http://gatherproxy.com/>
173 |   * (国内) <https://hidemy.name/en/proxy-list/>
174 |   * (国内) <http://www.ip181.com/>
175 |   * (国内) <http://www.kuaidaili.com/>
176 |   * (国外) <https://proxy.peuland.com/proxy_list_by_category.htm>
177 |   * (国外) <https://list.proxylistplus.com/>
178 |   * (国内) <http://m.66ip.cn>
179 |   * (国外) <http://www.us-proxy.org/>
180 |   * (国内) <http://www.xicidaili.com>
181 | * 分布式部署项目
182 | * ~~添加服务器获取接口更多筛选条件~~
183 | * ~~多进程验证代理 IP~~
184 | * ~~添加 https 支持~~
185 | * ~~添加检测 ip 的匿名度~~
186 | 
187 | 
188 | ## 参考
189 | * [IPProxyPool](https://github.com/qiyeboy/IPProxyPool)
190 | 
191 | 
192 | ## 项目更新
193 | -----------------------------2020-12-29----------------------------<br>
194 | 1. 修改之前错误的路径命名
195 | 2. 修改mysql 表结构
196 | <br>
197 | -----------------------------2017-6-23----------------------------<br>
198 | 1.python2 -> python3<br>
199 | 2.web.py -> flask<br>
200 | <br>
201 | -----------------------------2017-5-17----------------------------<br>
202 | 1.本系统在原来的基础上加入了docker。操作见下方，关于docker的相关知识可以上官网看看http://www.docker.com.<br>
203 | <br>
204 | -----------------------------2017-3-30----------------------------<br>
205 | 1.修改完善 readme<br>
206 | 2.数据插入支持事务<br>
207 | <br>
208 | -----------------------------2017-3-14----------------------------<br>
209 | 1.更改服务器接口，添加排序方式<br>
210 | 2.添加多进程方式验证代理 ip 的有效性<br>
211 | <br>
212 | -----------------------------2017-2-20----------------------------<br>
213 | 1.添加服务器获取接口更多筛选条件<br>
214 | <br>
215 | 
216 | -----------------------------2017-2-16----------------------------<br>
217 | 1.验证代理 IP 的匿名度<br>
218 | 2.验证代理 IP HTTPS 支持<br>
219 | 3.添加 httpbin 验证并发数设置，默认为 4
220 | 
221 | 
222 | 
223 | 
224 | 
225 | 
226 | 
227 | 
228 | 
229 | 
230 | 
231 | 
232 | ## 在系统中安装docker就可以使用本程序：
233 | 
234 | 下载本程序
235 | ```
236 | git clone https://github.com/awolfly9/IPProxyTool
237 | ```
238 | 
239 | 然后进入目录：
240 | ```
241 | cd IPProxyTool
242 | ```
243 | 
244 | 创建镜像：
245 | ```
246 | docker build -t proxy .
247 | ```
248 | 
249 | 运行容器：
250 | ```
251 | docker run -it proxy
252 | ```
253 | 
254 | ## 在config.py中按照自己的需求修改配置信息
255 | ```
256 | database_config = {
257 |     'host': 'localhost',
258 |     'port': 3306,
259 |     'user': 'root',
260 |     'password': 'root',
261 |     'charset': 'utf8',
262 | }
263 | ```
264 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 | 
3 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | DB_config = {
 4 |     # 'db_type': 'mongodb',
 5 |     'db_type': 'mysql',
 6 | 
 7 |     'mysql': {
 8 |         'host': 'localhost',
 9 |         'port': 3306,
10 |         'user': 'root',
11 |         'password': '123456',
12 |         'charset': 'utf8',
13 |     },
14 |     'redis': {
15 |         'host': 'localhost',
16 |         'port': 6379,
17 |         'password': '123456',
18 |         'db': 1,
19 |     },
20 |     'mongodb':{
21 |         'host': 'localhost',
22 |         'port': 27017,
23 |         'username': '',
24 |         'password': '',
25 |     }
26 | }
27 | 
28 | database = 'ipproxy'
29 | free_ipproxy_table = 'free_ipproxy'
30 | httpbin_table = 'httpbin'
31 | 
32 | data_port = 8000
33 | 


--------------------------------------------------------------------------------
/crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awolfly9/IPProxyTool/4e4e3aadd30a75f74393b54e8077568b6a58a813/crawler/__init__.py


--------------------------------------------------------------------------------
/crawler/items.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define here the models for your scraped items
 4 | #
 5 | # See documentation in:
 6 | # http://doc.scrapy.org/en/latest/topics/items.html
 7 | 
 8 | import scrapy
 9 | 
10 | 
11 | class CrawlerItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 


--------------------------------------------------------------------------------
/crawler/pipelines.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | 
 9 | class CrawlerPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 


--------------------------------------------------------------------------------
/crawler/settings.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Scrapy settings for crawler project
 4 | #
 5 | # For simplicity, this file contains only settings considered important or
 6 | # commonly used. You can find more settings consulting the documentation:
 7 | #
 8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
 9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'crawler'
13 | 
14 | SPIDER_MODULES = ['crawler.spiders','crawler.spiders.proxy']
15 | NEWSPIDER_MODULE = 'crawler.spiders'
16 | 
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)'
19 | 
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 | 
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | #CONCURRENT_REQUESTS = 32
25 | 
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 0.5
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 | 
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 | 
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 | 
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | #   'Accept-Language': 'en',
44 | #}
45 | 
46 | # Enable or disable spider middlewares
47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | #    'crawler.middlewares.MyCustomSpiderMiddleware': 543,
50 | #}
51 | 
52 | # Enable or disable downloader middlewares
53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
54 | #DOWNLOADER_MIDDLEWARES = {
55 | #    'crawler.middlewares.MyCustomDownloaderMiddleware': 543,
56 | #}
57 | 
58 | # Enable or disable extensions
59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
60 | EXTENSIONS = {
61 |     'scrapy.extensions.telnet.TelnetConsole': None,
62 | }
63 | 
64 | # Configure item pipelines
65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
66 | #ITEM_PIPELINES = {
67 | #    'crawler.pipelines.SomePipeline': 300,
68 | #}
69 | 
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 | 
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 | 
91 | # RETRY_ENABLED = False
92 | 
93 | LOG_ENABLED = True
94 | 


--------------------------------------------------------------------------------
/crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/basespider.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import sys
 4 | import config
 5 | import utils
 6 | import datetime
 7 | 
 8 | from scrapy.spiders import Spider
 9 | from scrapy.http import Request
10 | from sql import SqlManager
11 | 
12 | 
13 | class BaseSpider(Spider):
14 |     name = 'basespider'
15 | 
16 |     def __init__(self, *a, **kw):
17 |         super(BaseSpider, self).__init__(*a, **kw)
18 | 
19 |         self.urls = []
20 |         self.headers = {}
21 |         self.timeout = 10
22 |         self.is_record_web_page = True
23 | 
24 |         self.sql = SqlManager()
25 | 
26 |     def init(self):
27 |         self.meta = {
28 |             'download_timeout': self.timeout,
29 |         }
30 | 
31 |         self.dir_log = 'log/proxy/%s' % self.name
32 |         utils.make_dir(self.dir_log)
33 |         self.sql.init_proxy_table(config.free_ipproxy_table)
34 | 
35 |     def start_requests(self):
36 |         for i, url in enumerate(self.urls):
37 |             yield Request(
38 |                 url=url,
39 |                 headers=self.headers,
40 |                 meta=self.meta,
41 |                 dont_filter=True,
42 |                 callback=self.parse_page,
43 |                 errback=self.error_parse,
44 |             )
45 | 
46 |     def parse_page(self, response):
47 |         self.write(response.body)
48 |         pass
49 | 
50 |     def error_parse(self, failure):
51 |         request = failure.request
52 |         pass
53 | 
54 |     def add_proxy(self, proxy):
55 |         self.sql.insert_proxy(config.free_ipproxy_table, proxy)
56 | 
57 |     def write(self, data):
58 |         if self.is_record_web_page:
59 |             with open('%s/%s.html' % (self.dir_log, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f')),
60 |                       'wb') as f:
61 |                 f.write(data)
62 |                 f.close()
63 | 
64 |     def close(spider, reason):
65 |         spider.sql.commit()
66 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/data5u.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from scrapy import Selector
 4 | from .basespider import BaseSpider
 5 | from proxy import Proxy
 6 | 
 7 | 
 8 | class Data5uSpider(BaseSpider):
 9 |     name = 'data5u'
10 | 
11 |     def __init__(self, *a, **kw):
12 |         # 在类的继承中，如果重定义某个方法，该方法会覆盖父类的同名方法
13 |         # 但有时，我们希望能同时实现父类的功能，这时，我们就需要调用父类的方法了，可通过使用 super 来实现，比如：
14 |         super(Data5uSpider, self).__init__(*a, **kw)
15 | 
16 |         self.urls = [
17 |             'http://www.data5u.com/'
18 |         ]
19 |         self.headers = {
20 |             # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
21 |             # 'Accept-Encoding': 'gzip, deflate, sdch',
22 |             # 'Accept-Language': 'zh-CN,zh;q=0.8',
23 |             # 'Connection': 'keep-alive',
24 |             'Host': 'www.data5u.com',
25 |             'Upgrade-Insecure-Requests': 1,
26 |             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
27 |         }
28 | 
29 |         self.init()
30 | 
31 |     def parse_page(self, response):
32 |         self.write(response.body)
33 | 
34 |         sel = Selector(response)
35 |         infos = sel.xpath('//ul[@class="l2"]').extract()
36 |         for i, info in enumerate(infos):
37 |             val = Selector(text=info)
38 |             ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
39 |             port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
40 |             anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
41 |             https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
42 |             country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()
43 | 
44 |             proxy = Proxy()
45 |             proxy.set_value(
46 |                 ip=ip,
47 |                 port=port,
48 |                 country=country,
49 |                 anonymity=anonymity,
50 |                 source=self.name,
51 |             )
52 |             self.add_proxy(proxy=proxy)
53 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/freeproxylists.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import urllib
 4 | import re
 5 | 
 6 | from proxy import Proxy
 7 | from .basespider import BaseSpider
 8 | from bs4 import BeautifulSoup
 9 | 
10 | 
11 | class FreeProxyListsSpider(BaseSpider):
12 |     name = 'freeproxylists'
13 | 
14 |     def __init__(self, *a, **kwargs):
15 |         super(FreeProxyListsSpider, self).__init__(*a, **kwargs)
16 |         self.urls = [
17 |             'http://www.freeproxylists.net/'
18 |         ]
19 |         self.headers = {
20 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
21 |             'Accept-Encoding': 'gzip, deflate',
22 |             'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
23 |             'Connection': 'keep-alive',
24 |             'Host': 'www.freeproxylists.net',
25 |             'Upgrade-Insecure-Requests': '1',
26 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
27 |         }
28 | 
29 |         self.init()
30 | 
31 |     def parse_page(self, response):
32 |         pattern = re.compile('<tr class=(.*?)</tr>', re.S)
33 |         items = re.findall(pattern = pattern, string = response.body)
34 |         for i, item in enumerate(items):
35 |             if i > 0:
36 |                 if 'async' in item:
37 |                     continue
38 | 
39 |                 ip_pattern = re.compile('IPDecode\(\"(.*?)\"\)', re.S)
40 |                 ip_decode = re.findall(ip_pattern, item)[0]
41 |                 ip_url = urllib.unquote(ip_decode)
42 |                 ip_soup = BeautifulSoup(ip_url, 'lxml')
43 |                 ip = ip_soup.text.encode()
44 | 
45 |                 item = '<tr class=' + item + '</tr>'
46 |                 soup = BeautifulSoup(item, 'lxml')
47 |                 tbodys = soup.find_all('td')
48 | 
49 |                 proxy = Proxy()
50 |                 proxy.set_value(
51 |                         ip = ip,
52 |                         port = tbodys[1].text.encode(),
53 |                         country = tbodys[4].text.encode(),
54 |                         anonymity = tbodys[3].text.encode(),
55 |                         source = self.name,
56 |                 )
57 | 
58 |                 self.add_proxy(proxy = proxy)
59 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/gatherproxy.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import json
 4 | import random
 5 | import re
 6 | import requests
 7 | 
 8 | from proxy import Proxy
 9 | from .basespider import BaseSpider
10 | 
11 | 
12 | class GatherproxySpider(BaseSpider):
13 |     name = 'gatherproxy'
14 | 
15 |     def __init__(self, *a, **kwargs):
16 |         super(GatherproxySpider, self).__init__(*a, **kwargs)
17 |         self.urls = [
18 |             'http://gatherproxy.com/',
19 |             'http://www.gatherproxy.com/proxylist/anonymity/?t=Anonymous',
20 |             'http://gatherproxy.com/proxylist/country/?c=China',
21 |         ]
22 | 
23 |         self.headers = {
24 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
25 |             'Accept-Encoding': 'gzip, deflate',
26 |             'Accept-Language': 'en-US,en;q=0.5',
27 |             'Connection': 'keep-alive',
28 |             'Host': 'www.gatherproxy.com',
29 |             'Upgrade-Insecure-Requests': '1',
30 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0'
31 |         }
32 | 
33 |         # self.proxies = self.get_proxy()
34 |         self.init()
35 | 
36 |     def parse_page(self, response):
37 |         pattern = re.compile('gp.insertPrx\((.*?)\)', re.S)
38 |         items = re.findall(pattern, response.body.decode())
39 |         for item in items:
40 |             data = json.loads(item)
41 |             #端口用的是十六进制
42 |             port = data.get('PROXY_PORT')
43 |             port = str(int(port, 16))
44 | 
45 |             proxy = Proxy()
46 |             proxy.set_value(
47 |                     ip = data.get('PROXY_IP'),
48 |                     port = port,
49 |                     country = data.get('PROXY_COUNTRY'),
50 |                     anonymity = data.get('PROXY_TYPE'),
51 |                     source = self.name,
52 |             )
53 | 
54 |             self.add_proxy(proxy = proxy)
55 | 
56 |     def get_proxy(self):
57 |         try:
58 |             url = 'http://127.0.0.1:8000/?name={0}'.format(self.name)
59 |             r = requests.get(url = url)
60 |             if r.text != None and r.text != '':
61 |                 data = json.loads(r.text)
62 |                 if len(data) > 0:
63 |                     proxy = random.choice(data)
64 |                     ip = proxy.get('ip')
65 |                     port = proxy.get('port')
66 |                     address = '%s:%s' % (ip, port)
67 | 
68 |                     proxies = {
69 |                         'http': 'http://%s' % address
70 |                     }
71 |                     return proxies
72 |         except:
73 |             return None
74 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/hidemy.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | import utils
 4 | 
 5 | from scrapy import Selector
 6 | from .basespider import BaseSpider
 7 | from proxy import Proxy
 8 | 
 9 | 
10 | class HidemySpider(BaseSpider):
11 |     name = 'hidemy'
12 | 
13 |     def __init__(self, *a, **kw):
14 |         super(HidemySpider, self).__init__(*a, **kw)
15 | 
16 |         self.urls = ['https://hidemy.name/en/proxy-list/?start=%s' % n for n in range(0, 5 * 64, 64)]
17 |         self.headers = {
18 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 |             'Accept-Encoding': 'gzip, deflate, br',
20 |             'Accept-Language': 'en-US,en;q=0.5',
21 |             'Connection': 'keep-alive',
22 |             'Host': 'hidemy.name',
23 |             'Referer': 'https://hidemy.name/en/proxy-list/?start=0',
24 |             'Upgrade-Insecure-Requests': '1',
25 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
26 |         }
27 | 
28 |         self.init()
29 | 
30 |     def parse_page(self, response):
31 |         self.write(response.body)
32 | 
33 |         sel = Selector(response)
34 |         infos = sel.xpath('//tbody/tr').extract()
35 |         for i, info in enumerate(infos):
36 |             if i == 0:
37 |                 continue
38 | 
39 |             val = Selector(text = info)
40 |             ip = val.xpath('//td[1]/text()').extract_first()
41 |             port = val.xpath('//td[2]/text()').extract_first()
42 |             country = val.xpath('//td[3]/div/text()').extract_first()
43 |             anonymity = val.xpath('//td[6]/text()').extract_first()
44 | 
45 |             proxy = Proxy()
46 |             proxy.set_value(
47 |                     ip = ip,
48 |                     port = port,
49 |                     country = country,
50 |                     anonymity = anonymity,
51 |                     source = self.name,
52 |             )
53 | 
54 |             self.add_proxy(proxy = proxy)
55 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/ip181.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from scrapy import Selector
 4 | from .basespider import BaseSpider
 5 | from proxy import Proxy
 6 | 
 7 | 
 8 | class IpOneEightOneSpider(BaseSpider):
 9 |     name = 'ip181'
10 | 
11 |     def __init__(self, *a, **kw):
12 |         super(IpOneEightOneSpider, self).__init__(*a, **kw)
13 | 
14 |         self.urls = ['http://www.ip181.com/']
15 |         self.headers = {
16 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 |             'Accept-Encoding': 'gzip, deflate',
18 |             'Accept-Language': 'en-US,en;q=0.5',
19 |             'Connection': 'keep-alive',
20 |             'Host': 'www.ip181.com',
21 |             'Upgrade-Insecure-Requests': '1',
22 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
23 |         }
24 | 
25 |         self.init()
26 | 
27 |     def parse_page(self, response):
28 |         self.write(response.body)
29 | 
30 |         sel = Selector(response)
31 |         infos = sel.xpath('//tbody/tr').extract()
32 |         for i, info in enumerate(infos):
33 |             if i == 0:
34 |                 continue
35 | 
36 |             val = Selector(text = info)
37 |             ip = val.xpath('//td[1]/text()').extract_first()
38 |             port = val.xpath('//td[2]/text()').extract_first()
39 |             country = val.xpath('//td[6]/text()').extract_first()
40 |             anonymity = val.xpath('//td[3]/text()').extract_first()
41 |             https = val.xpath('//td[4]/text()').extract_first()
42 | 
43 |             proxy = Proxy()
44 |             proxy.set_value(
45 |                     ip = ip,
46 |                     port = port,
47 |                     country = country,
48 |                     anonymity = anonymity,
49 |                     source = self.name,
50 |             )
51 | 
52 |             self.add_proxy(proxy = proxy)
53 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/kuaidaili.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | import re
 4 | 
 5 | from proxy import Proxy
 6 | from .basespider import BaseSpider
 7 | 
 8 | 
 9 | class KuaiDaiLiSpider(BaseSpider):
10 |     name = 'kuaidaili'
11 | 
12 |     def __init__(self, *a, **kwargs):
13 |         super(KuaiDaiLiSpider, self).__init__(*a, **kwargs)
14 | 
15 |         self.urls = ['https://www.kuaidaili.com/free/inha/%s/' % i for i in range(1, 5)]
16 | 
17 |         self.headers = {
18 |             'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
19 |             'Accept-Encoding':'gzip, deflate',
20 |             'Accept-Language':'zh-CN,zh;q=0.9',
21 |             'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
22 |         }
23 | 
24 |         self.is_record_web_page = False
25 |         self.init()
26 | 
27 |     def parse_page(self, response):
28 |         pattern = re.compile(
29 |                 '<tr>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>('
30 |                 '.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?<td.*?>(.*?)</td>\s.*?</tr>',
31 |                 re.S)
32 |         items = re.findall(pattern, response.body.decode())
33 | 
34 |         for item in items:
35 |             proxy = Proxy()
36 |             proxy.set_value(
37 |                     ip = item[0],
38 |                     port = item[1],
39 |                     country = item[4],
40 |                     anonymity = item[2],
41 |                     source = self.name,
42 |             )
43 | 
44 |             self.add_proxy(proxy)
45 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/proxydb.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | from proxy import Proxy
 4 | from .basespider import BaseSpider
 5 | from scrapy.selector import Selector
 6 | import re
 7 | from base64 import b64decode
 8 | 
 9 | class ProxyDBSpider(BaseSpider):
10 |     name = 'proxydb'
11 | 
12 |     def __init__(self, *a, **kwargs):
13 |         super(ProxyDBSpider, self).__init__(*a, **kwargs)
14 |         self.urls = ['http://proxydb.net/?protocol=http&protocol=https&offset=%s' % n for n in range(1, 500, 50)]
15 |         self.headers = {
16 |             'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
17 |             'Accept-Encoding':'gzip, deflate',
18 |             'Accept-Language':'zh-CN,zh;q=0.9',
19 |             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
20 |         }
21 | 
22 |         self.is_record_web_page = False
23 |         self.init()
24 | 
25 |     def parse_page(self, response):
26 |         super(ProxyDBSpider, self).parse_page(response)
27 |         for table_item in response.xpath('//tbody/tr'):
28 |             ip,port = self.parse_ip(table_item.xpath('.//td[1]/script/text()').extract_first())
29 |             country = table_item.xpath('.//td/img/@title').extract_first().strip()
30 |             anonymity = table_item.xpath('.//td/span/text()').extract_first().strip()
31 |             proxy = Proxy()
32 |             proxy.set_value(
33 |                     ip = ip,
34 |                     port = port,
35 |                     country = country,
36 |                     anonymity = anonymity,
37 |                     source = self.name
38 |             )
39 |             self.add_proxy(proxy = proxy)
40 | 
41 |     def parse_ip(self, page):
42 |         ip_part1 = re.search(r'\'(.*)\'\.split',page).group(1)[::-1]
43 |         ip_part2= ''.join([chr(int(x,16)) for x in re.findall(r'\\x([0-9A-Fa-f]{2})', page)])
44 |         ip_part2= b64decode(ip_part2).decode('utf-8')
45 |         port = re.search(r'pp = -(\d+) \+ (\d+);',page).groups()
46 |         port = -int(port[0]) + int(port[1])
47 |         return [''.join([ip_part1,ip_part2]),port]
48 | 
49 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/proxylistplus.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from scrapy import Selector
 4 | from .basespider import BaseSpider
 5 | from proxy import Proxy
 6 | 
 7 | 
 8 | class ProxylistplusSpider(BaseSpider):
 9 |     name = 'proxylistplus'
10 | 
11 |     def __init__(self, *a, **kw):
12 |         super(ProxylistplusSpider, self).__init__(*a, **kw)
13 | 
14 |         self.urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-%s' % n for n in range(1, 3)]
15 |         self.headers = {
16 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 |             'Accept-Encoding': 'gzip, deflate, br',
18 |             'Accept-Language': 'en-US,en;q=0.5',
19 |             'Cache-Control': 'max-age=0',
20 |             'Connection': 'keep-alive',
21 |             'Host': 'list.proxylistplus.com',
22 |             'If-Modified-Since': 'Mon, 20 Feb 2017 07:47:35 GMT',
23 |             'If-None-Match': 'list381487576865',
24 |             'Referer': 'https://list.proxylistplus.com/Fresh-HTTP-Proxy',
25 |             'Upgrade-Insecure-Requests': '1',
26 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
27 |         }
28 | 
29 |         self.is_record_web_page = False
30 |         self.init()
31 | 
32 |     def parse_page(self, response):
33 |         self.write(response.body)
34 | 
35 |         sel = Selector(response)
36 |         infos = sel.xpath('//tr[@class="cells"]').extract()
37 |         for i, info in enumerate(infos):
38 |             self.log(info)
39 |             val = Selector(text = info)
40 | 
41 |             ip = val.xpath('//td[2]/text()').extract_first()
42 |             port = val.xpath('//td[3]/text()').extract_first()
43 |             country = val.xpath('//td[5]/text()').extract_first()
44 |             anonymity = val.xpath('//td[4]/text()').extract_first()
45 | 
46 |             proxy = Proxy()
47 |             proxy.set_value(
48 |                     ip = ip,
49 |                     port = port,
50 |                     country = country,
51 |                     anonymity = anonymity,
52 |                     source = self.name,
53 |             )
54 | 
55 |             self.add_proxy(proxy = proxy)
56 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/sixsixip.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import re
 4 | 
 5 | from proxy import Proxy
 6 | from .basespider import BaseSpider
 7 | 
 8 | 
 9 | class SixSixIpSpider(BaseSpider):
10 |     name = 'sixsixip'
11 | 
12 |     def __init__(self, *a, **kwargs):
13 |         super(SixSixIpSpider, self).__init__(*a, **kwargs)
14 | 
15 |         self.urls = ['http://m.66ip.cn/%s.html' % n for n in range(1, 10)]
16 |         self.headers = {
17 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
18 |             'Accept-Encoding': 'gzip, deflate',
19 |             'Accept-Language': 'en-US,en;q=0.5',
20 |             'Cache-Control': 'max-age=0',
21 |             'Connection': 'keep-alive',
22 |             'Host': 'm.66ip.cn',
23 |             'Upgrade-Insecure-Requests': '1',
24 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
25 |         }
26 | 
27 |         self.init()
28 | 
29 |     def parse_page(self, response):
30 |         pattern = re.compile('<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td></tr>',
31 |                              re.S)
32 |         items = re.findall(pattern, response.body.decode())
33 |         for i, item in enumerate(items):
34 |             if i >= 1:
35 |                 proxy = Proxy()
36 |                 proxy.set_value(
37 |                         ip = item[0],
38 |                         port = item[1],
39 |                         country = item[2],
40 |                         anonymity = item[3],
41 |                         source = self.name
42 |                 )
43 | 
44 |                 self.add_proxy(proxy = proxy)
45 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/usproxy.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import re
 4 | 
 5 | from proxy import Proxy
 6 | from .basespider import BaseSpider
 7 | 
 8 | 
 9 | class UsProxySpider(BaseSpider):
10 |     name = 'usproxy'
11 | 
12 |     def __init__(self, *a, **kwargs):
13 |         super(UsProxySpider, self).__init__(*a, **kwargs)
14 | 
15 |         self.urls = [
16 |             'http://www.sslproxies.org/',
17 |             'http://www.us-proxy.org/',
18 |             'http://free-proxy-list.net/uk-proxy.html',
19 |             'http://www.socks-proxy.net/',
20 |         ]
21 |         self.headers = {
22 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
23 |             'Accept-Encoding': 'gzip, deflate',
24 |             'Accept-Language': 'en-US,en;q=0.5',
25 |             'Cache-Control': 'max-age=0',
26 |             'Connection': 'keep-alive',
27 |             'Host': 'www.us-proxy.org',
28 |             'If-Modified-Since': 'Tue, 24 Jan 2017 03:32:01 GMT',
29 |             'Referer': 'http://www.sslproxies.org/',
30 |             'Upgrade-Insecure-Requests': '1',
31 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
32 |         }
33 | 
34 |         self.init()
35 | 
36 |     def parse_page(self, response):
37 |         pattern = re.compile(
38 | 		'<tr><td>(.*?)</td><td>(.*?)</td><td>(.*?)</td><td.+?>(.*?)</td><td>(.*?)</td><td.+?>(.*?)</td><td.+?>(.*?)</td><td.+?>(.*?)</td></tr>',
39 |                 re.S)
40 |         items = re.findall(pattern, response.body.decode())
41 | 
42 |         if items is not None:
43 |             for item in items:
44 |                 proxy = Proxy()
45 |                 proxy.set_value(
46 |                         ip = item[0],
47 |                         port = item[1],
48 |                         country = item[3],
49 |                         anonymity = item[4],
50 |                         source = self.name,
51 |                 )
52 | 
53 |                 self.add_proxy(proxy)
54 | 


--------------------------------------------------------------------------------
/crawler/spiders/proxy/xicidaili.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from proxy import Proxy
 4 | from .basespider import BaseSpider
 5 | from scrapy.selector import Selector
 6 | 
 7 | 
 8 | class XiCiDaiLiSpider(BaseSpider):
 9 |     name = 'xici'
10 | 
11 |     def __init__(self, *a, **kw):
12 |         super(XiCiDaiLiSpider, self).__init__(*a, **kw)
13 | 
14 |         self.urls = ['http://www.xicidaili.com/nn/%s' % n for n in range(1, 10)]
15 |         self.headers = {
16 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 |             'Accept-Encoding': 'gzip, deflate',
18 |             'Accept-Language': 'en-US,en;q=0.5',
19 |             'Cache-Control': 'max-age=0',
20 |             'Connection': 'keep-alive',
21 |             'Host': 'www.xicidaili.com',
22 |             'If-None-Match': 'W/"cb655e834a031d9237e3c33f3499bd34"',
23 |             'Upgrade-Insecure-Requests': '1',
24 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
25 |         }
26 | 
27 |         self.init()
28 | 
29 |     def parse_page(self, response):
30 |         sel = Selector(text = response.body)
31 |         infos = sel.xpath('//tr[@class="odd"]').extract()
32 |         for info in infos:
33 |             val = Selector(text = info)
34 |             ip = val.xpath('//td[2]/text()').extract_first()
35 |             port = val.xpath('//td[3]/text()').extract_first()
36 |             country = val.xpath('//td[4]/a/text()').extract_first()
37 |             anonymity = val.xpath('//td[5]/text()').extract_first()
38 | 
39 |             proxy = Proxy()
40 |             proxy.set_value(
41 |                     ip = ip,
42 |                     port = port,
43 |                     country = country,
44 |                     anonymity = anonymity,
45 |                     source = self.name,
46 |             )
47 | 
48 |             self.add_proxy(proxy = proxy)
49 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/amazoncn.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class AmazonCnSpider(Validator):
 7 |     name = 'amazoncn'
 8 | 
 9 |     def __init__(self, name = None, **kwargs):
10 |         super(AmazonCnSpider, self).__init__(name, **kwargs)
11 | 
12 |         self.timeout = 5
13 | 
14 |         self.urls = [
15 |             'https://www.amazon.cn/dp/B00ID363S4',
16 |             'https://www.amazon.cn/gp/product/B01BDBJ71W',
17 |             'https://www.amazon.cn/gp/product/B06XBHPZNC',
18 |         ]
19 | 
20 |         self.headers = {
21 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
22 |             'Accept-Encoding': 'gzip, deflate, br',
23 |             'Accept-Language': 'en-US,en;q=0.5',
24 |             'Connection': 'keep-alive',
25 |             'Host': 'www.amazon.cn',
26 |             'Upgrade-Insecure-Requests': '1',
27 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
28 |                           'Firefox/50.0',
29 |         }
30 | 
31 |         self.init()
32 | 
33 |     def success_content_parse(self, response):
34 |         if 'Amazon CAPTCHA' in response.text:
35 |             return False
36 |         return True
37 |         
38 |         
39 |         
40 |         
41 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/anjuke.py:
--------------------------------------------------------------------------------
  1 | # -*- coding=utf-8 -*-
  2 | 
  3 | import datetime
  4 | import json
  5 | import random
  6 | import re
  7 | import time
  8 | import config
  9 | 
 10 | from scrapy import Request
 11 | from scrapy.selector import Selector
 12 | from crawler.spiders.validator.validator import Validator
 13 | 
 14 | 
 15 | class AJKSpider(Validator):
 16 |     name = 'ajk'
 17 | 
 18 |     concurrent_requests = 16
 19 | 
 20 |     def __init__(self, name = None, **kwargs):
 21 |         super(AJKSpider, self).__init__(name, **kwargs)
 22 | 
 23 |         self.region_urls = [
 24 |             'aolinpikegongyuan/',
 25 |             'anzhen/',
 26 |             'baiziwan/',
 27 |             'beiyuan/',
 28 |             'balizhuangb/',
 29 |             'chaoyanggongyuandong/',
 30 |             'chaowaidajie/',
 31 |             'changying/',
 32 |             'chaoyangjichang/',
 33 |             'chaoqing/',
 34 |             'chaoyanggongyuanxi/',
 35 |             'dawanglu/',
 36 |             'dongbaa/',
 37 |             'dougezhuang/',
 38 |             'dongdaqiao/',
 39 |             'dingfuzhuang/',
 40 |             'fatou/',
 41 |             'gaobeidian/',
 42 |             'guanzhuang/',
 43 |             'guomao/',
 44 |             'hepinglibei/',
 45 |             'huaweiqiaocy/',
 46 |             'jinsongdong/',
 47 |             'jianzhanxiang/',
 48 |             'jianxiangqiao/',
 49 |             'jianguomenwai/',
 50 |             'jiuxianqiao/',
 51 |             'jinsongxi/',
 52 |             'laiguangying/',
 53 |             'liufang/',
 54 |             'nanshatan/',
 55 |             'panjiayuan/',
 56 |             'shilihe/',
 57 |             'sanlitun/',
 58 |             'sihui/',
 59 |             'shuangqiaoc/',
 60 |             'shifoying/',
 61 |             'shibalidian/',
 62 |             'shaoyaoju/',
 63 |             'shuangjing/',
 64 |             'sanyuanqiao/',
 65 |             'taiyanggong/',
 66 |             'tuanjiehu/',
 67 |             'wangjingxi/',
 68 |             'wangjingdong/',
 69 |             'xiaohongmen/',
 70 |             'yayuncun/',
 71 |             'chaoyang/',
 72 |             'haidian/',
 73 |             'dongchenga/',
 74 |             'xicheng/',
 75 |             'fengtai/',
 76 |             'tongzhou/',
 77 |             'shijingshan/',
 78 |             'changping/',
 79 |             'daxing/',
 80 |             'shunyi/',
 81 |             'fangshan/',
 82 |             'mentougou/',
 83 |             'miyun/',
 84 |             'huairou/',
 85 |             'pinggua/',
 86 |             'yanqing/',
 87 |             'beijingzhoubiana/',
 88 |             'baishiqiao/',
 89 |             'chedaogou/',
 90 |             'dinghuisi/',
 91 |             'erlizhuang/',
 92 |             'gongzhufenxi/',
 93 |             'ganjiakou/',
 94 |             'gongzhufendong/',
 95 |             'haidianbeibu/',
 96 |             'junbo/',
 97 |             'madians/',
 98 |             'malianwa/',
 99 |             'mudanyuan/',
100 |             'qinghe/',
101 |             'shijicheng/',
102 |             'sijiqing/',
103 |             'suzhouqiao/',
104 |             'shangdi/',
105 |             'shuangyushu/',
106 |             'tiancun/',
107 |             'wenquand/',
108 |             'wanquanhe/',
109 |             'wanshoulu/',
110 |             'wanliu/',
111 |             'wudaokou/',
112 |             'weigongcun/',
113 |             'xiangshandong/',
114 |             'xibeiwang/',
115 |             'xierqi/',
116 |             'xiangshangxi/',
117 |             'xiaoxitian/',
118 |             'xisanqi/',
119 |             'xueyuanlu/',
120 |             'yuquanlu/',
121 |             'yiheyuan/',
122 |             'yuanmingyuan/',
123 |             'zaojunmiao/',
124 |             'zizhuqiao/',
125 |             'zhichunlu/',
126 |             'zhongguancun/',
127 |             'andingmen/',
128 |             'chongwenmens/',
129 |             'chaoyangmennei/',
130 |             'dongzhimenwai/',
131 |             'donghuashis/',
132 |             'dongdan/',
133 |             'dongsia/',
134 |             'dongzhimennei/',
135 |             'dengshikou/',
136 |             'guangqumen/',
137 |             'hepinglianan/',
138 |             'jiaodaokou/',
139 |             'jianguomennei/',
140 |             'longtanhus/',
141 |             'qianmens/',
142 |             'tiantans/',
143 |             'wangfujing/',
144 |             'yongdingmens/',
145 |             'yonghegong/',
146 |             'baizhifangs/',
147 |             'baiyunluxc/',
148 |             'changchunjiexc/',
149 |             'chegongzhuanga/',
150 |             'deshengmen/',
151 |             'fuchengmen/',
152 |             'guanganmenwai/',
153 |             'guanganmennei/',
154 |             'guanyuan/',
155 |             'hepingmen/',
156 |             'jinrongjie/',
157 |             'liupukang/',
158 |             'maliandaos/',
159 |             'shichahai/',
160 |             'tianningshi/',
161 |             'taorantings/',
162 |             'xuanwumens/',
163 |             'xizhimenwai/',
164 |             'xisi/',
165 |             'xizhimen/',
166 |             'xinjiekou/',
167 |             'xidan/',
168 |             'yuetan/',
169 |             'beidadi/',
170 |             'caoqiao/',
171 |             'chengshousi/',
172 |             'caihuying/',
173 |             'dahongmen/',
174 |             'fangzhuang/',
175 |             'heyi/',
176 |             'jiaomen/',
177 |             'kandanqiao/',
178 |             'kejiyuanquft/',
179 |             'lizeqiao/',
180 |             'liuliqiaoxi/',
181 |             'lugouqiao/',
182 |             'liujiayao/',
183 |             'liuliqiaodong/',
184 |             'majiabao/',
185 |             'muxiyuan/',
186 |             'puhuangyu/',
187 |             'qilizhuang/',
188 |             'qingta/',
189 |             'songjiazhuang/',
190 |             'xinfadi/',
191 |             'xiluoyuan/',
192 |             'youanmenwai/',
193 |             'yuquanying/',
194 |             'beiguan/',
195 |             'guoyuan/',
196 |             'jiukeshu/',
197 |             'luyuan/',
198 |             'liyuan/',
199 |             'majuqiao/',
200 |             'qiaozhuang/',
201 |             'tuqiao/',
202 |             'tongzhouquqita/',
203 |             'tongzhoubeiyuan/',
204 |             'wuyihuayuan/',
205 |             'xinhuadajie/',
206 |             'bajiao/',
207 |             'gucheng/',
208 |             'laoshan/',
209 |             'lugu/',
210 |             'pingguoyuan/',
211 |             'shijingshana/',
212 |             'yuquanluxi/',
213 |             'yangzhuang/',
214 |             'baishanzhen/',
215 |             'beiqijia/',
216 |             'changpingquqita/',
217 |             'changpingxiancheng/',
218 |             'dongxiaokouzhen/',
219 |             'huoying/',
220 |             'huilongguan/',
221 |             'longze/',
222 |             'lishuiqiao/',
223 |             'nanshao/',
224 |             'nankou/',
225 |             'shahea/',
226 |             'tiantongyuan/',
227 |             'xingshouzhen/',
228 |             'xiaotangshanbei/',
229 |             'xiaotangshannan/',
230 |             'zhuxinzhuang/',
231 |             'daxingquqita/',
232 |             'guanyinsi/',
233 |             'gaomidian/',
234 |             'huangcun/',
235 |             'jiugong/',
236 |             'luchengxiang/',
237 |             'panggezhuang/',
238 |             'qingyundianzhen/',
239 |             'tiangongyuannan/',
240 |             'tiangongyuanbei/',
241 |             'xihongmen/',
242 |             'yinghaizhen/',
243 |             'yizhuang/',
244 |             'zaoyuans/',
245 |             'houshayu/',
246 |             'jichangfujin/',
247 |             'liqiao/',
248 |             'mapo/',
249 |             'shunyiquqita/',
250 |             'shunyicheng/',
251 |             'tianzhu/',
252 |             'yangzhen/',
253 |             'zhongyangbieshuqu/',
254 |             'changyang/',
255 |             'chengguanbj/',
256 |             'doudian/',
257 |             'fangshanquqita/',
258 |             'guandaozhen/',
259 |             'hancunhe/',
260 |             'liangxiang/',
261 |             'liulihe/',
262 |             'yancun/',
263 |             'yanshan/',
264 |             'binhexiqu/',
265 |             'chengzi/',
266 |             'dayu/',
267 |             'fengcun/',
268 |             'mentougouquqita/',
269 |             'shimenying/',
270 |             'yongdingzhen/',
271 |             'badaling/',
272 |             'dayushu/',
273 |             'kangzhuang/',
274 |             'yanqingquqita/',
275 |             'yanqingchengqu/',
276 |             'baodinga/',
277 |             'langfanga/',
278 |             'qinhuangdaoa/',
279 |             'tangshang/',
280 |             'weihaia/',
281 |             'yantaia/',
282 |             'yanjiao/',
283 |             'zhangjiakou/',
284 |         ]
285 | 
286 |         self.price_urls = [
287 |             'zj5332/',
288 |             'zj297/',
289 |             'zj298/',
290 |             'zj299/',
291 |             'zj300/',
292 |             'zj301/',
293 |             'zj33/',
294 |             'zj5333/',
295 |             'zj5334/',
296 |             'zj5335/',
297 |             'zj5336/',
298 |         ]
299 | 
300 |         self.init()
301 | 
302 |     def start_requests(self):
303 |         count = self.sql.get_proxy_count(self.name)
304 |         count_free = self.sql.get_proxy_count(config.httpbin_table)
305 | 
306 |         ids = self.sql.get_proxy_ids(self.name)
307 |         ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
308 | 
309 |         for i in range(0, count + count_free):
310 |             table = self.name if (i < count) else config.httpbin_table
311 |             id = ids[i] if i < count else ids_httpbin[i - len(ids)]
312 | 
313 |             proxy = self.sql.get_proxy_with_id(table, id)
314 |             if proxy == None:
315 |                 continue
316 | 
317 |             full_url = 'https://bj.zu.anjuke.com/fangyuan/{region}p{page}-{price}'.format(
318 |                 region = random.choice(self.region_urls), price = random.choice(self.price_urls), page = 1)
319 |             cur_time = time.time()
320 |             yield Request(
321 |                 url = full_url,
322 |                 headers = {
323 |                     'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
324 |                 },
325 |                 dont_filter = True,
326 |                 meta = {
327 |                     'cur_time': cur_time,
328 |                     'download_timeout': self.timeout,
329 |                     'proxy_info': proxy,
330 |                     'table': table,
331 |                     'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
332 |                 },
333 |                 callback = self.success_parse,
334 |                 errback = self.error_parse,
335 |             )
336 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/assetstore.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | import json
 4 | import time
 5 | import config
 6 | 
 7 | from scrapy.http import Request
 8 | from .validator import Validator
 9 | 
10 | 
11 | class AssetStoreSpider(Validator):
12 |     name = 'assetstore'
13 | 
14 |     def __init__(self, *a, **kwargs):
15 |         super(AssetStoreSpider, self).__init__(*a, **kwargs)
16 | 
17 |         self.timeout = 10
18 | 
19 |         self.init()
20 | 
21 |     def start_requests(self):
22 |         url = 'https://www.assetstore.unity3d.com/login'
23 |         yield Request(
24 |                 url = url,
25 |                 headers = {
26 |                     'Accept': 'application/json',
27 |                     'Accept-Encoding': 'gzip, deflate, br',
28 |                     'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
29 |                     'Connection': 'keep-alive',
30 |                     'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
31 |                     'Host': 'www.assetstore.unity3d.com',
32 |                     'Referer': 'https://www.assetstore.unity3d.com/en/',
33 |                     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
34 |                                   'Firefox/50.0',
35 |                     'X-Kharma-Version': '0',
36 |                     'X-Requested-With': 'UnityAssetStore',
37 |                     'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
38 |                 },
39 |                 meta = {
40 |                 },
41 |                 dont_filter = True,
42 |                 callback = self.get_unity_version,
43 |                 errback = self.error_parse,
44 |         )
45 | 
46 |     def get_unity_version(self, response):
47 |         content = json.loads(response.body)
48 |         self.log('unity content:%s' % response.body)
49 | 
50 |         unity_version = content.get('kharma_version', '')
51 | 
52 |         headers = {
53 |             'Accept': '*/*',
54 |             'Accept-Encoding': 'gzip, deflate, br',
55 |             'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
56 |             'Connection': 'keep-alive',
57 |             'Host': 'www.assetstore.unity3d.com',
58 |             'Referer': 'https://www.assetstore.unity3d.com/en/',
59 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
60 |             'X-Kharma-Version': unity_version,
61 |             'X-Requested-With': 'UnityAssetStore',
62 |             'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
63 |         }
64 | 
65 |         count = self.sql.get_proxy_count(self.name)
66 |         count_free = self.sql.get_proxy_count(config.httpbin_table)
67 | 
68 |         ids = self.sql.get_proxy_ids(self.name)
69 |         ids_free = self.sql.get_proxy_ids(config.httpbin_table)
70 | 
71 |         for i in range(0, count + count_free):
72 |             table = self.name if (i < count) else config.httpbin_table
73 |             id = ids[i] if i < count else ids_free[i - len(ids)]
74 | 
75 |             proxy = self.sql.get_proxy_with_id(table, id)
76 |             if proxy == None:
77 |                 continue
78 | 
79 |             url = 'https://www.assetstore.unity3d.com/api/en-US/content/overview/' + '368' + '.json'
80 |             cur_time = time.time()
81 |             yield Request(
82 |                     url = url,
83 |                     headers = headers,
84 |                     meta = {
85 |                         'cur_time': cur_time,
86 |                         'download_timeout': self.timeout,
87 |                         'proxy_info': proxy,
88 |                         'table': table,
89 |                         'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
90 |                     },
91 |                     dont_filter = True,
92 |                     callback = self.success_parse,
93 |                     errback = self.error_parse,
94 |             )
95 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/baidu.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class BaiduSpider(Validator):
 7 |     name = 'baidu'
 8 | 
 9 |     def __init__(self, name = None, **kwargs):
10 |         super(BaiduSpider, self).__init__(name, **kwargs)
11 | 
12 |         self.urls = [
13 |             'https://www.baidu.com/'
14 |         ]
15 | 
16 |         self.headers = {
17 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
18 |             'Accept-Encoding': 'gzip, deflate, br',
19 |             'Accept-Language': 'en-US,en;q=0.5',
20 |             'Cache-Control': 'max-age=0',
21 |             'Connection': 'keep-alive',
22 |             'Host': 'www.baidu.com',
23 |             'Upgrade-Insecure-Requests': '1',
24 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
25 |                           'Firefox/50.0',
26 |         }
27 | 
28 |         self.init()
29 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/bbs.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class BBSSpider(Validator):
 7 |     name = 'bbs'
 8 |     concurrent_requests = 8
 9 | 
10 |     def __init__(self, name = None, **kwargs):
11 |         super(BBSSpider, self).__init__(name, **kwargs)
12 | 
13 |         self.urls = [
14 |             'http://www.autohome.com.cn/beijing/',
15 |             'http://club.autohome.com.cn/bbs/thread-c-2098-64053713-1.html',
16 |             'http://club.autohome.com.cn/bbs/thread-c-2098-61435076-1.html',
17 |             'http://club.autohome.com.cn/bbs/threadqa-c-4034-63834038-1.html',
18 |             'http://club.autohome.com.cn/bbs/threadqa-c-4034-63083758-1.html',
19 |             'http://club.autohome.com.cn/bbs/threadqa-c-4044-64310067-1.html',
20 |             'http://club.autohome.com.cn/bbs/threadqa-c-4044-64328047-1.html',
21 |             'http://club.autohome.com.cn/bbs/thread-c-4044-63233315-1.html',
22 |             'http://club.autohome.com.cn/bbs/threadqa-c-4044-62349867-1.html',
23 |             'http://club.autohome.com.cn/bbs/thread-c-4034-63846295-1.html',
24 |         ]
25 | 
26 |         self.headers = {
27 |             'Host': 'club.autohome.com.cn',
28 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
29 |                           'Firefox/50.0',
30 |         }
31 | 
32 |         self.is_record_web_page = False
33 |         self.init()
34 | 
35 |     def success_content_parse(self, response):
36 |         if 'conmain' in response.text:
37 |             return True
38 |         return False
39 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/boss.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class BossSpider(Validator):
 7 |     name = 'boss'
 8 |     concurrent_requests = 8
 9 | 
10 |     def __init__(self, name = None, **kwargs):
11 |         super(BossSpider, self).__init__(name, **kwargs)
12 | 
13 |         self.urls = [
14 |             'https://www.zhipin.com/c101010100/h_101010100/?query=java&page=1&ka=page-1'
15 |         ]
16 | 
17 |         self.headers = {
18 |             # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 |             # 'Accept-Encoding': 'gzip, deflate, br',
20 |             # 'Accept-Language': 'en-US,en;q=0.5',
21 |             # 'Cache-Control': 'max-age=0',
22 |             # 'Connection': 'keep-alive',
23 |             # 'Upgrade-Insecure-Requests': '1',
24 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
25 |                           'Firefox/50.0',
26 |         }
27 | 
28 |         self.is_record_web_page = False
29 |         self.init()
30 | 
31 |     def success_content_parse(self, response):
32 |         if '<!DOCTYPE html>' in response.text:
33 |             return True
34 |         return False


--------------------------------------------------------------------------------
/crawler/spiders/validator/douban.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class DoubanSpider(Validator):
 7 |     name = 'douban'
 8 | 
 9 |     def __init__(self, name = None, **kwargs):
10 |         super(DoubanSpider, self).__init__(name, **kwargs)
11 | 
12 |         self.timeout = 5
13 | 
14 |         self.urls = [
15 |             'https://movie.douban.com/subject/3434070/?from=subject-page'
16 |         ]
17 | 
18 |         self.headers = {
19 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
20 |             'Accept-Encoding': 'gzip, deflate, br',
21 |             'Accept-Language': 'en-US,en;q=0.5',
22 |             'Connection': 'keep-alive',
23 |             'Host': 'movie.douban.com',
24 |             'Upgrade-Insecure-Requests': '1',
25 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
26 |                           'Firefox/50.0',
27 |         }
28 | 
29 |         self.init()
30 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/gather.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class GatherSpider(Validator):
 7 |     name = 'gather'
 8 | 
 9 |     def __init__(self, name = None, **kwargs):
10 |         super(GatherSpider, self).__init__(name, **kwargs)
11 | 
12 |         self.timeout = 10
13 |         self.urls = [
14 |             'http://gatherproxy.com/proxylist/anonymity/?t=Anonymous',
15 |             'http://gatherproxy.com/proxylist/country/?c=China'
16 |         ]
17 | 
18 |         self.headers = {
19 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
20 |             'Accept-Encoding': 'gzip, deflate',
21 |             'Accept-Language': 'en-US,en;q=0.5',
22 |             'Connection': 'keep-alive',
23 |             'Host': 'gatherproxy.com',
24 |             'Upgrade-Insecure-Requests': '1',
25 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
26 |                           'Firefox/50.0',
27 |         }
28 | 
29 |         self.init()
30 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/httpbin.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import json
  4 | import time
  5 | import requests
  6 | import config
  7 | 
  8 | from scrapy import Request
  9 | from .validator import Validator
 10 | 
 11 | 
 12 | class HttpBinSpider(Validator):
 13 |     name = 'httpbin'
 14 |     concurrent_requests = 16
 15 | 
 16 |     def __init__(self, name=None, **kwargs):
 17 |         super(HttpBinSpider, self).__init__(name, **kwargs)
 18 |         self.timeout = 20
 19 |         self.urls = [
 20 |             'http://httpbin.org/get?show_env=1',
 21 |             'https://httpbin.org/get?show_env=1',
 22 |         ]
 23 |         self.headers = {
 24 |             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
 25 |             "Accept-Encoding": "gzip, deflate, br",
 26 |             "Accept-Language": "en-US,en;q=0.5",
 27 |             "Host": "httpbin.org",
 28 |             "Upgrade-Insecure-Requests": "1",
 29 |             "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0"
 30 |         }
 31 | 
 32 |         self.origin_ip = ''
 33 | 
 34 |         self.init()
 35 | 
 36 |     def init(self):
 37 |         super(HttpBinSpider, self).init()
 38 | 
 39 |         r = requests.get(url=self.urls[0], timeout=20)
 40 |         data = json.loads(r.text)
 41 |         self.origin_ip = data.get('origin', '')
 42 |         self.log('origin ip:%s' % self.origin_ip)
 43 | 
 44 |     def start_requests(self):
 45 |         count = self.sql.get_proxy_count(self.name)
 46 |         count_free = self.sql.get_proxy_count(config.free_ipproxy_table)
 47 | 
 48 |         ids = self.sql.get_proxy_ids(self.name)
 49 |         ids_free = self.sql.get_proxy_ids(config.free_ipproxy_table)
 50 | 
 51 |         for i in range(0, count + count_free):
 52 |             table = self.name if (i < count) else config.free_ipproxy_table
 53 |             id = ids[i] if i < count else ids_free[i - len(ids)]
 54 | 
 55 |             proxy = self.sql.get_proxy_with_id(table, id)
 56 |             if proxy == None:
 57 |                 continue
 58 | 
 59 |             for url in self.urls:
 60 |                 https = 'yes' if 'https' in url else 'no'
 61 | 
 62 |                 yield Request(
 63 |                     url=url,
 64 |                     headers=self.headers,
 65 |                     dont_filter=True,
 66 |                     priority=0 if https == 'yes' else 10,
 67 |                     meta={
 68 |                         'cur_time': time.time(),
 69 |                         'download_timeout': self.timeout,
 70 |                         'proxy_info': proxy,
 71 |                         'table': table,
 72 |                         'https': https,
 73 |                         'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
 74 |                         'vali_count': proxy.vali_count,
 75 |                     },
 76 |                     callback=self.success_parse,
 77 |                     errback=self.error_parse,
 78 |                 )
 79 | 
 80 |     def success_parse(self, response):
 81 |         proxy = response.meta.get('proxy_info')
 82 |         table = response.meta.get('table')
 83 |         proxy.https = response.meta.get('https')
 84 | 
 85 |         self.save_page(proxy.ip, response.body)
 86 | 
 87 |         if self.success_content_parse(response):
 88 |             proxy.speed = time.time() - response.meta.get('cur_time')
 89 |             proxy.vali_count += 1
 90 |             self.log('proxy_info:%s' % (str(proxy)))
 91 | 
 92 |             if proxy.https == 'no':
 93 |                 data = json.loads(response.body)
 94 |                 origin = data.get('origin')
 95 |                 headers = data.get('headers')
 96 |                 x_forwarded_for = headers.get('X-Forwarded-For', None)
 97 |                 x_real_ip = headers.get('X-Real-Ip', None)
 98 |                 via = headers.get('Via', None)
 99 | 
100 |                 if self.origin_ip in origin:
101 |                     proxy.anonymity = 3
102 |                 elif via is not None:
103 |                     proxy.anonymity = 2
104 |                 elif x_forwarded_for is not None and x_real_ip is not None:
105 |                     proxy.anonymity = 1
106 | 
107 |                 if table == self.name:
108 |                     if proxy.speed > self.timeout:
109 |                         self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
110 |                     else:
111 |                         self.sql.update_proxy(table_name=table, proxy=proxy)
112 |                 else:
113 |                     if proxy.speed < self.timeout:
114 |                         self.sql.insert_proxy(table_name=self.name, proxy=proxy)
115 |             else:
116 |                 self.sql.update_proxy(table_name=table, proxy=proxy)
117 | 
118 |         self.sql.commit()
119 | 
120 |     def error_parse(self, failure):
121 |         request = failure.request
122 |         self.log('error_parse value:%s url:%s meta:%s' % (failure.value, request.url, request.meta))
123 |         https = request.meta.get('https')
124 |         if https == 'no':
125 |             table = request.meta.get('table')
126 |             proxy = request.meta.get('proxy_info')
127 | 
128 |             if table == self.name:
129 |                 self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
130 |             else:
131 |                 # TODO... 如果 ip 验证失败应该针对特定的错误类型，进行处理
132 |                 pass
133 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/jd.py:
--------------------------------------------------------------------------------
  1 | #-*- coding: utf-8 -*-
  2 | 
  3 | import random
  4 | import time
  5 | import re
  6 | import config
  7 | 
  8 | from scrapy import Request
  9 | from .validator import Validator
 10 | 
 11 | 
 12 | class JDSpider(Validator):
 13 |     name = 'jd'
 14 | 
 15 |     def __init__(self, name = None, **kwargs):
 16 |         super(JDSpider, self).__init__(name, **kwargs)
 17 | 
 18 |         self.urls = [
 19 |             'https://item.jd.com/11478178241.html',
 20 |             'https://item.jd.com/4142680.html',
 21 |             'https://item.jd.com/3133859.html',
 22 |             'https://item.jd.com/11349957411.html',
 23 |             'https://item.jd.com/1231104.html',
 24 |             'https://item.jd.com/11290644320.html',
 25 |             'https://item.jd.com/3553539.html',
 26 |             'https://item.jd.com/3553567.html',
 27 |             'https://item.jd.com/4640524.html',
 28 |             'https://item.jd.com/3652063.html',
 29 |             'https://item.jd.com/2967929.html',
 30 |             'https://item.jd.com/3367822.html',
 31 |             'https://item.jd.com/1217500.html',
 32 |         ]
 33 | 
 34 |         self.headers = {
 35 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 36 |             'Accept-Encoding': 'gzip, deflate, br',
 37 |             'Accept-Language': 'en-US,en;q=0.5',
 38 |             'Connection': 'keep-alive',
 39 |             'Host': 'item.jd.com',
 40 |             'Upgrade-Insecure-Requests': '1',
 41 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0',
 42 |         }
 43 | 
 44 |         self.is_record_web_page = False
 45 |         self.init()
 46 | 
 47 |     def success_content_parse(self, response):
 48 |         if 'comments' in response.text:
 49 |             return True
 50 |         return False
 51 | 
 52 |     def start_requests(self):
 53 |         count = self.sql.get_proxy_count(self.name)
 54 |         count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
 55 | 
 56 |         ids = self.sql.get_proxy_ids(self.name)
 57 |         ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
 58 | 
 59 |         for i in range(0, count + count_httpbin):
 60 |             table = self.name if (i < count) else config.httpbin_table
 61 |             id = ids[i] if i < count else ids_httpbin[i - len(ids)]
 62 | 
 63 |             proxy = self.sql.get_proxy_with_id(table, id)
 64 |             if proxy == None:
 65 |                 continue
 66 | 
 67 |             url = random.choice(self.urls)
 68 |             pattern = re.compile('\d+', re.S)
 69 |             product_id = re.search(pattern, url).group()
 70 | 
 71 |             cur_time = time.time()
 72 |             yield Request(
 73 |                     url = url,
 74 |                     headers = self.headers,
 75 |                     meta = {
 76 |                         'cur_time': cur_time,
 77 |                         'download_timeout': self.timeout,
 78 |                         'proxy_info': proxy,
 79 |                         'table': table,
 80 |                         'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
 81 |                         'product_id': product_id,
 82 |                     },
 83 |                     dont_filter = True,
 84 |                     callback = self.get_comment_count,
 85 |                     errback = self.error_parse,
 86 |             )
 87 | 
 88 |     def get_comment_count(self, response):
 89 |         name = response.xpath('//img[@id="spec-img"]/@alt').extract_first()
 90 |         self.log('name:%s time:%s' % (name, time.time() - response.meta.get('cur_time')))
 91 | 
 92 |         pattern = re.compile('commentVersion:\'(\d+)\'', re.S)
 93 |         comment_version = re.search(pattern, response.text).group(1)
 94 | 
 95 |         # sort type 5:推荐排序 6:时间排序
 96 |         url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv' \
 97 |               '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page=0&pageSize=10' \
 98 |               '&isShadowSku=0'. \
 99 |             format(product_id = response.meta.get('product_id'), comment_version = comment_version, sort_type = '6')
100 | 
101 |         cur_time = time.time()
102 |         yield Request(
103 |                 url = url,
104 |                 headers = {
105 |                     'Accept': '*/*',
106 |                     'Accept-Encoding': 'gzip, deflate, br',
107 |                     'Accept-Language': 'en-US,en;q=0.5',
108 |                     'Connection': 'keep-alive',
109 |                     'Host': 'club.jd.com',
110 |                     'Referer': 'https://item.jd.com/%s.html' % response.meta.get('product_id'),
111 |                     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
112 |                                   'Firefox/52.0',
113 |                 },
114 |                 method = 'GET',
115 |                 meta = {
116 |                     'proxy': response.meta.get('proxy'),
117 |                     'cur_time': cur_time,
118 |                     'download_timeout': self.timeout,
119 |                     'proxy_info': response.meta.get('proxy_info'),
120 |                     'table': response.meta.get('table'),
121 |                 },
122 |                 dont_filter = True,
123 |                 callback = self.success_parse,
124 |                 errback = self.error_parse
125 |         )
126 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/lagou.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | import time
 4 | import config
 5 | import utils
 6 | 
 7 | from .validator import Validator
 8 | from scrapy.http import FormRequest
 9 | 
10 | 
11 | class LagouSpider(Validator):
12 |     name = 'lagou'
13 |     concurrent_requests = 8
14 | 
15 |     def __init__(self, name = None, **kwargs):
16 |         super(LagouSpider, self).__init__(name, **kwargs)
17 | 
18 |         self.urls = [
19 |             'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
20 |         ]
21 | 
22 |         self.headers = {
23 |             # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
24 |             # 'Accept-Encoding': 'gzip, deflate, br',
25 |             # 'Accept-Language': 'en-US,en;q=0.5',
26 |             # 'Cache-Control': 'max-age=0',
27 |             # 'Connection': 'keep-alive',
28 |             # 'Upgrade-Insecure-Requests': '1',
29 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
30 |                           'Firefox/50.0',
31 |         }
32 | 
33 |         self.is_record_web_page = True
34 |         self.init()
35 | 
36 |     def success_content_parse(self, response):
37 |         if 'success' in response.text:
38 |             return True
39 |         return False
40 | 
41 |     def start_requests(self):
42 |         count = self.sql.get_proxy_count(self.name)
43 |         count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
44 | 
45 |         ids = self.sql.get_proxy_ids(self.name)
46 |         ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
47 | 
48 |         for i in range(0, count + count_httpbin):
49 |             table = self.name if (i < count) else config.httpbin_table
50 |             id = ids[i] if i < count else ids_httpbin[i - len(ids)]
51 | 
52 |             proxy = self.sql.get_proxy_with_id(table, id)
53 |             if proxy == None:
54 |                 continue
55 | 
56 |             for url in self.urls:
57 |                 cur_time = time.time()
58 |                 yield FormRequest(
59 |                         url = url,
60 |                         headers = self.headers,
61 |                         method = 'POST',
62 |                         meta = {
63 |                             'cur_time': cur_time,
64 |                             'download_timeout': self.timeout,
65 |                             'proxy_info': proxy,
66 |                             'table': table,
67 |                             'id': proxy.id,
68 |                             'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
69 |                             'vali_count': proxy.vali_count,
70 |                         },
71 |                         cookies = {
72 |                             'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
73 |                             '_ga': 'GA1.2.40497390.1488937014',
74 |                             'TG-TRACK-CODE': 'search_code',
75 |                             'index_location_city': '%E5%8C%97%E4%BA%AC',
76 |                             'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
77 |                             'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
78 |                             'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
79 |                             'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
80 |                             'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
81 |                             'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
82 |                             'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
83 |                         },
84 |                         formdata = {
85 |                             'first': 'true',
86 |                             'kd': 'ios',
87 |                             'pn': '1',
88 |                         },
89 |                         dont_filter = True,
90 |                         callback = self.success_parse,
91 |                         errback = self.error_parse,
92 |                 )
93 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/liepin.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class LiepinSpider(Validator):
 7 |     name = 'liepin'
 8 |     concurrent_requests = 8
 9 | 
10 |     def __init__(self, name = None, **kwargs):
11 |         super(LiepinSpider, self).__init__(name, **kwargs)
12 | 
13 |         self.urls = [
14 |             'https://www.liepin.com/zhaopin/?pubTime=&ckid=17c370b0a0111aa5&fromSearchBtn=2&compkind' \
15 |             '=&isAnalysis=&init=-1&searchType=1&dqs=%s&industryType=&jobKind=&sortFlag=15&industries=&salary'
16 |             '=&compscale=&clean_condition=&key=%s&headckid=49963e122c30b827&curPage=%s' % ('010', 'ios', '1')
17 |         ]
18 | 
19 |         self.headers = {
20 |             # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
21 |             # 'Accept-Encoding': 'gzip, deflate, br',
22 |             # 'Accept-Language': 'en-US,en;q=0.5',
23 |             # 'Cache-Control': 'max-age=0',
24 |             # 'Connection': 'keep-alive',
25 |             # 'Upgrade-Insecure-Requests': '1',
26 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
27 |                           'Firefox/50.0',
28 |         }
29 | 
30 |         self.is_record_web_page = False
31 |         self.init()
32 | 
33 |     def success_content_parse(self, response):
34 |         if 'sojob-list' in response.text:
35 |             return True
36 |         return False
37 | 
38 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/steam.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class SteamSpider(Validator):
 7 |     name = 'steam'
 8 | 
 9 |     def __init__(self, name = None, **kwargs):
10 |         super(SteamSpider, self).__init__(name, **kwargs)
11 | 
12 |         self.timeout = 10
13 | 
14 |         self.urls = [
15 |             'http://store.steampowered.com/app/602580/'
16 |         ]
17 | 
18 |         self.headers = {
19 |             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
20 |             'Accept-Encoding': 'gzip, deflate',
21 |             'Accept-Language': 'en-US,en;q=0.5',
22 |             'Connection': 'keep-alive',
23 |             'Host': 'store.steampowered.com',
24 |             'Upgrade-Insecure-Requests': '1',
25 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
26 |         }
27 | 
28 |         self.is_record_web_page = False
29 | 
30 |         self.init()
31 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/validator.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | import random
  3 | import time
  4 | import datetime
  5 | import utils
  6 | import config
  7 | 
  8 | from scrapy import Request
  9 | from scrapy.spiders import Spider
 10 | from sql import SqlManager
 11 | 
 12 | 
 13 | class Validator(Spider):
 14 |     name = 'base'
 15 |     concurrent_requests = 16
 16 |     retry_enabled = False
 17 | 
 18 |     def __init__(self, name = None, **kwargs):
 19 |         super(Validator, self).__init__(name, **kwargs)
 20 | 
 21 |         self.urls = []
 22 |         self.headers = None
 23 |         self.timeout = 10
 24 |         self.success_status = [200]
 25 |         self.is_record_web_page = False
 26 | 
 27 |         self.sql = SqlManager()
 28 | 
 29 |     def init(self):
 30 |         self.dir_log = 'log/validator/%s' % self.name
 31 |         utils.make_dir(self.dir_log)
 32 | 
 33 |         self.sql.init_proxy_table(self.name)
 34 | 
 35 |     @classmethod
 36 |     def update_settings(cls, settings):
 37 |         settings.setdict(cls.custom_settings or {
 38 |             'CONCURRENT_REQUESTS': cls.concurrent_requests,
 39 |             'RETRY_ENABLED': cls.retry_enabled,
 40 |         },
 41 |                          priority = 'spider')
 42 | 
 43 |     def start_requests(self):
 44 |         count = self.sql.get_proxy_count(self.name)
 45 |         count_free = self.sql.get_proxy_count(config.httpbin_table)
 46 | 
 47 |         ids = self.sql.get_proxy_ids(self.name)
 48 |         ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
 49 | 
 50 |         for i in range(0, count + count_free):
 51 |             table = self.name if (i < count) else config.httpbin_table
 52 |             id = ids[i] if i < count else ids_httpbin[i - len(ids)]
 53 | 
 54 |             proxy = self.sql.get_proxy_with_id(table, id)
 55 |             if proxy == None:
 56 |                 continue
 57 | 
 58 |             url = random.choice(self.urls)
 59 |             cur_time = time.time()
 60 |             yield Request(
 61 |                 url = url,
 62 |                 headers = self.headers,
 63 |                 meta = {
 64 |                     'cur_time': cur_time,
 65 |                     'download_timeout': self.timeout,
 66 |                     'proxy_info': proxy,
 67 |                     'table': table,
 68 |                     'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
 69 |                 },
 70 |                 dont_filter = True,
 71 |                 callback = self.success_parse,
 72 |                 errback = self.error_parse,
 73 |             )
 74 | 
 75 |     def success_parse(self, response):
 76 |         proxy = response.meta.get('proxy_info')
 77 |         table = response.meta.get('table')
 78 | 
 79 |         self.save_page(proxy.ip, response.body)
 80 |         self.log('success_parse speed:%s meta:%s' % (time.time() - response.meta.get('cur_time'), response.meta))
 81 | 
 82 |         proxy.vali_count += 1
 83 |         proxy.speed = time.time() - response.meta.get('cur_time')
 84 |         if self.success_content_parse(response):
 85 |             if table == self.name:
 86 |                 if proxy.speed > self.timeout:
 87 |                     self.sql.del_proxy_with_id(table, proxy.id)
 88 |                 else:
 89 |                     self.sql.update_proxy(table, proxy)
 90 |             else:
 91 |                 if proxy.speed < self.timeout:
 92 |                     self.sql.insert_proxy(table_name = self.name, proxy = proxy)
 93 |         else:
 94 |             if table == self.name:
 95 |                 self.sql.del_proxy_with_id(table_name = table, id = proxy.id)
 96 | 
 97 |         self.sql.commit()
 98 | 
 99 |     def success_content_parse(self, response):
100 |         if response.status not in self.success_status:
101 |             return False
102 |         return True
103 | 
104 |     def error_parse(self, failure):
105 |         request = failure.request
106 |         self.log('error_parse value:%s url:%s meta:%s' % (failure.value, request.url, request.meta))
107 | 
108 |         proxy = failure.request.meta.get('proxy_info')
109 |         table = failure.request.meta.get('table')
110 | 
111 |         if table == self.name:
112 |             self.sql.del_proxy_with_id(table_name = table, id = proxy.id)
113 |         else:
114 |             # TODO... 如果 ip 验证失败应该针对特定的错误类型，进行处理
115 |             pass
116 | 
117 |             #
118 |             # request = failure.request.meta
119 |             # utils.log('request meta:%s' % str(request))
120 |             #
121 |             # # log all errback failures,
122 |             # # in case you want to do something special for some errors,
123 |             # # you may need the failure's type
124 |             # self.logger.error(repr(failure))
125 |             #
126 |             # #if isinstance(failure.value, HttpError):
127 |             # if failure.check(HttpError):
128 |             #     # you can get the response
129 |             #     response = failure.value.response
130 |             #     self.logger.error('HttpError on %s', response.url)
131 |             #
132 |             # #elif isinstance(failure.value, DNSLookupError):
133 |             # elif failure.check(DNSLookupError):
134 |             #     # this is the original request
135 |             #     request = failure.request
136 |             #     self.logger.error('DNSLookupError on %s', request.url)
137 |             #
138 |             # #elif isinstance(failure.value, TimeoutError):
139 |             # elif failure.check(TimeoutError):
140 |             #     request = failure.request
141 |             #     self.logger.error('TimeoutError on url:%s', request.url)
142 | 
143 |     def save_page(self, ip, data):
144 |         filename = '{time} {ip}'.format(time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'), ip = ip)
145 | 
146 |         if self.is_record_web_page:
147 |             with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f:
148 |                 f.write(data)
149 |                 f.close()
150 | 
151 |     def close(spider, reason):
152 |         spider.sql.commit()
153 | 


--------------------------------------------------------------------------------
/crawler/spiders/validator/zhilian.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | from .validator import Validator
 4 | 
 5 | 
 6 | class ZhiLianSpider(Validator):
 7 |     name = 'zhilian'
 8 |     concurrent_requests = 8
 9 | 
10 |     def __init__(self, name = None, **kwargs):
11 |         super(ZhiLianSpider, self).__init__(name, **kwargs)
12 | 
13 |         self.urls = [
14 |             'http://www.zhaopin.com/'
15 |         ]
16 | 
17 |         self.headers = {
18 |             # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 |             # 'Accept-Encoding': 'gzip, deflate, br',
20 |             # 'Accept-Language': 'en-US,en;q=0.5',
21 |             # 'Cache-Control': 'max-age=0',
22 |             # 'Connection': 'keep-alive',
23 |             # 'Upgrade-Insecure-Requests': '1',
24 |             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
25 |         }
26 | 
27 |         self.is_record_web_page = False
28 |         self.init()
29 | 
30 |     def success_content_parse(self, response):
31 |         if '<!DOCTYPE html>' in response.text:
32 |             return True
33 |         return False
34 | 


--------------------------------------------------------------------------------
/db.sql:
--------------------------------------------------------------------------------
  1 | -- MySQL dump 10.13  Distrib 5.5.58, for Linux (x86_64)
  2 | --
  3 | -- Host: localhost    Database: ipproxy
  4 | -- ------------------------------------------------------
  5 | -- Server version	5.5.58
  6 | 
  7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
  8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
  9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
 10 | /*!40101 SET NAMES utf8 */;
 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
 12 | /*!40103 SET TIME_ZONE='+00:00' */;
 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
 17 | 
 18 | --
 19 | -- Table structure for table `free_ipproxy`
 20 | --
 21 | 
 22 | DROP TABLE IF EXISTS `free_ipproxy`;
 23 | /*!40101 SET @saved_cs_client     = @@character_set_client */;
 24 | /*!40101 SET character_set_client = utf8 */;
 25 | CREATE TABLE `free_ipproxy` (
 26 |   `id` int(8) NOT NULL AUTO_INCREMENT,
 27 |   `ip` char(25) NOT NULL,
 28 |   `port` int(4) NOT NULL,
 29 |   `country` text,
 30 |   `anonymity` int(2) DEFAULT NULL,
 31 |   `https` char(4) DEFAULT NULL,
 32 |   `speed` float DEFAULT NULL,
 33 |   `source` char(20) DEFAULT NULL,
 34 |   `save_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
 35 |   `vali_count` int(5) DEFAULT '0',
 36 |   PRIMARY KEY (`id`),
 37 |   UNIQUE KEY `proxy_field` (`ip`,`port`)
 38 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 39 | /*!40101 SET character_set_client = @saved_cs_client */;
 40 | 
 41 | --
 42 | -- Dumping data for table `free_ipproxy`
 43 | --
 44 | 
 45 | LOCK TABLES `free_ipproxy` WRITE;
 46 | /*!40000 ALTER TABLE `free_ipproxy` DISABLE KEYS */;
 47 | /*!40000 ALTER TABLE `free_ipproxy` ENABLE KEYS */;
 48 | UNLOCK TABLES;
 49 | 
 50 | --
 51 | -- Table structure for table `httpbin`
 52 | --
 53 | 
 54 | DROP TABLE IF EXISTS `httpbin`;
 55 | /*!40101 SET @saved_cs_client     = @@character_set_client */;
 56 | /*!40101 SET character_set_client = utf8 */;
 57 | CREATE TABLE `httpbin` (
 58 |   `id` int(8) NOT NULL AUTO_INCREMENT,
 59 |   `ip` char(25) NOT NULL,
 60 |   `port` int(4) NOT NULL,
 61 |   `country` text,
 62 |   `anonymity` int(2) DEFAULT NULL,
 63 |   `https` char(4) DEFAULT NULL,
 64 |   `speed` float DEFAULT NULL,
 65 |   `source` char(20) DEFAULT NULL,
 66 |   `save_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
 67 |   `vali_count` int(5) DEFAULT '0',
 68 |   PRIMARY KEY (`id`),
 69 |   UNIQUE KEY `proxy_field` (`ip`,`port`)
 70 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
 71 | /*!40101 SET character_set_client = @saved_cs_client */;
 72 | 
 73 | --
 74 | -- Dumping data for table `httpbin`
 75 | --
 76 | 
 77 | LOCK TABLES `httpbin` WRITE;
 78 | /*!40000 ALTER TABLE `httpbin` DISABLE KEYS */;
 79 | /*!40000 ALTER TABLE `httpbin` ENABLE KEYS */;
 80 | UNLOCK TABLES;
 81 | 
 82 | --
 83 | -- Dumping routines for database 'ipproxy'
 84 | --
 85 | /*!50003 DROP PROCEDURE IF EXISTS `drop_iptables` */;
 86 | /*!50003 SET @saved_cs_client      = @@character_set_client */ ;
 87 | /*!50003 SET @saved_cs_results     = @@character_set_results */ ;
 88 | /*!50003 SET @saved_col_connection = @@collation_connection */ ;
 89 | /*!50003 SET character_set_client  = utf8 */ ;
 90 | /*!50003 SET character_set_results = utf8 */ ;
 91 | /*!50003 SET collation_connection  = utf8_general_ci */ ;
 92 | /*!50003 SET @saved_sql_mode       = @@sql_mode */ ;
 93 | /*!50003 SET sql_mode              = '' */ ;
 94 | DELIMITER ;;
 95 | CREATE DEFINER=`root`@`localhost` PROCEDURE `drop_iptables`()
 96 | BEGIN
 97 |    DELETE FROM ipproxy.free_ipproxy;
 98 |    DELETE FROM ipproxy.httpbin;
 99 |    TRUNCATE TABLE ipproxy.free_ipproxy;
100 |    TRUNCATE TABLE ipproxy.httpbin;
101 | END ;;
102 | DELIMITER ;
103 | /*!50003 SET sql_mode              = @saved_sql_mode */ ;
104 | /*!50003 SET character_set_client  = @saved_cs_client */ ;
105 | /*!50003 SET character_set_results = @saved_cs_results */ ;
106 | /*!50003 SET collation_connection  = @saved_col_connection */ ;
107 | /*!50003 DROP PROCEDURE IF EXISTS `ip_transfer` */;
108 | /*!50003 SET @saved_cs_client      = @@character_set_client */ ;
109 | /*!50003 SET @saved_cs_results     = @@character_set_results */ ;
110 | /*!50003 SET @saved_col_connection = @@collation_connection */ ;
111 | /*!50003 SET character_set_client  = utf8 */ ;
112 | /*!50003 SET character_set_results = utf8 */ ;
113 | /*!50003 SET collation_connection  = utf8_general_ci */ ;
114 | /*!50003 SET @saved_sql_mode       = @@sql_mode */ ;
115 | /*!50003 SET sql_mode              = '' */ ;
116 | DELIMITER ;;
117 | CREATE DEFINER=`root`@`localhost` PROCEDURE `ip_transfer`(IN valid_id INT)
118 | BEGIN DECLARE cur_ip char(25); DECLARE cur_port int(4); SELECT ip,port INTO cur_ip,cur_port FROM free_ipproxy WHERE id = valid_id; DELETE FROM httpbin WHERE ip =cur_ip AND port = cur_port;  INSERT INTO httpbin(ip,port,country,anonymity,https,speed,source)  SELECT ip,port,country,anonymity,https,speed,source  FROM free_ipproxy WHERE id = valid_id; DELETE FROM free_ipproxy where id = valid_id; END ;;
119 | DELIMITER ;
120 | /*!50003 SET sql_mode              = @saved_sql_mode */ ;
121 | /*!50003 SET character_set_client  = @saved_cs_client */ ;
122 | /*!50003 SET character_set_results = @saved_cs_results */ ;
123 | /*!50003 SET collation_connection  = @saved_col_connection */ ;
124 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
125 | 
126 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
127 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
128 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
129 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
130 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
131 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
132 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
133 | 
134 | -- Dump completed on 2018-01-25  4:01:20
135 | 


--------------------------------------------------------------------------------
/ipproxytool.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | 
 3 | import logging
 4 | import os
 5 | import sys
 6 | import subprocess
 7 | import run_validator
 8 | import run_validator_async
 9 | 
10 | if __name__ == '__main__':
11 | 
12 |     # 进入当前项目目录
13 |     os.chdir(sys.path[0])
14 | 
15 |     if not os.path.exists('log'):
16 |         os.makedirs('log')
17 | 
18 |     logging.basicConfig(
19 |             filename = 'log/ipproxy.log',
20 |             format = '%(asctime)s: %(message)s',
21 |             level = logging.DEBUG
22 |     )
23 | 
24 |     subprocess.Popen(['python', 'run_crawl_proxy.py'])
25 |     subprocess.Popen(['python', 'run_server.py'])
26 |     
27 |     if 'async' in sys.argv: 
28 |         run_validator_async.async_validator()
29 |     else:
30 |         run_validator.validator()
31 | 
32 | 
33 | 
34 | 


--------------------------------------------------------------------------------
/proxy.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | 
 4 | class Proxy(object):
 5 |     def __init__(self):
 6 |         self.id = 1
 7 |         self.ip = ''
 8 |         self.port = ''
 9 |         self.country = ''
10 |         self.anonymity = ''
11 |         self.https = ''
12 |         self.speed = ''
13 |         self.source = ''
14 |         self.vali_count = 0
15 | 
16 |     def __str__(self):
17 |         data = {
18 |             'ip': self.ip,
19 |             'port': self.port,
20 |             'country': self.country,
21 |             'anonymity': self.anonymity,
22 |             'https': self.https,
23 |             'speed': self.speed,
24 |             'source': self.source,
25 |             'vali_count': self.vali_count,
26 |         }
27 | 
28 |         return str(data)
29 | 
30 |     def __dict__(self):
31 |         data = {
32 |             'ip': self.ip,
33 |             'port': self.port,
34 |             'country': self.country,
35 |             'anonymity': self.anonymity,
36 |             'https': self.https,
37 |             'speed': self.speed,
38 |             'source': self.source,
39 |             'vali_count': self.vali_count,
40 |         }
41 | 
42 |         return data
43 | 
44 |     def get_dict(self):
45 |         data = {
46 |             'ip': self.ip,
47 |             'port': self.port,
48 |             'country': self.country,
49 |             'anonymity': self.anonymity,
50 |             'https': self.https,
51 |             'speed': self.speed,
52 |             'source': self.source,
53 |             'vali_count': self.vali_count,
54 |         }
55 | 
56 |         return data
57 | 
58 |     def set_value(self, ip, port, country, anonymity, source='unkonw', https='no', speed=-1, vali_count=0):
59 |         self.ip = ip
60 |         self.port = port
61 |         self.country = country
62 |         self.anonymity = self.get_anonymity_type(anonymity)
63 |         self.https = https
64 |         self.speed = speed
65 |         self.source = source
66 |         self.vali_count = vali_count
67 | 
68 |     def get_anonymity_type(self, anonymity):
69 |         '''There are 3 levels of proxies according to their anonymity.
70 | 
71 |             Level 1 - Elite Proxy / Highly Anonymous Proxy: The web server can't detect whether you are using a proxy.
72 |             Level 2 - Anonymous Proxy: The web server can know you are using a proxy, but it can't know your real IP.
73 |             Level 3 - Transparent Proxy: The web server can know you are using a proxy and it can also know your real
74 |             IP.
75 |         '''
76 | 
77 |         if anonymity == u'高匿代理' or anonymity == u'高匿名' or anonymity == 'elite proxy' or \
78 |                         anonymity == u'超级匿名' or anonymity == u'High':
79 |             return '1'
80 |         elif anonymity == u'匿名' or anonymity == 'anonymous' or anonymity == u'普通匿名' or anonymity == u'Medium':
81 |             return '2'
82 |         elif anonymity == u'透明' or anonymity == 'transparent' or anonymity == u'No':
83 |             return '3'
84 |         else:
85 |             return '3'
86 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | aiohttp==3.7.4
 2 | async-timeout==3.0.1
 3 | attrs==20.3.0
 4 | Automat==20.2.0
 5 | beautifulsoup4==4.9.3
 6 | bs4==0.0.1
 7 | certifi==2020.12.5
 8 | cffi==1.14.4
 9 | chardet==3.0.4
10 | click==7.1.2
11 | constantly==15.1.0
12 | crochet==1.12.0
13 | cryptography==3.3.1
14 | cssselect==1.1.0
15 | Flask==1.1.2
16 | hyperlink==20.0.1
17 | idna==2.10
18 | incremental==17.5.0
19 | itemadapter==0.2.0
20 | itemloaders==1.0.4
21 | itsdangerous==1.1.0
22 | Jinja2==2.11.2
23 | jmespath==0.10.0
24 | logzero==1.6.3
25 | lxml==4.6.2
26 | MarkupSafe==1.1.1
27 | multidict==5.1.0
28 | parsel==1.6.0
29 | Protego==0.1.16
30 | pyasn1==0.4.8
31 | pyasn1-modules==0.2.8
32 | pycparser==2.20
33 | PyDispatcher==2.0.5
34 | Pygments==2.7.3
35 | PyHamcrest==2.0.2
36 | pymongo==3.11.2
37 | PyMySQL==0.10.1
38 | pyOpenSSL==20.0.1
39 | queuelib==1.5.0
40 | requests==2.25.1
41 | Scrapy==2.4.1
42 | scrapydo==0.2.2
43 | service-identity==18.1.0
44 | six==1.15.0
45 | soupsieve==2.1
46 | Twisted==20.3.0
47 | typing-extensions==3.7.4.3
48 | urllib3==1.26.2
49 | w3lib==1.22.0
50 | Werkzeug==1.0.1
51 | wrapt==1.12.1
52 | yarl==1.6.3
53 | zope.interface==5.2.0
54 | 


--------------------------------------------------------------------------------
/run_crawl_proxy.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import logging
 4 | import os
 5 | import sys
 6 | import scrapydo
 7 | import time
 8 | import utils
 9 | import config
10 | 
11 | from sql import SqlManager
12 | from crawler.spiders.proxy.xicidaili import XiCiDaiLiSpider
13 | from crawler.spiders.proxy.sixsixip import SixSixIpSpider
14 | from crawler.spiders.proxy.ip181 import IpOneEightOneSpider
15 | from crawler.spiders.proxy.kuaidaili import KuaiDaiLiSpider
16 | from crawler.spiders.proxy.gatherproxy import GatherproxySpider
17 | from crawler.spiders.proxy.hidemy import HidemySpider
18 | from crawler.spiders.proxy.proxylistplus import ProxylistplusSpider
19 | from crawler.spiders.proxy.freeproxylists import FreeProxyListsSpider
20 | from crawler.spiders.proxy.usproxy import UsProxySpider
21 | from crawler.spiders.proxy.proxydb import ProxyDBSpider
22 | from crawler.spiders.proxy.data5u import Data5uSpider
23 | 
24 | 
25 | scrapydo.setup()
26 | 
27 | if __name__ == '__main__':
28 |     os.chdir(sys.path[0])
29 | 
30 |     if not os.path.exists('log'):
31 |         os.makedirs('log')
32 | 
33 |     logging.basicConfig(
34 |         filename = 'log/crawl_proxy.log',
35 |         format = '%(levelname)s %(asctime)s: %(message)s',
36 |         level = logging.DEBUG
37 |     )
38 |     sql = SqlManager()
39 | 
40 |     spiders = [
41 |         # XiCiDaiLiSpider, # 已失效
42 |         SixSixIpSpider,
43 |         IpOneEightOneSpider,
44 |         KuaiDaiLiSpider,  # 在访问前加了一个 js ，反爬
45 |         GatherproxySpider,
46 |        # HidemySpider,  已失效
47 |         ProxylistplusSpider,
48 |         FreeProxyListsSpider,
49 |         # PeulandSpider,  # 目标站点失效
50 |         UsProxySpider,
51 |         ProxyDBSpider,
52 |         Data5uSpider,
53 |     ]
54 |     while True:
55 |         utils.log('*******************run spider start...*******************')
56 |         #sql.delete_old(config.free_ipproxy_table, 0.5)
57 |         try:
58 |             for spider in spiders:
59 |                 scrapydo.run_spider(spider_cls = spider)
60 |         except Exception as e:
61 |             utils.log('[Error]# spider goes wroing.Return Message: {}'.format(str(e)))
62 |      
63 |         utils.log('*******************run spider waiting...*******************')
64 |         time.sleep(1200)
65 | 


--------------------------------------------------------------------------------
/run_server.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import os
 4 | import logging
 5 | import config
 6 | import utils
 7 | 
 8 | from server import dataserver
 9 | 
10 | if __name__ == '__main__':
11 |     if not os.path.exists('log'):
12 |         os.makedirs('log')
13 | 
14 |     logging.basicConfig(
15 |         filename='log/server.log',
16 |         format='%(levelname)s %(asctime)s: %(message)s',
17 |         level=logging.DEBUG
18 |     )
19 | 
20 |     utils.kill_ports([config.data_port])
21 | 
22 |     dataserver.app.run(
23 |         debug=False,
24 |         host='127.0.0.1',
25 |         port=config.data_port,
26 |     )
27 | 


--------------------------------------------------------------------------------
/run_spider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import os
 4 | import logging
 5 | import sys
 6 | 
 7 | from scrapy.crawler import CrawlerProcess
 8 | from scrapy.utils.log import configure_logging
 9 | from scrapy.utils.project import get_project_settings
10 | 
11 | 
12 | def runspider(name):
13 |     configure_logging(install_root_handler=False)
14 |     logging.basicConfig(
15 |         filename='log/%s.log' % name,
16 |         format='%(levelname)s %(asctime)s: %(message)s',
17 |         level=logging.DEBUG
18 |     )
19 |     process = CrawlerProcess(get_project_settings())
20 |     try:
21 |         logging.info('runspider start spider:%s' % name)
22 |         process.crawl(name)
23 |         process.start()
24 |     except Exception as e:
25 |         logging.exception('runspider spider:%s exception:%s' % (name, e))
26 | 
27 |     logging.debug('finish this spider:%s\n\n' % name)
28 | 
29 | 
30 | if __name__ == '__main__':
31 |     try:
32 |         name = sys.argv[1] or 'base'
33 |         runspider(name)
34 |     except Exception as e:
35 |         logging.exception('run_spider main exception msg:%s' % e)
36 | 


--------------------------------------------------------------------------------
/run_validator.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import logging
 4 | import os
 5 | import subprocess
 6 | import sys
 7 | import time
 8 | import scrapydo
 9 | import utils
10 | from importlib import import_module
11 | 
12 | VALIDATORS = {
13 |     'HttpBinSpider': 'crawler.spiders.validator.httpbin',
14 |     # 'DoubanSpider':'ipproxytool.spiders.validator.douban',
15 |     # 'AssetStoreSpider':'ipproxytool.spiders.validator.assetstore',
16 |     # 'GatherSpider' :'ipproxytool.spiders.validator.gather',
17 |     # 'HttpBinSpider' :'ipproxytool.spiders.validator.httpbin',
18 |     # 'SteamSpider' :'ipproxytool.spiders.validator.steam',
19 |     # 'BossSpider' :'ipproxytool.spiders.validator.boss',
20 |     # 'LagouSpider' :'ipproxytool.spiders.validator.lagou',
21 |     # 'LiepinSpider' :'ipproxytool.spiders.validator.liepin',
22 |     # 'JDSpider' :'ipproxytool.spiders.validator.jd',
23 |     # 'BBSSpider' :'ipproxytool.spiders.validator.bbs',
24 |     # 'ZhiLianSpider' :'ipproxytool.spiders.validator.zhilian',
25 |     # 'AmazonCnSpider' :'ipproxytool.spiders.validator.amazoncn',
26 | }
27 | 
28 | scrapydo.setup()
29 | 
30 | 
31 | def validator():
32 |     process_list = []
33 |     for item, path in VALIDATORS.items():
34 |         module = import_module(path)
35 |         validator = getattr(module, item)
36 |         popen = subprocess.Popen(['python', 'run_spider.py', validator.name], shell=False)
37 |         data = {
38 |             'name': validator.name,
39 |             'popen': popen,
40 |         }
41 |         process_list.append(data)
42 | 
43 |     while True:
44 |         time.sleep(60)
45 |         for process in process_list:
46 |             popen = process.get('popen', None)
47 |             utils.log('name:%s poll:%s' % (process.get('name'), popen.poll()))
48 | 
49 |             #  检测结束进程，如果有结束进程，重新开启
50 |             if popen != None and popen.poll() == 0:
51 |                 name = process.get('name')
52 |                 utils.log('%(name)s spider finish...\n' % {'name': name})
53 |                 process_list.remove(process)
54 |                 p = subprocess.Popen(['python', 'run_spider.py', name], shell=False)
55 |                 data = {
56 |                     'name': name,
57 |                     'popen': p,
58 |                 }
59 |                 process_list.append(data)
60 |                 time.sleep(1)
61 |                 break
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     os.chdir(sys.path[0])
66 | 
67 |     if not os.path.exists('log'):
68 |         os.makedirs('log')
69 | 
70 |     logging.basicConfig(
71 |         filename='log/validator.log',
72 |         format='%(asctime)s: %(message)s',
73 |         level=logging.DEBUG
74 |     )
75 | 
76 |     validator()
77 | 


--------------------------------------------------------------------------------
/run_validator_async.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import logging
 4 | import os
 5 | import sys
 6 | import time
 7 | import utils
 8 | import aiohttp
 9 | from aiohttp import ClientSession
10 | from sql.sql_manager import SqlManager
11 | import config
12 | import asyncio
13 | 
14 | TEST_URL='http://httpbin.org/ip'
15 | 
16 | async def test_connect(proxy,operator,mode=None):
17 |     conn = aiohttp.TCPConnector(verify_ssl=False)
18 |     async with ClientSession(connector=conn) as s:
19 |         try:
20 |             async with s.get(url=TEST_URL,proxy=proxy[2],
21 |                              timeout=10,allow_redirects=False) as resp:
22 |                 page = await resp.text()
23 |                 if (resp.status != 200 or str(resp.url) != TEST_URL):
24 |                     utils.log(('[INFO]#proxy:{ip} has been dropped\n'
25 |                                '      #Reason:Abnormal url or return Code').format(ip=proxy[1]))
26 |                     operator.del_proxy_with_id(config.free_ipproxy_table,proxy[0])
27 |                     operator.del_proxy_with_id(config.httpbin_table,proxy[0])
28 |                 elif mode == 'add':
29 |                     operator.insert_valid_proxy(id=proxy[0])
30 |                 else:
31 |                     operator.update_valid_proxy(id=proxy[0])
32 |                    
33 |         except Exception as e:
34 |             utils.log(('[INFO]#proxy:{ip} has been dropped\n'
35 |                        '      #Reason:{msg}').format(ip=proxy[1],msg=str(e)))
36 |             operator.del_proxy_with_id(config.free_ipproxy_table,proxy[0])
37 |             operator.del_proxy_with_id(config.httpbin_table,proxy[0])
38 |         finally:
39 |             operator.commit()
40 | 
41 | 
42 | def async_validator():
43 |     utils.log('[INFO]#Loading ip proxies....60 sec left')
44 |     time.sleep(60)
45 |     proxy_factory = SqlManager()
46 |     loop = asyncio.get_event_loop()
47 |     def test_process(table_name,mode=None,limit=50):
48 |         id_list = proxy_factory.get_proxy_ids(table_name) 
49 |         if len(id_list) > 0:
50 |             task_len = len(id_list)
51 |             cur_id = 0
52 |             for sig in range(0,task_len,limit):
53 |                 proxies = proxy_factory.get_proxies_info(table_name=table_name,
54 |                                                          start_id=cur_id,
55 |                                                          limit=limit)
56 |                 if len(proxies) == 0:
57 |                     break
58 |                 cur_id = proxies[-1][0]
59 |                 proxies = [[proxy[0],proxy[1],'http://{}:{}'.format(proxy[1],proxy[2])] for proxy in proxies]
60 |                 tasks = [test_connect(proxy,proxy_factory,mode) for proxy in proxies]
61 |                 loop.run_until_complete(asyncio.wait(tasks))
62 |     while True:
63 |         utils.log('[INFO]Validator process started')
64 |         utils.log('[INFO]Validator process:Verify mode start')
65 |         test_process(config.httpbin_table)
66 |         utils.log('[INFO]Validator process:Add mode start')
67 |         test_process(config.free_ipproxy_table,mode='add')
68 |         utils.log('[INFO]Validator process completed')
69 |         time.sleep(300)
70 | 
71 | 
72 | if __name__ == '__main__':
73 |     if not os.path.exists('log'):
74 |         os.makedirs('log')
75 | 
76 |     logging.basicConfig(
77 |         filename = 'log/validator.log',
78 |         format = '%(asctime)s: %(message)s',
79 |         level = logging.INFO
80 |     )
81 |     async_validator()
82 | 
83 | 
84 | 


--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = crawler.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = crawler
12 | 


--------------------------------------------------------------------------------
/server/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 | 
3 | 


--------------------------------------------------------------------------------
/server/dataserver.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import json
 4 | import logging
 5 | import sys
 6 | import config
 7 | 
 8 | from proxy import Proxy
 9 | from sql import SqlManager
10 | from flask import Flask
11 | from flask import request
12 | 
13 | app = Flask(__name__)
14 | 
15 | 
16 | @app.route('/')
17 | def index():
18 |     return 'Hello, World!'
19 | 
20 | 
21 | @app.route('/insert')
22 | def insert():
23 |     sql = SqlManager()
24 |     name = request.args.get('name')
25 |     proxy = Proxy()
26 |     proxy.set_value(
27 |         ip=request.args.get('ip'),
28 |         port=request.args.get('port'),
29 |         country=request.args.get('country', None),
30 |         anonymity=request.args.get('anonymity', None),
31 |         https=request.args.get('https', 'no'),
32 |         speed=request.args.get('speed', -1),
33 |         source=request.args.get('source', name),
34 |     )
35 | 
36 |     result = sql.insert_proxy(name, proxy)
37 |     data = {
38 |         'result': result
39 |     }
40 | 
41 |     return json.dumps(data, indent=4)
42 | 
43 | 
44 | @app.route('/select')
45 | def select():
46 |     sql = SqlManager()
47 |     name = request.args.get('name')
48 |     anonymity = request.args.get('anonymity', '')
49 |     https = request.args.get('https', '')
50 |     order = request.args.get('order', 'speed')
51 |     sort = request.args.get('sort', 'asc')
52 |     count = request.args.get('count', 100)
53 | 
54 |     kwargs = {
55 |         'anonymity': anonymity,
56 |         'https': https,
57 |         'order': order,
58 |         'sort': sort,
59 |         'count': count
60 |     }
61 |     result = sql.select_proxy(name, **kwargs)
62 |     data = [{
63 |         'ip': item.get('ip'), 'port': item.get('port'),
64 |         'anonymity': item.get('anonymity'), 'https': item.get('https'),
65 |         'speed': item.get('speed'), 'save_time': item.get('save_time', '')
66 |     } for item in result]
67 |     return json.dumps(data, indent=4)
68 | 
69 | 
70 | @app.route('/delete')
71 | def delete():
72 |     sql = SqlManager()
73 |     name = request.args.get('name')
74 |     ip = request.args.get('ip')
75 |     result = sql.del_proxy_with_ip(name, ip)
76 |     data = {'result': result}
77 | 
78 |     return json.dumps(data, indent=4)
79 | 
80 | @app.route('/query')
81 | def query():
82 |     sql = SqlManager()
83 |     start_id = request.args.get('sid')
84 |     limit = int(request.args.get('limit','100'))
85 |     proxies = sql.get_proxies_info(config.httpbin_table,start_id=start_id,limit=limit)
86 |     data = [{'id':proxy[0],'ip':proxy[1],'port':proxy[2],'https':proxy[3]}
87 |             for proxy in proxies]
88 |     return json.dumps(data,indent=4)
89 |     
90 | 
91 |     
92 | 


--------------------------------------------------------------------------------
/sql/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 | 
3 | import sql.sql_base
4 | from sql.mysql import MySql
5 | from sql.mongodb import Mongodb
6 | from sql.sql_manager import SqlManager


--------------------------------------------------------------------------------
/sql/mongodb.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import logging
 4 | import pymongo
 5 | import config
 6 | import json
 7 | import datetime
 8 | 
 9 | from proxy import Proxy
10 | from sql.sql_base import SqlBase
11 | 
12 | 
13 | class Mongodb(SqlBase):
14 |     def __init__(self, **kwargs):
15 |         super(Mongodb, self).__init__(**kwargs)
16 |         self.client = pymongo.MongoClient(**kwargs)
17 |         self.db = self.client[config.database]
18 | 
19 |     def init_database(self, database_name):
20 |         pass
21 | 
22 |     def init_proxy_table(self, table_name):
23 |         pass
24 | 
25 |     def insert_proxy(self, table_name, proxy):
26 |         data = proxy.get_dict()
27 |         data['save_time'] = str(datetime.datetime.now())
28 |         self.db[table_name].insert(data)
29 | 
30 |     def select_proxy(self, table_name, **kwargs):
31 |         filter = {}
32 |         if kwargs.get('anonymity') != '':
33 |             filter['anonymity'] = kwargs.get('anonymity')
34 |         if kwargs.get('https') != '':
35 |             filter['https'] = kwargs.get('https')
36 | 
37 |         data = [item for item in self.db[table_name].find(filter).limit(int(kwargs.get('count')))]
38 |         return data
39 | 
40 |     def update_proxy(self, table_name, proxy):
41 |         self.db[table_name].update_one(
42 |             {'_id': proxy.id},
43 |             {'$set':
44 |                  {'https': proxy.https, 'speed': proxy.speed, 'vali_count': proxy.vali_count,
45 |                   'anonymity': proxy.anonymity, 'save_time': str(datetime.datetime.now())}})
46 | 
47 |     def delete_proxy(self, table_name, proxy):
48 |         return self.del_proxy_with_id(table_name, proxy.id)
49 | 
50 |     def delete_old(self, table_name, day):
51 |         start = datetime.datetime.now()
52 |         end = datetime.datetime.now()
53 |         pass
54 | 
55 |     def get_proxy_count(self, table_name):
56 |         count = self.db[table_name].find().count()
57 |         logging.debug('count:%s' % count)
58 |         return count
59 | 
60 |     def get_proxy_ids(self, table_name):
61 |         ids = self.db[table_name].distinct('_id')
62 |         logging.debug('ids:%s' % ids)
63 |         return ids
64 | 
65 |     def get_proxy_with_id(self, table_name, id):
66 |         data = self.db[table_name].find_one({'_id': id})
67 |         logging.debug(data)
68 |         proxy = Proxy()
69 |         proxy.set_value(
70 |             ip=data.get('ip'),
71 |             port=data.get('port'),
72 |             country=data.get('country'),
73 |             anonymity=data.get('country'),
74 |             https=data.get('https'),
75 |             speed=data.get('speed'),
76 |             source=data.get('source'),
77 |             vali_count=data.get('vali_count')
78 |         )
79 |         proxy.id = data.get('_id')
80 |         return proxy
81 | 
82 |     def del_proxy_with_id(self, table_name, id):
83 |         self.db[table_name].delete_one({'_id': id})
84 |         return True
85 | 
86 |     def del_proxy_with_ip(self, table_name, ip):
87 |         self.db[table_name].delete_one({'ip': ip})
88 |         return True
89 | 
90 |     def commit(self):
91 |         pass
92 | 


--------------------------------------------------------------------------------
/sql/mysql.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | import logging
  4 | import utils
  5 | import config
  6 | import pymysql
  7 | 
  8 | from proxy import Proxy
  9 | from sql.sql_base import SqlBase
 10 | 
 11 | 
 12 | class MySql(SqlBase):
 13 |     def __init__(self, **kwargs):
 14 |         super(MySql, self).__init__(**kwargs)
 15 | 
 16 |         self.conn = pymysql.connect(**kwargs)
 17 |         self.cursor = self.conn.cursor()
 18 | 
 19 |         try:
 20 |             self.conn.select_db(config.database)
 21 |         except:
 22 |             self.create_database(config.database)
 23 |             self.conn.select_db(config.database)
 24 | 
 25 |     def create_database(self, database_name):
 26 |         try:
 27 |             command = 'CREATE DATABASE IF NOT EXISTS %s DEFAULT CHARACTER SET \'utf8\' ' % database_name
 28 |             logging.debug('mysql create_database command:%s' % command)
 29 |             self.cursor.execute(command)
 30 |             self.conn.commit()
 31 |         except Exception as e:
 32 |             logging.exception('mysql create_database exception:%s' % e)
 33 | 
 34 |     def init_database(self, database_name):
 35 |         try:
 36 |             command = 'CREATE DATABASE IF NOT EXISTS %s DEFAULT CHARACTER SET \'utf8\' ' % database_name
 37 |             logging.debug('mysql create_database command:%s' % command)
 38 |             self.cursor.execute(command)
 39 |             self.conn.commit()
 40 |         except Exception as e:
 41 |             logging.exception('mysql create_database exception:%s' % e)
 42 | 
 43 |     def init_proxy_table(self, table_name):
 44 |         command = (
 45 |             "CREATE TABLE IF NOT EXISTS {} ("
 46 |             "`id` INT(8) NOT NULL AUTO_INCREMENT,"
 47 |             "`ip` CHAR(25) NOT NULL UNIQUE,"
 48 |             "`port` INT(4) NOT NULL,"
 49 |             "`country` TEXT DEFAULT NULL,"
 50 |             "`anonymity` INT(2) DEFAULT NULL,"
 51 |             "`https` CHAR(4) DEFAULT NULL ,"
 52 |             "`speed` FLOAT DEFAULT NULL,"
 53 |             "`source` CHAR(20) DEFAULT NULL,"
 54 |             "`vali_count` INT(5) DEFAULT 0,"
 55 |             "`created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP,"
 56 |             "`updated_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"
 57 |             "PRIMARY KEY(id),"
 58 |             "UNIQUE KEY `uniq_ip` (`ip`)"
 59 |             ") ENGINE=InnoDB".format(table_name))
 60 | 
 61 |         self.cursor.execute(command)
 62 |         self.conn.commit()
 63 | 
 64 |     def insert_proxy(self, table_name, proxy):
 65 |         try:
 66 |             command = ("INSERT IGNORE INTO {} "
 67 |                        "(id, ip, port, country, anonymity, https, speed, source, vali_count)"
 68 |                        "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)".format(table_name))
 69 | 
 70 |             data = (None, proxy.ip, proxy.port, proxy.country, proxy.anonymity,
 71 |             proxy.https, proxy.speed, proxy.source, proxy.vali_count)
 72 | 
 73 |             self.cursor.execute(command, data)
 74 |             return True
 75 |         except Exception as e:
 76 |             logging.exception('mysql insert_proxy exception msg:%s' % e)
 77 |             return False
 78 | 
 79 |     def insert_valid_proxy(self, id):
 80 |         try:
 81 |             command = "CALL ip_transfer({id})".format(id=id)
 82 |             self.cursor.execute(command)
 83 |         except Exception as e:
 84 |             logging.exception('[Error]mysql#insert_valid_proxy Exception msg:{}'.format(str(e)))
 85 |             raise e
 86 | 
 87 |     def select_proxy(self, table_name, **kwargs):
 88 |         filter = {}
 89 |         for k, v in kwargs.items():
 90 |             if v != '':
 91 |                 filter[k] = v
 92 | 
 93 |         table_name = table_name if table_name else 'free_ipproxy'
 94 | 
 95 |         try:
 96 |             command = "SELECT * FROM {name} WHERE anonymity LIKE '{anonymity}' AND https LIKE '{https}' ORDER BY " \
 97 |                       "{order} {sort} limit {count}". \
 98 |                 format(name=table_name, anonymity=filter.get('anonymity', '%'),
 99 |                        https=filter.get('https', '%'), order=filter.get('order', 'save_time'),
100 |                        sort=filter.get('sort', 'desc'), count=filter.get('count', 100))
101 |             result = self.query(command)
102 |             data = [{
103 |                 'ip': item[1], 'port': item[2], 'anonymity': item[4], 'https': item[5],
104 |                 'speed': item[6], 'save_time': str(item[8])
105 |             } for item in result]
106 |             return data
107 |         except Exception as e:
108 |             logging.exception('mysql select_proxy exception msg:%s' % e)
109 |         return []
110 | 
111 |     def update_proxy(self, table_name, proxy):
112 |         try:
113 |             command = "UPDATE {table_name} set https='{https}', speed={speed}, " \
114 |                       "vali_count={vali_count}, anonymity = {anonymity},save_time={save_time} " \
115 |                       "where id={id};".format(
116 |                 table_name=table_name, https=proxy.https,
117 |                 speed=proxy.speed, id=proxy.id, vali_count=proxy.vali_count, anonymity=proxy.anonymity,
118 |                 save_time='NOW()')
119 |             logging.debug('mysql update_proxy command:%s' % command)
120 |             self.cursor.execute(command)
121 |         except Exception as e:
122 |             logging.exception('mysql update_proxy exception msg:%s' % e)
123 | 
124 |     def update_valid_proxy(self, id=0):
125 |         try:
126 |             command = "UPDATE httpbin SET vali_count=vali_count+1 WHERE id={id}".format(id=id)
127 |             affected_row = self.cursor.execute(command)
128 |             self.commit()
129 |             return affected_row
130 |         except Exception as e:
131 |             logging.exception('[mysql] update_valid_proxy exception:{msg}'.format(str(e)))
132 | 
133 |     def delete_proxy(self, table_name, proxy):
134 |         self.del_proxy_with_id(table_name=table_name, id=proxy.id)
135 | 
136 |     def delete_old(self, table_name, day):
137 |         try:
138 |             command = "DELETE FROM {table} where save_time < SUBDATE(NOW(), INTERVAL {day} DAY)".format(
139 |                 table=config.free_ipproxy_table, day=day)
140 | 
141 |             self.cursor.execute(command)
142 |             self.commit()
143 |         except Exception as e:
144 |             logging.exception('mysql delete_old exception msg:%s' % e)
145 | 
146 |     def get_proxy_count(self, table_name):
147 |         try:
148 |             command = "SELECT COUNT(*) from {}".format(table_name)
149 |             count, = self.query_one(command)
150 |             logging.debug('mysql get_proxy_count count:%s' % count)
151 |             return count
152 |         except Exception as e:
153 |             logging.exception('mysql get_proxy_count exception msg:%s' % e)
154 | 
155 |         return 0
156 | 
157 |     def get_proxy_ids(self, table_name):
158 |         ids = []
159 |         try:
160 |             command = "SELECT id from {}".format(table_name)
161 |             result = self.query(command)
162 |             ids = [item[0] for item in result]
163 |         except Exception as e:
164 |             logging.exception('mysql get_proxy_ids exception msg:%s' % e)
165 | 
166 |         return ids
167 | 
168 |     def get_proxy_with_id(self, table_name, id):
169 |         proxy = Proxy()
170 |         try:
171 |             command = "SELECT * FROM {0} WHERE id=\'{1}\'".format(table_name, id)
172 |             result = self.query_one(command)
173 |             if result != None:
174 |                 # data = {
175 |                 #     'id': result[0],
176 |                 #     'ip': result[1],
177 |                 #     'port': result[2],
178 |                 #     'country': result[3],
179 |                 #     'anonymity': result[4],
180 |                 #     'https': result[5],
181 |                 #     'speed': result[6],
182 |                 #     'source': result[7],
183 |                 #     'save_time': result[8],
184 |                 #     'vali_count': result[9],
185 |                 # }
186 |                 proxy = Proxy()
187 |                 proxy.set_value(
188 |                     ip=result[1],
189 |                     port=result[2],
190 |                     country=result[3],
191 |                     anonymity=result[4],
192 |                     https=result[5],
193 |                     speed=result[6],
194 |                     source=result[7],
195 |                     vali_count=result[9])
196 |                 proxy.id = result[0]
197 |                 proxy.save_time = result[8]
198 |         except Exception as e:
199 |             logging.exception('mysql get_proxy_ids exception msg:%s' % e)
200 | 
201 |         return proxy
202 | 
203 |     def get_proxies_info(self, table_name, start_id=0, limit=100):
204 |         '''批量获取代理表中的id，ip和port信息
205 |         Args:
206 |             @table_name 表名
207 |             @start_id  起始id
208 |             @limit   单批次最大记录数
209 | 
210 |         Return
211 |             包含id,ip,port信息的元组列表
212 | 
213 |         '''
214 |         command = ('SELECT id,ip,port,https from {table} where id >={start_id}'
215 |                    ' order by id asc limit {limit}')
216 |         command = command.format(table=table_name, start_id=start_id, limit=limit)
217 |         proxies_info = []
218 |         try:
219 |             result = self.query(command)
220 |             proxies_info = [proxy for proxy in result]
221 |         except Exception as e:
222 |             logging.exception('[ERROR]#mysql get_proxies_info: {msg}'.format(msg=e))
223 | 
224 |         return proxies_info
225 | 
226 |     def del_proxy_with_id(self, table_name, id):
227 |         res = False
228 |         try:
229 |             command = "DELETE FROM {0} WHERE id={1}".format(table_name, id)
230 |             self.cursor.execute(command)
231 |             res = True
232 |         except Exception as e:
233 |             logging.exception('mysql get_proxy_ids exception msg:%s' % e)
234 | 
235 |         return res
236 | 
237 |     def del_proxy_with_ip(self, table_name, ip):
238 |         res = False
239 |         try:
240 |             command = "DELETE FROM {0} WHERE ip='{1}'".format(table_name, ip)
241 |             self.cursor.execute(command)
242 |             self.commit()
243 |             res = True
244 |         except Exception as e:
245 |             logging.exception('mysql del_proxy_with_ip exception msg:%s' % e)
246 | 
247 |         return res
248 | 
249 |     def create_table(self, command):
250 |         try:
251 |             logging.debug('mysql create_table command:%s' % command)
252 |             x = self.cursor.execute(command)
253 |             self.conn.commit()
254 |             return x
255 |         except Exception as e:
256 |             logging.exception('mysql create_table exception:%s' % e)
257 | 
258 |     def insert_data(self, command, data, commit=False):
259 |         try:
260 |             logging.debug('mysql insert_data command:%s, data:%s' % (command, data))
261 |             x = self.cursor.execute(command, data)
262 |             if commit:
263 |                 self.conn.commit()
264 |             return x
265 |         except Exception as e:
266 |             logging.debug('mysql insert_data exception msg:%s' % e)
267 | 
268 |     def commit(self):
269 |         self.conn.commit()
270 | 
271 |     def execute(self, command, commit=True):
272 |         try:
273 |             logging.debug('mysql execute command:%s' % command)
274 |             data = self.cursor.execute(command)
275 |             if commit:
276 |                 self.conn.commit()
277 |             return data
278 |         except Exception as e:
279 |             logging.exception('mysql execute exception msg:%s' % e)
280 |             return None
281 | 
282 |     def query(self, command, commit=False):
283 |         try:
284 |             logging.debug('mysql execute command:%s' % command)
285 | 
286 |             self.cursor.execute(command)
287 |             data = self.cursor.fetchall()
288 |             if commit:
289 |                 self.conn.commit()
290 |             return data
291 |         except Exception as e:
292 |             logging.exception('mysql execute exception msg:%s' % e)
293 |             return None
294 | 
295 |     def query_one(self, command, commit=False):
296 |         try:
297 |             logging.debug('mysql execute command:%s' % command)
298 | 
299 |             self.cursor.execute(command)
300 |             data = self.cursor.fetchone()
301 |             if commit:
302 |                 self.conn.commit()
303 | 
304 |             return data
305 |         except Exception as e:
306 |             logging.debug('mysql execute exception msg:%s' % str(e))
307 |             return None
308 | 


--------------------------------------------------------------------------------
/sql/sql_base.py:
--------------------------------------------------------------------------------
 1 | #-*- coding: utf-8 -*-
 2 | 
 3 | class SqlBase(object):
 4 |     def __init__(self, **kwargs):
 5 |         pass
 6 | 
 7 |     def init_database(self, database_name):
 8 |         pass
 9 | 
10 |     def init_proxy_table(self, table_name):
11 |         pass
12 | 
13 |     def insert_proxy(self, table_name, proxy):
14 |         pass
15 | 
16 |     def select_proxy(self, table_name, **kwargs):
17 |         pass
18 | 
19 |     def update_proxy(self, table_name, proxy):
20 |         pass
21 | 
22 |     def delete_proxy(self, table_name, proxy):
23 |         pass
24 | 
25 |     def delete_old(self, table_name, day):
26 |         pass
27 | 
28 |     def get_proxy_count(self, table_name):
29 |         pass
30 | 
31 |     def get_proxy_ids(self, table_name):
32 |         pass
33 | 
34 |     def get_proxy_with_id(self, table_name, id):
35 |         pass
36 | 
37 |     def del_proxy_with_id(self, table_name, id):
38 |         pass
39 | 
40 |     def del_proxy_with_ip(self, table_name, ip):
41 |         pass
42 | 
43 |     def commit(self):
44 |         pass
45 | 


--------------------------------------------------------------------------------
/sql/sql_manager.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import config
 4 | 
 5 | from sql.sql_base import SqlBase
 6 | 
 7 | 
 8 | class SqlManager(object):
 9 |     def __init__(self):
10 |         db_type = config.DB_config.get('db_type', 'mysql')
11 |         db_config = config.DB_config.get(db_type)
12 | 
13 |         if db_type == 'mysql':
14 |             from sql.mysql import MySql
15 |             self.sql = MySql(**db_config)
16 |         elif db_type == 'redis':
17 |             pass
18 |         elif db_type == 'sqlite':
19 |             pass
20 |         elif db_type == 'mongodb':
21 |             from sql.mongodb import Mongodb
22 |             self.sql = Mongodb(**db_config)
23 |         else:  # default mysql
24 |             from sql.mysql import MySql
25 |             self.sql = MySql(**config.DB_config.get('db_type'))
26 | 
27 |     def init_database(self, database_name):
28 |         pass
29 | 
30 |     def init_proxy_table(self, table_name):
31 |         return self.sql.init_proxy_table(table_name)
32 | 
33 |     def insert_proxy(self, table_name, proxy):
34 |         return self.sql.insert_proxy(table_name, proxy)
35 | 
36 |     def insert_valid_proxy(self,id=id):
37 |         return self.sql.insert_valid_proxy(id)
38 |  
39 |     def select_proxy(self, table_name, **kwargs):
40 |         return self.sql.select_proxy(table_name, **kwargs)
41 | 
42 |     def update_proxy(self, table_name, proxy):
43 |         return self.sql.update_proxy(table_name, proxy)
44 | 
45 |     def update_valid_proxy(self,id=0):
46 |         return self.sql.update_valid_proxy(id=id)
47 |  
48 |     def delete_proxy(self, table_name, proxy):
49 |         return self.sql.delete_proxy(table_name, proxy)
50 | 
51 |     def delete_old(self, table_name, day):
52 |         return self.sql.delete_old(table_name, day)
53 | 
54 |     def get_proxy_count(self, table_name):
55 |         return self.sql.get_proxy_count(table_name = table_name)
56 | 
57 |     def get_proxy_ids(self, table_name):
58 |         return self.sql.get_proxy_ids(table_name = table_name)
59 | 
60 |     def get_proxy_with_id(self, table_name, id):
61 |         return self.sql.get_proxy_with_id(table_name = table_name, id = id)
62 | 
63 |     def del_proxy_with_id(self, table_name, id):
64 |         return self.sql.del_proxy_with_id(table_name = table_name, id = id)
65 | 
66 |     def del_proxy_with_ip(self, table_name, ip):
67 |         return self.sql.del_proxy_with_ip(table_name = table_name, ip = ip)
68 | 
69 |     def get_proxies_info(self,table_name,start_id=0,limit=10):
70 |         return self.sql.get_proxies_info(table_name=table_name, start_id=start_id, limit=limit)
71 | 
72 |     def commit(self):
73 |         return self.sql.commit()
74 | 


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import logging
 4 | import os
 5 | import re
 6 | import subprocess
 7 | import traceback
 8 | import time
 9 | import datetime
10 | 
11 | 
12 | # 自定义的日志输出
13 | def log(msg, level=logging.DEBUG):
14 |     logging.log(level, msg)
15 |     print('%s [%s], msg:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), level, msg))
16 | 
17 |     if level == logging.WARNING or level == logging.ERROR:
18 |         for line in traceback.format_stack():
19 |             print(line.strip())
20 | 
21 |         for line in traceback.format_stack():
22 |             logging.log(level, line.strip())
23 | 
24 | 
25 | # 服务器使用，清理端口占用
26 | def kill_ports(ports):
27 |     for port in ports:
28 |         log('kill %s start' % port)
29 |         popen = subprocess.Popen('lsof -i:%s' % port, shell=True, stdout=subprocess.PIPE)
30 |         (data, err) = popen.communicate()
31 |         log('data:\n%s  \nerr:\n%s' % (data, err))
32 | 
33 |         pattern = re.compile(r'\b\d+\b', re.S)
34 |         pids = re.findall(pattern, data.decode())
35 | 
36 |         log('pids:%s' % str(pids))
37 | 
38 |         for pid in pids:
39 |             if pid != '' and pid != None:
40 |                 try:
41 |                     log('pid:%s' % pid)
42 |                     popen = subprocess.Popen('kill -9 %s' % pid, shell=True, stdout=subprocess.PIPE)
43 |                     (data, err) = popen.communicate()
44 |                     log('data:\n%s  \nerr:\n%s' % (data, err))
45 |                 except Exception as e:
46 |                     log('kill_ports exception:%s' % e)
47 | 
48 |         log('kill %s finish' % port)
49 | 
50 |     time.sleep(1)
51 | 
52 | 
53 | # 创建文件夹
54 | def make_dir(dir):
55 |     log('make dir:%s' % dir)
56 |     if not os.path.exists(dir):
57 |         os.makedirs(dir)
58 | 


--------------------------------------------------------------------------------
/weixin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awolfly9/IPProxyTool/4e4e3aadd30a75f74393b54e8077568b6a58a813/weixin.png


--------------------------------------------------------------------------------