├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
├── __init__.py
├── config.py
├── crawler
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       ├── proxy
│       │   ├── __init__.py
│       │   ├── basespider.py
│       │   ├── data5u.py
│       │   ├── freeproxylists.py
│       │   ├── gatherproxy.py
│       │   ├── hidemy.py
│       │   ├── ip181.py
│       │   ├── kuaidaili.py
│       │   ├── proxydb.py
│       │   ├── proxylistplus.py
│       │   ├── sixsixip.py
│       │   ├── usproxy.py
│       │   └── xicidaili.py
│       └── validator
│           ├── __init__.py
│           ├── amazoncn.py
│           ├── anjuke.py
│           ├── assetstore.py
│           ├── baidu.py
│           ├── bbs.py
│           ├── boss.py
│           ├── douban.py
│           ├── gather.py
│           ├── httpbin.py
│           ├── jd.py
│           ├── lagou.py
│           ├── liepin.py
│           ├── steam.py
│           ├── validator.py
│           └── zhilian.py
├── db.sql
├── ipproxytool.py
├── proxy.py
├── requirements.txt
├── run_crawl_proxy.py
├── run_server.py
├── run_spider.py
├── run_validator.py
├── run_validator_async.py
├── scrapy.cfg
├── server
│   ├── __init__.py
│   └── dataserver.py
├── sql
│   ├── __init__.py
│   ├── mongodb.py
│   ├── mysql.py
│   ├── sql_base.py
│   └── sql_manager.py
├── utils.py
└── weixin.png
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | # custom file
92 | *.json
93 | *.idea
94 | *.DS_Store
95 | *.pyc
96 | test*
97 | headers.py
98 |
99 | # custom dir
100 | log/
101 |
102 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM docker.io/mrjogo/scrapy
2 | ENV PATH /usr/local/bin:$PATH
3 | ENV PATH /home:$PATH
4 | ADD . /home
5 | WORKDIR /home
6 | RUN pip install -i https://mirrors.aliyun.com/pypi/simple -r requirements.txt
7 | CMD python ipproxytool.py
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IPProxyTool
2 | Uses Scrapy spiders to crawl proxy websites and collect a large number of free proxy IPs, then filters out the usable ones and stores them in a database for later use.
3 | You can also visit my personal site to see more of my projects: [西瓜](http://xigua233.com/)
4 |
5 | Thanks to [youngjeff](https://github.com/youngjeff) for maintaining this project together with me.
6 |
7 | ## Requirements
8 | Install Python 3 and a MySQL database.
9 |
10 | System packages needed to build the cryptography module:
11 | ```
12 | sudo yum install gcc libffi-devel python-devel openssl-devel
13 | ```
14 |
15 |
16 | ```
17 | $ pip install -r requirements.txt
18 | ```
19 |
20 |
21 |
22 | ## Download and Run
23 | Clone the project:
24 |
25 | ```
26 | $ git clone https://github.com/awolfly9/IPProxyTool.git
27 | ```
28 |
29 | Enter the project directory:
30 |
31 | ```
32 | $ cd IPProxyTool
33 | ```
34 | Edit the MySQL settings in [config.py](https://github.com/awolfly9/IPProxyTool/blob/master/config.py): set the user and password under the mysql entry of DB_config to your own database credentials.
35 |
36 | ```
37 | $ vim config.py
38 | ---------------
39 |
40 | DB_config = {
41 |     'db_type': 'mysql',
42 |     'mysql': {
43 |         'host': 'localhost', 'port': 3306, 'charset': 'utf8',
44 |         'user': 'root', 'password': '123456',
45 |     },
46 | }
47 | ```
48 |
49 | MySQL: import the table schema
50 | ```
51 | mysql> create database ipproxy;
52 | Query OK, 1 row affected (0.00 sec)
53 | mysql> use ipproxy;
54 | Database changed
55 | mysql> source /path/to/IPProxyTool/db.sql
56 |
57 | ```
58 |
59 |
60 | Run the launcher script ipproxytool.py. You can also run the crawl, validation and server scripts separately; see the project description below.
61 |
62 | ```
63 | $ python ipproxytool.py
64 | ```
65 |
66 | An asynchronous validation mode is also available; run it as follows:
67 |
68 | ```
69 | $ python ipproxytool.py async
70 | ```
71 |
72 |
73 | ## Project Description
74 | #### Crawling proxy websites
75 | All the code that crawls proxy websites lives in [proxy](https://github.com/awolfly9/IPProxyTool/tree/master/ipproxytool/spiders/proxy)
76 | ##### Adding a spider for another proxy website
77 | 1. Create a new script in the proxy directory and subclass BaseSpider
78 | 2. Set name, urls and headers
79 | 3. Override the parse_page method to extract the proxy data
80 | 4. Store the data in the database; see [ip181](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/ip181.py) and [kuaidaili](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/kuaidaili.py), and the sketch after this list
81 | 5. For particularly complex proxy websites, see [peuland](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/peuland.py)
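
A minimal sketch of such a spider, modelled on the existing ip181 spider (the file name, target URL and XPath expressions below are placeholders, not a real site):

```
# crawler/spiders/proxy/example.py  (hypothetical)
from scrapy import Selector

from .basespider import BaseSpider
from proxy import Proxy


class ExampleSpider(BaseSpider):
    name = 'example'

    def __init__(self, *a, **kw):
        super(ExampleSpider, self).__init__(*a, **kw)
        self.urls = ['http://www.example.com/free-proxy-list']  # placeholder URL
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.init()

    def parse_page(self, response):
        self.write(response.body)
        # placeholder XPaths -- adapt them to the target site's table layout
        for row in Selector(response).xpath('//tbody/tr'):
            proxy = Proxy()
            proxy.set_value(
                ip=row.xpath('td[1]/text()').extract_first(),
                port=row.xpath('td[2]/text()').extract_first(),
                country=row.xpath('td[3]/text()').extract_first(),
                anonymity=row.xpath('td[4]/text()').extract_first(),
                source=self.name,
            )
            self.add_proxy(proxy=proxy)
```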
82 |
83 | ##### Edit run_crawl_proxy.py to import the new spider and add it to the crawl queue
84 |
85 | You can run run_crawl_proxy.py on its own to start crawling the proxy websites:
86 |
87 | ```
88 | $ python run_crawl_proxy.py
89 | ```
90 |
91 | #### Validating proxy IPs
92 | The current validation flow (a standalone sketch follows this list):
93 | 1. Fetch all the proxy IPs crawled and stored in the previous step from the database
94 | 2. Use each proxy IP to request [httpbin](http://httpbin.org/get?show_env=1)
95 | 3. From the response, determine whether the proxy works, whether it supports HTTPS and its anonymity level, then store it in the httpbin table
96 | 4. Take proxies from the httpbin table and use them to visit a target website, e.g. [豆瓣](https://www.douban.com/)
97 | 5. If the request returns valid data within a reasonable time, the proxy IP is considered usable and is stored in the corresponding table
98 |
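A standalone sketch of what steps 2 and 3 do for a single proxy. This is only an illustration using requests; the real implementation is the httpbin validator spider, and check_proxy / my_real_ip are names made up for this example:

```
import time
import requests

def check_proxy(ip, port, my_real_ip, timeout=10):
    # request httpbin through the proxy; a reply within the timeout means it works
    address = 'http://%s:%s' % (ip, port)
    start = time.time()
    data = requests.get('http://httpbin.org/get?show_env=1',
                        proxies={'http': address, 'https': address},
                        timeout=timeout).json()
    return {
        'speed': time.time() - start,                        # rough response time
        # if our real IP does not show up in the reported origin, the proxy hides it
        'anonymous': my_real_ip not in data.get('origin', ''),
    }
```
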
99 | Each target website has its own script; all the proxy-validation code lives in [validator](https://github.com/awolfly9/IPProxyTool/tree/master/ipproxytool/spiders/validator)
100 | ##### Adding a validator for another website
101 | 1. Create a new script in the validator directory and subclass Validator
102 | 2. Set name, timeout, urls and headers
103 | 3. Then call the init method; see [baidu](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/baidu.py) and [douban](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/douban.py) for examples
104 | 4. For particularly complex validation, see [assetstore](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/assetstore.py); a minimal sketch also follows this list
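
A minimal sketch of such a validator, following the structure of baidu.py (the file name, target URL and the expected-text check are placeholders):

```
# crawler/spiders/validator/example.py  (hypothetical)
from .validator import Validator


class ExampleValidatorSpider(Validator):
    name = 'example'

    def __init__(self, name=None, **kwargs):
        super(ExampleValidatorSpider, self).__init__(name, **kwargs)
        self.timeout = 5
        self.urls = ['https://www.example.com/']  # placeholder target URL
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.init()

    # optional: reject responses that are not the real page (e.g. a captcha),
    # as amazoncn.py and bbs.py do
    def success_content_parse(self, response):
        return 'expected text' in response.text  # placeholder check
```
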
105 | ##### Edit run_validator.py to import the new validator and add it to the validation queue
106 | You can run run_validator.py on its own to start validating proxy IPs:
107 |
108 | ```
109 | $ python run_validator.py
110 | ```
111 |
112 | ### Data server API for fetching proxy IPs
113 | Change the server port with the data_port setting in config.py (default 8000)
114 | Start the server:
115 |
116 | ```
117 | $ python run_server.py
118 | ```
119 |
120 | The server provides the following endpoints
121 | #### Get
122 |
123 |
124 | Parameters
125 |
126 | | Name | Type | Description | Required |
127 | | ---- | ---- | ---- | ---- |
128 | | name | str | table name | yes |
129 | | anonymity | int | 1: elite (high anonymity), 2: anonymous, 3: transparent | no |
130 | | https | str | https: yes, http: no | no |
131 | | order | str | a column of the table to order by | no |
132 | | sort | str | asc (ascending) or desc (descending) | no |
133 | | count | int | number of proxies to return, default 100 | no |
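
For example, fetching proxies from the local server with requests (this mirrors how gatherproxy.py pulls proxies from the API; the exact fields of each record depend on the table schema in db.sql):

```
import requests

# assumes run_server.py is listening on the default data_port (8000)
r = requests.get('http://127.0.0.1:8000/',
                 params={'name': 'httpbin', 'https': 'yes', 'count': 10})
for item in r.json():
    print(item.get('ip'), item.get('port'))
```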
134 |
135 |
136 |
137 |
138 | #### Delete
139 |
140 |
141 | Parameters
142 |
143 | | Name | Type | Description | Required |
144 | | ---- | ---- | ---- | ---- |
145 | | name | str | table name | yes |
146 | | ip | str | the IP to delete | yes |
147 |
148 | #### Insert
149 |
150 |
151 | Parameters
152 |
153 | | Name | Type | Description | Required |
154 | | ---- | ---- | ---- | ---- |
155 | | name | str | table name | yes |
156 | | ip | str | IP address | yes |
157 | | port | str | port | yes |
158 | | country | str | country | no |
159 | | anonymity | int | 1: elite, 2: anonymous, 3: transparent | no |
160 | | https | str | yes: https, no: http | no |
161 | | speed | float | response time (seconds) | no |
162 | | source | str | where the IP was crawled from | no |
163 |
164 |
165 | ## TODO
166 | * Add support for more databases
167 | 	* mysql
168 | 	* redis TODO...
169 | 	* sqlite TODO...
170 | * Crawl more free-proxy websites. The currently supported sites are the spiders under crawler/spiders/proxy (data5u, freeproxylists, gatherproxy, hidemy, ip181, kuaidaili, proxydb, proxylistplus, 66ip, usproxy, xicidaili); some of the foreign sites have unstable connections
181 | * Distributed deployment
182 | * ~~More filter options for the server API~~
183 | * ~~Multi-process proxy validation~~
184 | * ~~HTTPS support~~
185 | * ~~Anonymity detection for proxy IPs~~
186 |
187 |
188 | ## References
189 | * [IPProxyPool](https://github.com/qiyeboy/IPProxyPool)
190 |
191 |
192 | ## Changelog
193 | -----------------------------2020-12-29----------------------------
194 | 1. Fixed the previously incorrect path names
195 | 2. Updated the MySQL table schema
196 |
197 | -----------------------------2017-6-23----------------------------
198 | 1. python2 -> python3
199 | 2. web.py -> flask
200 |
201 | -----------------------------2017-5-17----------------------------
202 | 1. Added Docker support on top of the existing setup; see the instructions below. For more on Docker, see the official site: http://www.docker.com
203 |
204 | -----------------------------2017-3-30----------------------------
205 | 1. Polished the readme
206 | 2. Database inserts now use transactions
207 |
208 | -----------------------------2017-3-14----------------------------
209 | 1. Changed the server API and added sort options
210 | 2. Added multi-process validation of proxy IPs
211 |
212 | -----------------------------2017-2-20----------------------------
213 | 1. Added more filter options to the server API
214 |
215 |
216 | -----------------------------2017-2-16----------------------------
217 | 1. Detect proxy IP anonymity
218 | 2. Detect proxy IP HTTPS support
219 | 3. Added a concurrency setting for httpbin validation, default 4
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 | ## With Docker installed on your system, you can run this project as follows:
233 |
234 | Download the project:
235 | ```
236 | git clone https://github.com/awolfly9/IPProxyTool
237 | ```
238 |
239 | Then enter the directory:
240 | ```
241 | cd IPProxyTool
242 | ```
243 |
244 | Build the image:
245 | ```
246 | docker build -t proxy .
247 | ```
248 |
249 | Run the container:
250 | ```
251 | docker run -it proxy
252 | ```
253 |
254 | ## Edit the settings in config.py to suit your own needs
255 | ```
256 | DB_config = {
257 |     'db_type': 'mysql',
258 |     'mysql': {
259 |         'host': 'localhost', 'port': 3306, 'charset': 'utf8',
260 |         'user': 'root', 'password': 'root',
261 |     },
262 | }
263 | ```
264 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | DB_config = {
4 | # 'db_type': 'mongodb',
5 | 'db_type': 'mysql',
6 |
7 | 'mysql': {
8 | 'host': 'localhost',
9 | 'port': 3306,
10 | 'user': 'root',
11 | 'password': '123456',
12 | 'charset': 'utf8',
13 | },
14 | 'redis': {
15 | 'host': 'localhost',
16 | 'port': 6379,
17 | 'password': '123456',
18 | 'db': 1,
19 | },
20 | 'mongodb':{
21 | 'host': 'localhost',
22 | 'port': 27017,
23 | 'username': '',
24 | 'password': '',
25 | }
26 | }
27 |
28 | database = 'ipproxy'
29 | free_ipproxy_table = 'free_ipproxy'
30 | httpbin_table = 'httpbin'
31 |
32 | data_port = 8000
33 |
--------------------------------------------------------------------------------
/crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awolfly9/IPProxyTool/4e4e3aadd30a75f74393b54e8077568b6a58a813/crawler/__init__.py
--------------------------------------------------------------------------------
/crawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class CrawlerItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/crawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class CrawlerPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/crawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for crawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'crawler'
13 |
14 | SPIDER_MODULES = ['crawler.spiders','crawler.spiders.proxy']
15 | NEWSPIDER_MODULE = 'crawler.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | #CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 0.5
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'crawler.middlewares.MyCustomSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
54 | #DOWNLOADER_MIDDLEWARES = {
55 | # 'crawler.middlewares.MyCustomDownloaderMiddleware': 543,
56 | #}
57 |
58 | # Enable or disable extensions
59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
60 | EXTENSIONS = {
61 | 'scrapy.extensions.telnet.TelnetConsole': None,
62 | }
63 |
64 | # Configure item pipelines
65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
66 | #ITEM_PIPELINES = {
67 | # 'crawler.pipelines.SomePipeline': 300,
68 | #}
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
91 | # RETRY_ENABLED = False
92 |
93 | LOG_ENABLED = True
94 |
--------------------------------------------------------------------------------
/crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/basespider.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import sys
4 | import config
5 | import utils
6 | import datetime
7 |
8 | from scrapy.spiders import Spider
9 | from scrapy.http import Request
10 | from sql import SqlManager
11 |
12 |
13 | class BaseSpider(Spider):
14 | name = 'basespider'
15 |
16 | def __init__(self, *a, **kw):
17 | super(BaseSpider, self).__init__(*a, **kw)
18 |
19 | self.urls = []
20 | self.headers = {}
21 | self.timeout = 10
22 | self.is_record_web_page = True
23 |
24 | self.sql = SqlManager()
25 |
26 | def init(self):
27 | self.meta = {
28 | 'download_timeout': self.timeout,
29 | }
30 |
31 | self.dir_log = 'log/proxy/%s' % self.name
32 | utils.make_dir(self.dir_log)
33 | self.sql.init_proxy_table(config.free_ipproxy_table)
34 |
35 | def start_requests(self):
36 | for i, url in enumerate(self.urls):
37 | yield Request(
38 | url=url,
39 | headers=self.headers,
40 | meta=self.meta,
41 | dont_filter=True,
42 | callback=self.parse_page,
43 | errback=self.error_parse,
44 | )
45 |
46 | def parse_page(self, response):
47 | self.write(response.body)
48 | pass
49 |
50 | def error_parse(self, failure):
51 | request = failure.request
52 | pass
53 |
54 | def add_proxy(self, proxy):
55 | self.sql.insert_proxy(config.free_ipproxy_table, proxy)
56 |
57 | def write(self, data):
58 | if self.is_record_web_page:
59 | with open('%s/%s.html' % (self.dir_log, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f')),
60 | 'wb') as f:
61 | f.write(data)
62 | f.close()
63 |
64 | def close(spider, reason):
65 | spider.sql.commit()
66 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/data5u.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy import Selector
4 | from .basespider import BaseSpider
5 | from proxy import Proxy
6 |
7 |
8 | class Data5uSpider(BaseSpider):
9 | name = 'data5u'
10 |
11 | def __init__(self, *a, **kw):
12 |         # When a subclass redefines a method, it overrides the parent's method of the same name.
13 |         # To reuse the parent's behaviour as well, call the parent implementation via super(), as below:
14 | super(Data5uSpider, self).__init__(*a, **kw)
15 |
16 | self.urls = [
17 | 'http://www.data5u.com/'
18 | ]
19 | self.headers = {
20 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
21 | # 'Accept-Encoding': 'gzip, deflate, sdch',
22 | # 'Accept-Language': 'zh-CN,zh;q=0.8',
23 | # 'Connection': 'keep-alive',
24 | 'Host': 'www.data5u.com',
25 | 'Upgrade-Insecure-Requests': 1,
26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
27 | }
28 |
29 | self.init()
30 |
31 | def parse_page(self, response):
32 | self.write(response.body)
33 |
34 | sel = Selector(response)
35 | infos = sel.xpath('//ul[@class="l2"]').extract()
36 | for i, info in enumerate(infos):
37 | val = Selector(text=info)
38 | ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first()
39 | port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first()
40 | anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first()
41 | https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first()
42 | country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first()
43 |
44 | proxy = Proxy()
45 | proxy.set_value(
46 | ip=ip,
47 | port=port,
48 | country=country,
49 | anonymity=anonymity,
50 | source=self.name,
51 | )
52 | self.add_proxy(proxy=proxy)
53 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/freeproxylists.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import urllib
4 | import re
5 |
6 | from proxy import Proxy
7 | from .basespider import BaseSpider
8 | from bs4 import BeautifulSoup
9 |
10 |
11 | class FreeProxyListsSpider(BaseSpider):
12 | name = 'freeproxylists'
13 |
14 | def __init__(self, *a, **kwargs):
15 | super(FreeProxyListsSpider, self).__init__(*a, **kwargs)
16 | self.urls = [
17 | 'http://www.freeproxylists.net/'
18 | ]
19 | self.headers = {
20 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
21 | 'Accept-Encoding': 'gzip, deflate',
22 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
23 | 'Connection': 'keep-alive',
24 | 'Host': 'www.freeproxylists.net',
25 | 'Upgrade-Insecure-Requests': '1',
26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
27 | }
28 |
29 | self.init()
30 |
31 | def parse_page(self, response):
32 | pattern = re.compile('', re.S)
33 | items = re.findall(pattern = pattern, string = response.body)
34 | for i, item in enumerate(items):
35 | if i > 0:
36 | if 'async' in item:
37 | continue
38 |
39 | ip_pattern = re.compile('IPDecode\(\"(.*?)\"\)', re.S)
40 | ip_decode = re.findall(ip_pattern, item)[0]
41 | ip_url = urllib.unquote(ip_decode)
42 | ip_soup = BeautifulSoup(ip_url, 'lxml')
43 | ip = ip_soup.text.encode()
44 |
45 | item = '
'
46 | soup = BeautifulSoup(item, 'lxml')
47 | tbodys = soup.find_all('td')
48 |
49 | proxy = Proxy()
50 | proxy.set_value(
51 | ip = ip,
52 | port = tbodys[1].text.encode(),
53 | country = tbodys[4].text.encode(),
54 | anonymity = tbodys[3].text.encode(),
55 | source = self.name,
56 | )
57 |
58 | self.add_proxy(proxy = proxy)
59 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/gatherproxy.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import json
4 | import random
5 | import re
6 | import requests
7 |
8 | from proxy import Proxy
9 | from .basespider import BaseSpider
10 |
11 |
12 | class GatherproxySpider(BaseSpider):
13 | name = 'gatherproxy'
14 |
15 | def __init__(self, *a, **kwargs):
16 | super(GatherproxySpider, self).__init__(*a, **kwargs)
17 | self.urls = [
18 | 'http://gatherproxy.com/',
19 | 'http://www.gatherproxy.com/proxylist/anonymity/?t=Anonymous',
20 | 'http://gatherproxy.com/proxylist/country/?c=China',
21 | ]
22 |
23 | self.headers = {
24 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
25 | 'Accept-Encoding': 'gzip, deflate',
26 | 'Accept-Language': 'en-US,en;q=0.5',
27 | 'Connection': 'keep-alive',
28 | 'Host': 'www.gatherproxy.com',
29 | 'Upgrade-Insecure-Requests': '1',
30 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0'
31 | }
32 |
33 | # self.proxies = self.get_proxy()
34 | self.init()
35 |
36 | def parse_page(self, response):
37 | pattern = re.compile('gp.insertPrx\((.*?)\)', re.S)
38 | items = re.findall(pattern, response.body.decode())
39 | for item in items:
40 | data = json.loads(item)
41 |             # the port value is hex-encoded
42 | port = data.get('PROXY_PORT')
43 | port = str(int(port, 16))
44 |
45 | proxy = Proxy()
46 | proxy.set_value(
47 | ip = data.get('PROXY_IP'),
48 | port = port,
49 | country = data.get('PROXY_COUNTRY'),
50 | anonymity = data.get('PROXY_TYPE'),
51 | source = self.name,
52 | )
53 |
54 | self.add_proxy(proxy = proxy)
55 |
56 | def get_proxy(self):
57 | try:
58 | url = 'http://127.0.0.1:8000/?name={0}'.format(self.name)
59 | r = requests.get(url = url)
60 | if r.text != None and r.text != '':
61 | data = json.loads(r.text)
62 | if len(data) > 0:
63 | proxy = random.choice(data)
64 | ip = proxy.get('ip')
65 | port = proxy.get('port')
66 | address = '%s:%s' % (ip, port)
67 |
68 | proxies = {
69 | 'http': 'http://%s' % address
70 | }
71 | return proxies
72 | except:
73 | return None
74 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/hidemy.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | import utils
4 |
5 | from scrapy import Selector
6 | from .basespider import BaseSpider
7 | from proxy import Proxy
8 |
9 |
10 | class HidemySpider(BaseSpider):
11 | name = 'hidemy'
12 |
13 | def __init__(self, *a, **kw):
14 | super(HidemySpider, self).__init__(*a, **kw)
15 |
16 | self.urls = ['https://hidemy.name/en/proxy-list/?start=%s' % n for n in range(0, 5 * 64, 64)]
17 | self.headers = {
18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 | 'Accept-Encoding': 'gzip, deflate, br',
20 | 'Accept-Language': 'en-US,en;q=0.5',
21 | 'Connection': 'keep-alive',
22 | 'Host': 'hidemy.name',
23 | 'Referer': 'https://hidemy.name/en/proxy-list/?start=0',
24 | 'Upgrade-Insecure-Requests': '1',
25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
26 | }
27 |
28 | self.init()
29 |
30 | def parse_page(self, response):
31 | self.write(response.body)
32 |
33 | sel = Selector(response)
34 | infos = sel.xpath('//tbody/tr').extract()
35 | for i, info in enumerate(infos):
36 | if i == 0:
37 | continue
38 |
39 | val = Selector(text = info)
40 | ip = val.xpath('//td[1]/text()').extract_first()
41 | port = val.xpath('//td[2]/text()').extract_first()
42 | country = val.xpath('//td[3]/div/text()').extract_first()
43 | anonymity = val.xpath('//td[6]/text()').extract_first()
44 |
45 | proxy = Proxy()
46 | proxy.set_value(
47 | ip = ip,
48 | port = port,
49 | country = country,
50 | anonymity = anonymity,
51 | source = self.name,
52 | )
53 |
54 | self.add_proxy(proxy = proxy)
55 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/ip181.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from scrapy import Selector
4 | from .basespider import BaseSpider
5 | from proxy import Proxy
6 |
7 |
8 | class IpOneEightOneSpider(BaseSpider):
9 | name = 'ip181'
10 |
11 | def __init__(self, *a, **kw):
12 | super(IpOneEightOneSpider, self).__init__(*a, **kw)
13 |
14 | self.urls = ['http://www.ip181.com/']
15 | self.headers = {
16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 | 'Accept-Encoding': 'gzip, deflate',
18 | 'Accept-Language': 'en-US,en;q=0.5',
19 | 'Connection': 'keep-alive',
20 | 'Host': 'www.ip181.com',
21 | 'Upgrade-Insecure-Requests': '1',
22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
23 | }
24 |
25 | self.init()
26 |
27 | def parse_page(self, response):
28 | self.write(response.body)
29 |
30 | sel = Selector(response)
31 | infos = sel.xpath('//tbody/tr').extract()
32 | for i, info in enumerate(infos):
33 | if i == 0:
34 | continue
35 |
36 | val = Selector(text = info)
37 | ip = val.xpath('//td[1]/text()').extract_first()
38 | port = val.xpath('//td[2]/text()').extract_first()
39 | country = val.xpath('//td[6]/text()').extract_first()
40 | anonymity = val.xpath('//td[3]/text()').extract_first()
41 | https = val.xpath('//td[4]/text()').extract_first()
42 |
43 | proxy = Proxy()
44 | proxy.set_value(
45 | ip = ip,
46 | port = port,
47 | country = country,
48 | anonymity = anonymity,
49 | source = self.name,
50 | )
51 |
52 | self.add_proxy(proxy = proxy)
53 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/kuaidaili.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | import re
4 |
5 | from proxy import Proxy
6 | from .basespider import BaseSpider
7 |
8 |
9 | class KuaiDaiLiSpider(BaseSpider):
10 | name = 'kuaidaili'
11 |
12 | def __init__(self, *a, **kwargs):
13 | super(KuaiDaiLiSpider, self).__init__(*a, **kwargs)
14 |
15 | self.urls = ['https://www.kuaidaili.com/free/inha/%s/' % i for i in range(1, 5)]
16 |
17 | self.headers = {
18 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
19 | 'Accept-Encoding':'gzip, deflate',
20 | 'Accept-Language':'zh-CN,zh;q=0.9',
21 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
22 | }
23 |
24 | self.is_record_web_page = False
25 | self.init()
26 |
27 | def parse_page(self, response):
28 | pattern = re.compile(
29 | '
\s.*?(.*?)\s.*?(.*?)\s.*?(.*?)\s.*?('
30 | '.*?)\s.*?(.*?)\s.*?(.*?)\s.*?(.*?)\s.*?
',
31 | re.S)
32 | items = re.findall(pattern, response.body.decode())
33 |
34 | for item in items:
35 | proxy = Proxy()
36 | proxy.set_value(
37 | ip = item[0],
38 | port = item[1],
39 | country = item[4],
40 | anonymity = item[2],
41 | source = self.name,
42 | )
43 |
44 | self.add_proxy(proxy)
45 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/proxydb.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from proxy import Proxy
4 | from .basespider import BaseSpider
5 | from scrapy.selector import Selector
6 | import re
7 | from base64 import b64decode
8 |
9 | class ProxyDBSpider(BaseSpider):
10 | name = 'proxydb'
11 |
12 | def __init__(self, *a, **kwargs):
13 | super(ProxyDBSpider, self).__init__(*a, **kwargs)
14 | self.urls = ['http://proxydb.net/?protocol=http&protocol=https&offset=%s' % n for n in range(1, 500, 50)]
15 | self.headers = {
16 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
17 | 'Accept-Encoding':'gzip, deflate',
18 | 'Accept-Language':'zh-CN,zh;q=0.9',
19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
20 | }
21 |
22 | self.is_record_web_page = False
23 | self.init()
24 |
25 | def parse_page(self, response):
26 | super(ProxyDBSpider, self).parse_page(response)
27 | for table_item in response.xpath('//tbody/tr'):
28 | ip,port = self.parse_ip(table_item.xpath('.//td[1]/script/text()').extract_first())
29 | country = table_item.xpath('.//td/img/@title').extract_first().strip()
30 | anonymity = table_item.xpath('.//td/span/text()').extract_first().strip()
31 | proxy = Proxy()
32 | proxy.set_value(
33 | ip = ip,
34 | port = port,
35 | country = country,
36 | anonymity = anonymity,
37 | source = self.name
38 | )
39 | self.add_proxy(proxy = proxy)
40 |
41 | def parse_ip(self, page):
42 | ip_part1 = re.search(r'\'(.*)\'\.split',page).group(1)[::-1]
43 | ip_part2= ''.join([chr(int(x,16)) for x in re.findall(r'\\x([0-9A-Fa-f]{2})', page)])
44 | ip_part2= b64decode(ip_part2).decode('utf-8')
45 | port = re.search(r'pp = -(\d+) \+ (\d+);',page).groups()
46 | port = -int(port[0]) + int(port[1])
47 | return [''.join([ip_part1,ip_part2]),port]
48 |
49 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/proxylistplus.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from scrapy import Selector
4 | from .basespider import BaseSpider
5 | from proxy import Proxy
6 |
7 |
8 | class ProxylistplusSpider(BaseSpider):
9 | name = 'proxylistplus'
10 |
11 | def __init__(self, *a, **kw):
12 | super(ProxylistplusSpider, self).__init__(*a, **kw)
13 |
14 | self.urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-%s' % n for n in range(1, 3)]
15 | self.headers = {
16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 | 'Accept-Encoding': 'gzip, deflate, br',
18 | 'Accept-Language': 'en-US,en;q=0.5',
19 | 'Cache-Control': 'max-age=0',
20 | 'Connection': 'keep-alive',
21 | 'Host': 'list.proxylistplus.com',
22 | 'If-Modified-Since': 'Mon, 20 Feb 2017 07:47:35 GMT',
23 | 'If-None-Match': 'list381487576865',
24 | 'Referer': 'https://list.proxylistplus.com/Fresh-HTTP-Proxy',
25 | 'Upgrade-Insecure-Requests': '1',
26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
27 | }
28 |
29 | self.is_record_web_page = False
30 | self.init()
31 |
32 | def parse_page(self, response):
33 | self.write(response.body)
34 |
35 | sel = Selector(response)
36 | infos = sel.xpath('//tr[@class="cells"]').extract()
37 | for i, info in enumerate(infos):
38 | self.log(info)
39 | val = Selector(text = info)
40 |
41 | ip = val.xpath('//td[2]/text()').extract_first()
42 | port = val.xpath('//td[3]/text()').extract_first()
43 | country = val.xpath('//td[5]/text()').extract_first()
44 | anonymity = val.xpath('//td[4]/text()').extract_first()
45 |
46 | proxy = Proxy()
47 | proxy.set_value(
48 | ip = ip,
49 | port = port,
50 | country = country,
51 | anonymity = anonymity,
52 | source = self.name,
53 | )
54 |
55 | self.add_proxy(proxy = proxy)
56 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/sixsixip.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import re
4 |
5 | from proxy import Proxy
6 | from .basespider import BaseSpider
7 |
8 |
9 | class SixSixIpSpider(BaseSpider):
10 | name = 'sixsixip'
11 |
12 | def __init__(self, *a, **kwargs):
13 | super(SixSixIpSpider, self).__init__(*a, **kwargs)
14 |
15 | self.urls = ['http://m.66ip.cn/%s.html' % n for n in range(1, 10)]
16 | self.headers = {
17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
18 | 'Accept-Encoding': 'gzip, deflate',
19 | 'Accept-Language': 'en-US,en;q=0.5',
20 | 'Cache-Control': 'max-age=0',
21 | 'Connection': 'keep-alive',
22 | 'Host': 'm.66ip.cn',
23 | 'Upgrade-Insecure-Requests': '1',
24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
25 | }
26 |
27 | self.init()
28 |
29 | def parse_page(self, response):
30 | pattern = re.compile('(.*?) | (.*?) | (.*?) | (.*?) | (.*?) |
',
31 | re.S)
32 | items = re.findall(pattern, response.body.decode())
33 | for i, item in enumerate(items):
34 | if i >= 1:
35 | proxy = Proxy()
36 | proxy.set_value(
37 | ip = item[0],
38 | port = item[1],
39 | country = item[2],
40 | anonymity = item[3],
41 | source = self.name
42 | )
43 |
44 | self.add_proxy(proxy = proxy)
45 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/usproxy.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import re
4 |
5 | from proxy import Proxy
6 | from .basespider import BaseSpider
7 |
8 |
9 | class UsProxySpider(BaseSpider):
10 | name = 'usproxy'
11 |
12 | def __init__(self, *a, **kwargs):
13 | super(UsProxySpider, self).__init__(*a, **kwargs)
14 |
15 | self.urls = [
16 | 'http://www.sslproxies.org/',
17 | 'http://www.us-proxy.org/',
18 | 'http://free-proxy-list.net/uk-proxy.html',
19 | 'http://www.socks-proxy.net/',
20 | ]
21 | self.headers = {
22 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
23 | 'Accept-Encoding': 'gzip, deflate',
24 | 'Accept-Language': 'en-US,en;q=0.5',
25 | 'Cache-Control': 'max-age=0',
26 | 'Connection': 'keep-alive',
27 | 'Host': 'www.us-proxy.org',
28 | 'If-Modified-Since': 'Tue, 24 Jan 2017 03:32:01 GMT',
29 | 'Referer': 'http://www.sslproxies.org/',
30 | 'Upgrade-Insecure-Requests': '1',
31 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
32 | }
33 |
34 | self.init()
35 |
36 | def parse_page(self, response):
37 | pattern = re.compile(
38 | '(.*?) | (.*?) | (.*?) | (.*?)(.*?) | (.*?)(.*?)(.*?)
',
39 | re.S)
40 | items = re.findall(pattern, response.body.decode())
41 |
42 | if items is not None:
43 | for item in items:
44 | proxy = Proxy()
45 | proxy.set_value(
46 | ip = item[0],
47 | port = item[1],
48 | country = item[3],
49 | anonymity = item[4],
50 | source = self.name,
51 | )
52 |
53 | self.add_proxy(proxy)
54 |
--------------------------------------------------------------------------------
/crawler/spiders/proxy/xicidaili.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from proxy import Proxy
4 | from .basespider import BaseSpider
5 | from scrapy.selector import Selector
6 |
7 |
8 | class XiCiDaiLiSpider(BaseSpider):
9 | name = 'xici'
10 |
11 | def __init__(self, *a, **kw):
12 | super(XiCiDaiLiSpider, self).__init__(*a, **kw)
13 |
14 | self.urls = ['http://www.xicidaili.com/nn/%s' % n for n in range(1, 10)]
15 | self.headers = {
16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
17 | 'Accept-Encoding': 'gzip, deflate',
18 | 'Accept-Language': 'en-US,en;q=0.5',
19 | 'Cache-Control': 'max-age=0',
20 | 'Connection': 'keep-alive',
21 | 'Host': 'www.xicidaili.com',
22 | 'If-None-Match': 'W/"cb655e834a031d9237e3c33f3499bd34"',
23 | 'Upgrade-Insecure-Requests': '1',
24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
25 | }
26 |
27 | self.init()
28 |
29 | def parse_page(self, response):
30 | sel = Selector(text = response.body)
31 | infos = sel.xpath('//tr[@class="odd"]').extract()
32 | for info in infos:
33 | val = Selector(text = info)
34 | ip = val.xpath('//td[2]/text()').extract_first()
35 | port = val.xpath('//td[3]/text()').extract_first()
36 | country = val.xpath('//td[4]/a/text()').extract_first()
37 | anonymity = val.xpath('//td[5]/text()').extract_first()
38 |
39 | proxy = Proxy()
40 | proxy.set_value(
41 | ip = ip,
42 | port = port,
43 | country = country,
44 | anonymity = anonymity,
45 | source = self.name,
46 | )
47 |
48 | self.add_proxy(proxy = proxy)
49 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/amazoncn.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class AmazonCnSpider(Validator):
7 | name = 'amazoncn'
8 |
9 | def __init__(self, name = None, **kwargs):
10 | super(AmazonCnSpider, self).__init__(name, **kwargs)
11 |
12 | self.timeout = 5
13 |
14 | self.urls = [
15 | 'https://www.amazon.cn/dp/B00ID363S4',
16 | 'https://www.amazon.cn/gp/product/B01BDBJ71W',
17 | 'https://www.amazon.cn/gp/product/B06XBHPZNC',
18 | ]
19 |
20 | self.headers = {
21 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
22 | 'Accept-Encoding': 'gzip, deflate, br',
23 | 'Accept-Language': 'en-US,en;q=0.5',
24 | 'Connection': 'keep-alive',
25 | 'Host': 'www.amazon.cn',
26 | 'Upgrade-Insecure-Requests': '1',
27 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
28 | 'Firefox/50.0',
29 | }
30 |
31 | self.init()
32 |
33 | def success_content_parse(self, response):
34 | if 'Amazon CAPTCHA' in response.text:
35 | return False
36 | return True
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/anjuke.py:
--------------------------------------------------------------------------------
1 | # -*- coding=utf-8 -*-
2 |
3 | import datetime
4 | import json
5 | import random
6 | import re
7 | import time
8 | import config
9 |
10 | from scrapy import Request
11 | from scrapy.selector import Selector
12 | from crawler.spiders.validator.validator import Validator
13 |
14 |
15 | class AJKSpider(Validator):
16 | name = 'ajk'
17 |
18 | concurrent_requests = 16
19 |
20 | def __init__(self, name = None, **kwargs):
21 | super(AJKSpider, self).__init__(name, **kwargs)
22 |
23 | self.region_urls = [
24 | 'aolinpikegongyuan/',
25 | 'anzhen/',
26 | 'baiziwan/',
27 | 'beiyuan/',
28 | 'balizhuangb/',
29 | 'chaoyanggongyuandong/',
30 | 'chaowaidajie/',
31 | 'changying/',
32 | 'chaoyangjichang/',
33 | 'chaoqing/',
34 | 'chaoyanggongyuanxi/',
35 | 'dawanglu/',
36 | 'dongbaa/',
37 | 'dougezhuang/',
38 | 'dongdaqiao/',
39 | 'dingfuzhuang/',
40 | 'fatou/',
41 | 'gaobeidian/',
42 | 'guanzhuang/',
43 | 'guomao/',
44 | 'hepinglibei/',
45 | 'huaweiqiaocy/',
46 | 'jinsongdong/',
47 | 'jianzhanxiang/',
48 | 'jianxiangqiao/',
49 | 'jianguomenwai/',
50 | 'jiuxianqiao/',
51 | 'jinsongxi/',
52 | 'laiguangying/',
53 | 'liufang/',
54 | 'nanshatan/',
55 | 'panjiayuan/',
56 | 'shilihe/',
57 | 'sanlitun/',
58 | 'sihui/',
59 | 'shuangqiaoc/',
60 | 'shifoying/',
61 | 'shibalidian/',
62 | 'shaoyaoju/',
63 | 'shuangjing/',
64 | 'sanyuanqiao/',
65 | 'taiyanggong/',
66 | 'tuanjiehu/',
67 | 'wangjingxi/',
68 | 'wangjingdong/',
69 | 'xiaohongmen/',
70 | 'yayuncun/',
71 | 'chaoyang/',
72 | 'haidian/',
73 | 'dongchenga/',
74 | 'xicheng/',
75 | 'fengtai/',
76 | 'tongzhou/',
77 | 'shijingshan/',
78 | 'changping/',
79 | 'daxing/',
80 | 'shunyi/',
81 | 'fangshan/',
82 | 'mentougou/',
83 | 'miyun/',
84 | 'huairou/',
85 | 'pinggua/',
86 | 'yanqing/',
87 | 'beijingzhoubiana/',
88 | 'baishiqiao/',
89 | 'chedaogou/',
90 | 'dinghuisi/',
91 | 'erlizhuang/',
92 | 'gongzhufenxi/',
93 | 'ganjiakou/',
94 | 'gongzhufendong/',
95 | 'haidianbeibu/',
96 | 'junbo/',
97 | 'madians/',
98 | 'malianwa/',
99 | 'mudanyuan/',
100 | 'qinghe/',
101 | 'shijicheng/',
102 | 'sijiqing/',
103 | 'suzhouqiao/',
104 | 'shangdi/',
105 | 'shuangyushu/',
106 | 'tiancun/',
107 | 'wenquand/',
108 | 'wanquanhe/',
109 | 'wanshoulu/',
110 | 'wanliu/',
111 | 'wudaokou/',
112 | 'weigongcun/',
113 | 'xiangshandong/',
114 | 'xibeiwang/',
115 | 'xierqi/',
116 | 'xiangshangxi/',
117 | 'xiaoxitian/',
118 | 'xisanqi/',
119 | 'xueyuanlu/',
120 | 'yuquanlu/',
121 | 'yiheyuan/',
122 | 'yuanmingyuan/',
123 | 'zaojunmiao/',
124 | 'zizhuqiao/',
125 | 'zhichunlu/',
126 | 'zhongguancun/',
127 | 'andingmen/',
128 | 'chongwenmens/',
129 | 'chaoyangmennei/',
130 | 'dongzhimenwai/',
131 | 'donghuashis/',
132 | 'dongdan/',
133 | 'dongsia/',
134 | 'dongzhimennei/',
135 | 'dengshikou/',
136 | 'guangqumen/',
137 | 'hepinglianan/',
138 | 'jiaodaokou/',
139 | 'jianguomennei/',
140 | 'longtanhus/',
141 | 'qianmens/',
142 | 'tiantans/',
143 | 'wangfujing/',
144 | 'yongdingmens/',
145 | 'yonghegong/',
146 | 'baizhifangs/',
147 | 'baiyunluxc/',
148 | 'changchunjiexc/',
149 | 'chegongzhuanga/',
150 | 'deshengmen/',
151 | 'fuchengmen/',
152 | 'guanganmenwai/',
153 | 'guanganmennei/',
154 | 'guanyuan/',
155 | 'hepingmen/',
156 | 'jinrongjie/',
157 | 'liupukang/',
158 | 'maliandaos/',
159 | 'shichahai/',
160 | 'tianningshi/',
161 | 'taorantings/',
162 | 'xuanwumens/',
163 | 'xizhimenwai/',
164 | 'xisi/',
165 | 'xizhimen/',
166 | 'xinjiekou/',
167 | 'xidan/',
168 | 'yuetan/',
169 | 'beidadi/',
170 | 'caoqiao/',
171 | 'chengshousi/',
172 | 'caihuying/',
173 | 'dahongmen/',
174 | 'fangzhuang/',
175 | 'heyi/',
176 | 'jiaomen/',
177 | 'kandanqiao/',
178 | 'kejiyuanquft/',
179 | 'lizeqiao/',
180 | 'liuliqiaoxi/',
181 | 'lugouqiao/',
182 | 'liujiayao/',
183 | 'liuliqiaodong/',
184 | 'majiabao/',
185 | 'muxiyuan/',
186 | 'puhuangyu/',
187 | 'qilizhuang/',
188 | 'qingta/',
189 | 'songjiazhuang/',
190 | 'xinfadi/',
191 | 'xiluoyuan/',
192 | 'youanmenwai/',
193 | 'yuquanying/',
194 | 'beiguan/',
195 | 'guoyuan/',
196 | 'jiukeshu/',
197 | 'luyuan/',
198 | 'liyuan/',
199 | 'majuqiao/',
200 | 'qiaozhuang/',
201 | 'tuqiao/',
202 | 'tongzhouquqita/',
203 | 'tongzhoubeiyuan/',
204 | 'wuyihuayuan/',
205 | 'xinhuadajie/',
206 | 'bajiao/',
207 | 'gucheng/',
208 | 'laoshan/',
209 | 'lugu/',
210 | 'pingguoyuan/',
211 | 'shijingshana/',
212 | 'yuquanluxi/',
213 | 'yangzhuang/',
214 | 'baishanzhen/',
215 | 'beiqijia/',
216 | 'changpingquqita/',
217 | 'changpingxiancheng/',
218 | 'dongxiaokouzhen/',
219 | 'huoying/',
220 | 'huilongguan/',
221 | 'longze/',
222 | 'lishuiqiao/',
223 | 'nanshao/',
224 | 'nankou/',
225 | 'shahea/',
226 | 'tiantongyuan/',
227 | 'xingshouzhen/',
228 | 'xiaotangshanbei/',
229 | 'xiaotangshannan/',
230 | 'zhuxinzhuang/',
231 | 'daxingquqita/',
232 | 'guanyinsi/',
233 | 'gaomidian/',
234 | 'huangcun/',
235 | 'jiugong/',
236 | 'luchengxiang/',
237 | 'panggezhuang/',
238 | 'qingyundianzhen/',
239 | 'tiangongyuannan/',
240 | 'tiangongyuanbei/',
241 | 'xihongmen/',
242 | 'yinghaizhen/',
243 | 'yizhuang/',
244 | 'zaoyuans/',
245 | 'houshayu/',
246 | 'jichangfujin/',
247 | 'liqiao/',
248 | 'mapo/',
249 | 'shunyiquqita/',
250 | 'shunyicheng/',
251 | 'tianzhu/',
252 | 'yangzhen/',
253 | 'zhongyangbieshuqu/',
254 | 'changyang/',
255 | 'chengguanbj/',
256 | 'doudian/',
257 | 'fangshanquqita/',
258 | 'guandaozhen/',
259 | 'hancunhe/',
260 | 'liangxiang/',
261 | 'liulihe/',
262 | 'yancun/',
263 | 'yanshan/',
264 | 'binhexiqu/',
265 | 'chengzi/',
266 | 'dayu/',
267 | 'fengcun/',
268 | 'mentougouquqita/',
269 | 'shimenying/',
270 | 'yongdingzhen/',
271 | 'badaling/',
272 | 'dayushu/',
273 | 'kangzhuang/',
274 | 'yanqingquqita/',
275 | 'yanqingchengqu/',
276 | 'baodinga/',
277 | 'langfanga/',
278 | 'qinhuangdaoa/',
279 | 'tangshang/',
280 | 'weihaia/',
281 | 'yantaia/',
282 | 'yanjiao/',
283 | 'zhangjiakou/',
284 | ]
285 |
286 | self.price_urls = [
287 | 'zj5332/',
288 | 'zj297/',
289 | 'zj298/',
290 | 'zj299/',
291 | 'zj300/',
292 | 'zj301/',
293 | 'zj33/',
294 | 'zj5333/',
295 | 'zj5334/',
296 | 'zj5335/',
297 | 'zj5336/',
298 | ]
299 |
300 | self.init()
301 |
302 | def start_requests(self):
303 | count = self.sql.get_proxy_count(self.name)
304 | count_free = self.sql.get_proxy_count(config.httpbin_table)
305 |
306 | ids = self.sql.get_proxy_ids(self.name)
307 | ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
308 |
309 | for i in range(0, count + count_free):
310 | table = self.name if (i < count) else config.httpbin_table
311 | id = ids[i] if i < count else ids_httpbin[i - len(ids)]
312 |
313 | proxy = self.sql.get_proxy_with_id(table, id)
314 | if proxy == None:
315 | continue
316 |
317 | full_url = 'https://bj.zu.anjuke.com/fangyuan/{region}p{page}-{price}'.format(
318 | region = random.choice(self.region_urls), price = random.choice(self.price_urls), page = 1)
319 | cur_time = time.time()
320 | yield Request(
321 | url = full_url,
322 | headers = {
323 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
324 | },
325 | dont_filter = True,
326 | meta = {
327 | 'cur_time': cur_time,
328 | 'download_timeout': self.timeout,
329 | 'proxy_info': proxy,
330 | 'table': table,
331 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
332 | },
333 | callback = self.success_parse,
334 | errback = self.error_parse,
335 | )
336 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/assetstore.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | import json
4 | import time
5 | import config
6 |
7 | from scrapy.http import Request
8 | from .validator import Validator
9 |
10 |
11 | class AssetStoreSpider(Validator):
12 | name = 'assetstore'
13 |
14 | def __init__(self, *a, **kwargs):
15 | super(AssetStoreSpider, self).__init__(*a, **kwargs)
16 |
17 | self.timeout = 10
18 |
19 | self.init()
20 |
21 | def start_requests(self):
22 | url = 'https://www.assetstore.unity3d.com/login'
23 | yield Request(
24 | url = url,
25 | headers = {
26 | 'Accept': 'application/json',
27 | 'Accept-Encoding': 'gzip, deflate, br',
28 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
29 | 'Connection': 'keep-alive',
30 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
31 | 'Host': 'www.assetstore.unity3d.com',
32 | 'Referer': 'https://www.assetstore.unity3d.com/en/',
33 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
34 | 'Firefox/50.0',
35 | 'X-Kharma-Version': '0',
36 | 'X-Requested-With': 'UnityAssetStore',
37 | 'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
38 | },
39 | meta = {
40 | },
41 | dont_filter = True,
42 | callback = self.get_unity_version,
43 | errback = self.error_parse,
44 | )
45 |
46 | def get_unity_version(self, response):
47 | content = json.loads(response.body)
48 | self.log('unity content:%s' % response.body)
49 |
50 | unity_version = content.get('kharma_version', '')
51 |
52 | headers = {
53 | 'Accept': '*/*',
54 | 'Accept-Encoding': 'gzip, deflate, br',
55 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
56 | 'Connection': 'keep-alive',
57 | 'Host': 'www.assetstore.unity3d.com',
58 | 'Referer': 'https://www.assetstore.unity3d.com/en/',
59 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0',
60 | 'X-Kharma-Version': unity_version,
61 | 'X-Requested-With': 'UnityAssetStore',
62 | 'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41',
63 | }
64 |
65 | count = self.sql.get_proxy_count(self.name)
66 | count_free = self.sql.get_proxy_count(config.httpbin_table)
67 |
68 | ids = self.sql.get_proxy_ids(self.name)
69 | ids_free = self.sql.get_proxy_ids(config.httpbin_table)
70 |
71 | for i in range(0, count + count_free):
72 | table = self.name if (i < count) else config.httpbin_table
73 | id = ids[i] if i < count else ids_free[i - len(ids)]
74 |
75 | proxy = self.sql.get_proxy_with_id(table, id)
76 | if proxy == None:
77 | continue
78 |
79 | url = 'https://www.assetstore.unity3d.com/api/en-US/content/overview/' + '368' + '.json'
80 | cur_time = time.time()
81 | yield Request(
82 | url = url,
83 | headers = headers,
84 | meta = {
85 | 'cur_time': cur_time,
86 | 'download_timeout': self.timeout,
87 | 'proxy_info': proxy,
88 | 'table': table,
89 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
90 | },
91 | dont_filter = True,
92 | callback = self.success_parse,
93 | errback = self.error_parse,
94 | )
95 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/baidu.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class BaiduSpider(Validator):
7 | name = 'baidu'
8 |
9 | def __init__(self, name = None, **kwargs):
10 | super(BaiduSpider, self).__init__(name, **kwargs)
11 |
12 | self.urls = [
13 | 'https://www.baidu.com/'
14 | ]
15 |
16 | self.headers = {
17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
18 | 'Accept-Encoding': 'gzip, deflate, br',
19 | 'Accept-Language': 'en-US,en;q=0.5',
20 | 'Cache-Control': 'max-age=0',
21 | 'Connection': 'keep-alive',
22 | 'Host': 'www.baidu.com',
23 | 'Upgrade-Insecure-Requests': '1',
24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
25 | 'Firefox/50.0',
26 | }
27 |
28 | self.init()
29 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/bbs.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class BBSSpider(Validator):
7 | name = 'bbs'
8 | concurrent_requests = 8
9 |
10 | def __init__(self, name = None, **kwargs):
11 | super(BBSSpider, self).__init__(name, **kwargs)
12 |
13 | self.urls = [
14 | 'http://www.autohome.com.cn/beijing/',
15 | 'http://club.autohome.com.cn/bbs/thread-c-2098-64053713-1.html',
16 | 'http://club.autohome.com.cn/bbs/thread-c-2098-61435076-1.html',
17 | 'http://club.autohome.com.cn/bbs/threadqa-c-4034-63834038-1.html',
18 | 'http://club.autohome.com.cn/bbs/threadqa-c-4034-63083758-1.html',
19 | 'http://club.autohome.com.cn/bbs/threadqa-c-4044-64310067-1.html',
20 | 'http://club.autohome.com.cn/bbs/threadqa-c-4044-64328047-1.html',
21 | 'http://club.autohome.com.cn/bbs/thread-c-4044-63233315-1.html',
22 | 'http://club.autohome.com.cn/bbs/threadqa-c-4044-62349867-1.html',
23 | 'http://club.autohome.com.cn/bbs/thread-c-4034-63846295-1.html',
24 | ]
25 |
26 | self.headers = {
27 | 'Host': 'club.autohome.com.cn',
28 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
29 | 'Firefox/50.0',
30 | }
31 |
32 | self.is_record_web_page = False
33 | self.init()
34 |
35 | def success_content_parse(self, response):
36 | if 'conmain' in response.text:
37 | return True
38 | return False
39 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/boss.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class BossSpider(Validator):
7 | name = 'boss'
8 | concurrent_requests = 8
9 |
10 | def __init__(self, name = None, **kwargs):
11 | super(BossSpider, self).__init__(name, **kwargs)
12 |
13 | self.urls = [
14 | 'https://www.zhipin.com/c101010100/h_101010100/?query=java&page=1&ka=page-1'
15 | ]
16 |
17 | self.headers = {
18 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 | # 'Accept-Encoding': 'gzip, deflate, br',
20 | # 'Accept-Language': 'en-US,en;q=0.5',
21 | # 'Cache-Control': 'max-age=0',
22 | # 'Connection': 'keep-alive',
23 | # 'Upgrade-Insecure-Requests': '1',
24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
25 | 'Firefox/50.0',
26 | }
27 |
28 | self.is_record_web_page = False
29 | self.init()
30 |
31 | def success_content_parse(self, response):
32 | if '' in response.text:
33 | return True
34 | return False
--------------------------------------------------------------------------------
/crawler/spiders/validator/douban.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class DoubanSpider(Validator):
7 | name = 'douban'
8 |
9 | def __init__(self, name = None, **kwargs):
10 | super(DoubanSpider, self).__init__(name, **kwargs)
11 |
12 | self.timeout = 5
13 |
14 | self.urls = [
15 | 'https://movie.douban.com/subject/3434070/?from=subject-page'
16 | ]
17 |
18 | self.headers = {
19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
20 | 'Accept-Encoding': 'gzip, deflate, br',
21 | 'Accept-Language': 'en-US,en;q=0.5',
22 | 'Connection': 'keep-alive',
23 | 'Host': 'movie.douban.com',
24 | 'Upgrade-Insecure-Requests': '1',
25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
26 | 'Firefox/50.0',
27 | }
28 |
29 | self.init()
30 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/gather.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class GatherSpider(Validator):
7 | name = 'gather'
8 |
9 | def __init__(self, name = None, **kwargs):
10 | super(GatherSpider, self).__init__(name, **kwargs)
11 |
12 | self.timeout = 10
13 | self.urls = [
14 | 'http://gatherproxy.com/proxylist/anonymity/?t=Anonymous',
15 | 'http://gatherproxy.com/proxylist/country/?c=China'
16 | ]
17 |
18 | self.headers = {
19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
20 | 'Accept-Encoding': 'gzip, deflate',
21 | 'Accept-Language': 'en-US,en;q=0.5',
22 | 'Connection': 'keep-alive',
23 | 'Host': 'gatherproxy.com',
24 | 'Upgrade-Insecure-Requests': '1',
25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
26 | 'Firefox/50.0',
27 | }
28 |
29 | self.init()
30 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/httpbin.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import time
5 | import requests
6 | import config
7 |
8 | from scrapy import Request
9 | from .validator import Validator
10 |
11 |
12 | class HttpBinSpider(Validator):
13 | name = 'httpbin'
14 | concurrent_requests = 16
15 |
16 | def __init__(self, name=None, **kwargs):
17 | super(HttpBinSpider, self).__init__(name, **kwargs)
18 | self.timeout = 20
19 | self.urls = [
20 | 'http://httpbin.org/get?show_env=1',
21 | 'https://httpbin.org/get?show_env=1',
22 | ]
23 | self.headers = {
24 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
25 | "Accept-Encoding": "gzip, deflate, br",
26 | "Accept-Language": "en-US,en;q=0.5",
27 | "Host": "httpbin.org",
28 | "Upgrade-Insecure-Requests": "1",
29 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0"
30 | }
31 |
32 | self.origin_ip = ''
33 |
34 | self.init()
35 |
36 | def init(self):
37 | super(HttpBinSpider, self).init()
38 |
39 | r = requests.get(url=self.urls[0], timeout=20)
40 | data = json.loads(r.text)
41 | self.origin_ip = data.get('origin', '')
42 | self.log('origin ip:%s' % self.origin_ip)
43 |
44 | def start_requests(self):
45 | count = self.sql.get_proxy_count(self.name)
46 | count_free = self.sql.get_proxy_count(config.free_ipproxy_table)
47 |
48 | ids = self.sql.get_proxy_ids(self.name)
49 | ids_free = self.sql.get_proxy_ids(config.free_ipproxy_table)
50 |
51 | for i in range(0, count + count_free):
52 | table = self.name if (i < count) else config.free_ipproxy_table
53 | id = ids[i] if i < count else ids_free[i - len(ids)]
54 |
55 | proxy = self.sql.get_proxy_with_id(table, id)
56 |             if proxy is None:
57 | continue
58 |
59 | for url in self.urls:
60 | https = 'yes' if 'https' in url else 'no'
61 |
62 | yield Request(
63 | url=url,
64 | headers=self.headers,
65 | dont_filter=True,
66 | priority=0 if https == 'yes' else 10,
67 | meta={
68 | 'cur_time': time.time(),
69 | 'download_timeout': self.timeout,
70 | 'proxy_info': proxy,
71 | 'table': table,
72 | 'https': https,
73 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
74 | 'vali_count': proxy.vali_count,
75 | },
76 | callback=self.success_parse,
77 | errback=self.error_parse,
78 | )
79 |
80 | def success_parse(self, response):
81 | proxy = response.meta.get('proxy_info')
82 | table = response.meta.get('table')
83 | proxy.https = response.meta.get('https')
84 |
85 | self.save_page(proxy.ip, response.body)
86 |
87 | if self.success_content_parse(response):
88 | proxy.speed = time.time() - response.meta.get('cur_time')
89 | proxy.vali_count += 1
90 | self.log('proxy_info:%s' % (str(proxy)))
91 |
92 | if proxy.https == 'no':
93 | data = json.loads(response.body)
94 | origin = data.get('origin')
95 | headers = data.get('headers')
96 | x_forwarded_for = headers.get('X-Forwarded-For', None)
97 | x_real_ip = headers.get('X-Real-Ip', None)
98 | via = headers.get('Via', None)
99 |
100 | if self.origin_ip in origin:
101 | proxy.anonymity = 3
102 | elif via is not None:
103 | proxy.anonymity = 2
104 | elif x_forwarded_for is not None and x_real_ip is not None:
105 | proxy.anonymity = 1
106 |
107 | if table == self.name:
108 | if proxy.speed > self.timeout:
109 | self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
110 | else:
111 | self.sql.update_proxy(table_name=table, proxy=proxy)
112 | else:
113 | if proxy.speed < self.timeout:
114 | self.sql.insert_proxy(table_name=self.name, proxy=proxy)
115 | else:
116 | self.sql.update_proxy(table_name=table, proxy=proxy)
117 |
118 | self.sql.commit()
119 |
120 | def error_parse(self, failure):
121 | request = failure.request
122 | self.log('error_parse value:%s url:%s meta:%s' % (failure.value, request.url, request.meta))
123 | https = request.meta.get('https')
124 | if https == 'no':
125 | table = request.meta.get('table')
126 | proxy = request.meta.get('proxy_info')
127 |
128 | if table == self.name:
129 | self.sql.del_proxy_with_id(table_name=table, id=proxy.id)
130 | else:
131 |                 # TODO: when validation fails, handle specific error types differently
132 | pass
133 |
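
Note (editorial): the anonymity values assigned in success_parse above match the scale used by Proxy.get_anonymity_type (1 = elite, 2 = anonymous, 3 = transparent). The following standalone restatement of that check is a sketch for illustration only, assuming a response body in the httpbin.org /get?show_env=1 format; it is not part of the repository.

import json

def classify_anonymity(origin_ip, body):
    # origin_ip is our real public address as reported by a direct request
    data = json.loads(body)
    headers = data.get('headers', {})
    if origin_ip in data.get('origin', ''):
        return 3  # our real address leaks through, so the proxy is transparent
    if headers.get('Via') is not None:
        return 2  # the proxy announces itself with a Via header
    if headers.get('X-Forwarded-For') is not None and headers.get('X-Real-Ip') is not None:
        return 1
    return None   # no verdict; success_parse leaves the field unchanged in this case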
--------------------------------------------------------------------------------
/crawler/spiders/validator/jd.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | import random
4 | import time
5 | import re
6 | import config
7 |
8 | from scrapy import Request
9 | from .validator import Validator
10 |
11 |
12 | class JDSpider(Validator):
13 | name = 'jd'
14 |
15 | def __init__(self, name = None, **kwargs):
16 | super(JDSpider, self).__init__(name, **kwargs)
17 |
18 | self.urls = [
19 | 'https://item.jd.com/11478178241.html',
20 | 'https://item.jd.com/4142680.html',
21 | 'https://item.jd.com/3133859.html',
22 | 'https://item.jd.com/11349957411.html',
23 | 'https://item.jd.com/1231104.html',
24 | 'https://item.jd.com/11290644320.html',
25 | 'https://item.jd.com/3553539.html',
26 | 'https://item.jd.com/3553567.html',
27 | 'https://item.jd.com/4640524.html',
28 | 'https://item.jd.com/3652063.html',
29 | 'https://item.jd.com/2967929.html',
30 | 'https://item.jd.com/3367822.html',
31 | 'https://item.jd.com/1217500.html',
32 | ]
33 |
34 | self.headers = {
35 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
36 | 'Accept-Encoding': 'gzip, deflate, br',
37 | 'Accept-Language': 'en-US,en;q=0.5',
38 | 'Connection': 'keep-alive',
39 | 'Host': 'item.jd.com',
40 | 'Upgrade-Insecure-Requests': '1',
41 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0',
42 | }
43 |
44 | self.is_record_web_page = False
45 | self.init()
46 |
47 | def success_content_parse(self, response):
48 | if 'comments' in response.text:
49 | return True
50 | return False
51 |
52 | def start_requests(self):
53 | count = self.sql.get_proxy_count(self.name)
54 | count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
55 |
56 | ids = self.sql.get_proxy_ids(self.name)
57 | ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
58 |
59 | for i in range(0, count + count_httpbin):
60 | table = self.name if (i < count) else config.httpbin_table
61 | id = ids[i] if i < count else ids_httpbin[i - len(ids)]
62 |
63 | proxy = self.sql.get_proxy_with_id(table, id)
64 |             if proxy is None:
65 | continue
66 |
67 | url = random.choice(self.urls)
68 |             pattern = re.compile(r'\d+', re.S)
69 | product_id = re.search(pattern, url).group()
70 |
71 | cur_time = time.time()
72 | yield Request(
73 | url = url,
74 | headers = self.headers,
75 | meta = {
76 | 'cur_time': cur_time,
77 | 'download_timeout': self.timeout,
78 | 'proxy_info': proxy,
79 | 'table': table,
80 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
81 | 'product_id': product_id,
82 | },
83 | dont_filter = True,
84 | callback = self.get_comment_count,
85 | errback = self.error_parse,
86 | )
87 |
88 | def get_comment_count(self, response):
89 | name = response.xpath('//img[@id="spec-img"]/@alt').extract_first()
90 | self.log('name:%s time:%s' % (name, time.time() - response.meta.get('cur_time')))
91 |
92 |         pattern = re.compile(r'commentVersion:\'(\d+)\'', re.S)
93 | comment_version = re.search(pattern, response.text).group(1)
94 |
95 |         # sort type  5: sorted by recommendation  6: sorted by time
96 | url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv' \
97 | '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page=0&pageSize=10' \
98 | '&isShadowSku=0'. \
99 | format(product_id = response.meta.get('product_id'), comment_version = comment_version, sort_type = '6')
100 |
101 | cur_time = time.time()
102 | yield Request(
103 | url = url,
104 | headers = {
105 | 'Accept': '*/*',
106 | 'Accept-Encoding': 'gzip, deflate, br',
107 | 'Accept-Language': 'en-US,en;q=0.5',
108 | 'Connection': 'keep-alive',
109 | 'Host': 'club.jd.com',
110 | 'Referer': 'https://item.jd.com/%s.html' % response.meta.get('product_id'),
111 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 '
112 | 'Firefox/52.0',
113 | },
114 | method = 'GET',
115 | meta = {
116 | 'proxy': response.meta.get('proxy'),
117 | 'cur_time': cur_time,
118 | 'download_timeout': self.timeout,
119 | 'proxy_info': response.meta.get('proxy_info'),
120 | 'table': response.meta.get('table'),
121 | },
122 | dont_filter = True,
123 | callback = self.success_parse,
124 | errback = self.error_parse
125 | )
126 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/lagou.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | import time
4 | import config
5 | import utils
6 |
7 | from .validator import Validator
8 | from scrapy.http import FormRequest
9 |
10 |
11 | class LagouSpider(Validator):
12 | name = 'lagou'
13 | concurrent_requests = 8
14 |
15 | def __init__(self, name = None, **kwargs):
16 | super(LagouSpider, self).__init__(name, **kwargs)
17 |
18 | self.urls = [
19 | 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false'
20 | ]
21 |
22 | self.headers = {
23 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
24 | # 'Accept-Encoding': 'gzip, deflate, br',
25 | # 'Accept-Language': 'en-US,en;q=0.5',
26 | # 'Cache-Control': 'max-age=0',
27 | # 'Connection': 'keep-alive',
28 | # 'Upgrade-Insecure-Requests': '1',
29 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
30 | 'Firefox/50.0',
31 | }
32 |
33 | self.is_record_web_page = True
34 | self.init()
35 |
36 | def success_content_parse(self, response):
37 | if 'success' in response.text:
38 | return True
39 | return False
40 |
41 | def start_requests(self):
42 | count = self.sql.get_proxy_count(self.name)
43 | count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
44 |
45 | ids = self.sql.get_proxy_ids(self.name)
46 | ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
47 |
48 | for i in range(0, count + count_httpbin):
49 | table = self.name if (i < count) else config.httpbin_table
50 | id = ids[i] if i < count else ids_httpbin[i - len(ids)]
51 |
52 | proxy = self.sql.get_proxy_with_id(table, id)
53 |             if proxy is None:
54 | continue
55 |
56 | for url in self.urls:
57 | cur_time = time.time()
58 | yield FormRequest(
59 | url = url,
60 | headers = self.headers,
61 | method = 'POST',
62 | meta = {
63 | 'cur_time': cur_time,
64 | 'download_timeout': self.timeout,
65 | 'proxy_info': proxy,
66 | 'table': table,
67 | 'id': proxy.id,
68 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
69 | 'vali_count': proxy.vali_count,
70 | },
71 | cookies = {
72 | 'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030',
73 | '_ga': 'GA1.2.40497390.1488937014',
74 | 'TG-TRACK-CODE': 'search_code',
75 | 'index_location_city': '%E5%8C%97%E4%BA%AC',
76 | 'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce',
77 | 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488881288,1488936799,1488936947,1488937014',
78 | 'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586',
79 | 'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644',
80 | 'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644',
81 | 'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02',
82 | 'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa'
83 | },
84 | formdata = {
85 | 'first': 'true',
86 | 'kd': 'ios',
87 | 'pn': '1',
88 | },
89 | dont_filter = True,
90 | callback = self.success_parse,
91 | errback = self.error_parse,
92 | )
93 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/liepin.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class LiepinSpider(Validator):
7 | name = 'liepin'
8 | concurrent_requests = 8
9 |
10 | def __init__(self, name = None, **kwargs):
11 | super(LiepinSpider, self).__init__(name, **kwargs)
12 |
13 | self.urls = [
14 | 'https://www.liepin.com/zhaopin/?pubTime=&ckid=17c370b0a0111aa5&fromSearchBtn=2&compkind' \
15 | '=&isAnalysis=&init=-1&searchType=1&dqs=%s&industryType=&jobKind=&sortFlag=15&industries=&salary'
16 | '=&compscale=&clean_condition=&key=%s&headckid=49963e122c30b827&curPage=%s' % ('010', 'ios', '1')
17 | ]
18 |
19 | self.headers = {
20 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
21 | # 'Accept-Encoding': 'gzip, deflate, br',
22 | # 'Accept-Language': 'en-US,en;q=0.5',
23 | # 'Cache-Control': 'max-age=0',
24 | # 'Connection': 'keep-alive',
25 | # 'Upgrade-Insecure-Requests': '1',
26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 '
27 | 'Firefox/50.0',
28 | }
29 |
30 | self.is_record_web_page = False
31 | self.init()
32 |
33 | def success_content_parse(self, response):
34 | if 'sojob-list' in response.text:
35 | return True
36 | return False
37 |
38 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/steam.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class SteamSpider(Validator):
7 | name = 'steam'
8 |
9 | def __init__(self, name = None, **kwargs):
10 | super(SteamSpider, self).__init__(name, **kwargs)
11 |
12 | self.timeout = 10
13 |
14 | self.urls = [
15 | 'http://store.steampowered.com/app/602580/'
16 | ]
17 |
18 | self.headers = {
19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
20 | 'Accept-Encoding': 'gzip, deflate',
21 | 'Accept-Language': 'en-US,en;q=0.5',
22 | 'Connection': 'keep-alive',
23 | 'Host': 'store.steampowered.com',
24 | 'Upgrade-Insecure-Requests': '1',
25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
26 | }
27 |
28 | self.is_record_web_page = False
29 |
30 | self.init()
31 |
--------------------------------------------------------------------------------
/crawler/spiders/validator/validator.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import random
3 | import time
4 | import datetime
5 | import utils
6 | import config
7 |
8 | from scrapy import Request
9 | from scrapy.spiders import Spider
10 | from sql import SqlManager
11 |
12 |
13 | class Validator(Spider):
14 | name = 'base'
15 | concurrent_requests = 16
16 | retry_enabled = False
17 |
18 | def __init__(self, name = None, **kwargs):
19 | super(Validator, self).__init__(name, **kwargs)
20 |
21 | self.urls = []
22 | self.headers = None
23 | self.timeout = 10
24 | self.success_status = [200]
25 | self.is_record_web_page = False
26 |
27 | self.sql = SqlManager()
28 |
29 | def init(self):
30 | self.dir_log = 'log/validator/%s' % self.name
31 | utils.make_dir(self.dir_log)
32 |
33 | self.sql.init_proxy_table(self.name)
34 |
35 | @classmethod
36 | def update_settings(cls, settings):
37 | settings.setdict(cls.custom_settings or {
38 | 'CONCURRENT_REQUESTS': cls.concurrent_requests,
39 | 'RETRY_ENABLED': cls.retry_enabled,
40 | },
41 | priority = 'spider')
42 |
43 | def start_requests(self):
44 | count = self.sql.get_proxy_count(self.name)
45 |         count_httpbin = self.sql.get_proxy_count(config.httpbin_table)
46 |
47 | ids = self.sql.get_proxy_ids(self.name)
48 | ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table)
49 |
50 |         for i in range(0, count + count_httpbin):
51 | table = self.name if (i < count) else config.httpbin_table
52 | id = ids[i] if i < count else ids_httpbin[i - len(ids)]
53 |
54 | proxy = self.sql.get_proxy_with_id(table, id)
55 |             if proxy is None:
56 | continue
57 |
58 | url = random.choice(self.urls)
59 | cur_time = time.time()
60 | yield Request(
61 | url = url,
62 | headers = self.headers,
63 | meta = {
64 | 'cur_time': cur_time,
65 | 'download_timeout': self.timeout,
66 | 'proxy_info': proxy,
67 | 'table': table,
68 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port),
69 | },
70 | dont_filter = True,
71 | callback = self.success_parse,
72 | errback = self.error_parse,
73 | )
74 |
75 | def success_parse(self, response):
76 | proxy = response.meta.get('proxy_info')
77 | table = response.meta.get('table')
78 |
79 | self.save_page(proxy.ip, response.body)
80 | self.log('success_parse speed:%s meta:%s' % (time.time() - response.meta.get('cur_time'), response.meta))
81 |
82 | proxy.vali_count += 1
83 | proxy.speed = time.time() - response.meta.get('cur_time')
84 | if self.success_content_parse(response):
85 | if table == self.name:
86 | if proxy.speed > self.timeout:
87 | self.sql.del_proxy_with_id(table, proxy.id)
88 | else:
89 | self.sql.update_proxy(table, proxy)
90 | else:
91 | if proxy.speed < self.timeout:
92 | self.sql.insert_proxy(table_name = self.name, proxy = proxy)
93 | else:
94 | if table == self.name:
95 | self.sql.del_proxy_with_id(table_name = table, id = proxy.id)
96 |
97 | self.sql.commit()
98 |
99 | def success_content_parse(self, response):
100 | if response.status not in self.success_status:
101 | return False
102 | return True
103 |
104 | def error_parse(self, failure):
105 | request = failure.request
106 | self.log('error_parse value:%s url:%s meta:%s' % (failure.value, request.url, request.meta))
107 |
108 | proxy = failure.request.meta.get('proxy_info')
109 | table = failure.request.meta.get('table')
110 |
111 | if table == self.name:
112 | self.sql.del_proxy_with_id(table_name = table, id = proxy.id)
113 | else:
114 |             # TODO: when validation fails, handle specific error types differently
115 | pass
116 |
117 | #
118 | # request = failure.request.meta
119 | # utils.log('request meta:%s' % str(request))
120 | #
121 | # # log all errback failures,
122 | # # in case you want to do something special for some errors,
123 | # # you may need the failure's type
124 | # self.logger.error(repr(failure))
125 | #
126 | # #if isinstance(failure.value, HttpError):
127 | # if failure.check(HttpError):
128 | # # you can get the response
129 | # response = failure.value.response
130 | # self.logger.error('HttpError on %s', response.url)
131 | #
132 | # #elif isinstance(failure.value, DNSLookupError):
133 | # elif failure.check(DNSLookupError):
134 | # # this is the original request
135 | # request = failure.request
136 | # self.logger.error('DNSLookupError on %s', request.url)
137 | #
138 | # #elif isinstance(failure.value, TimeoutError):
139 | # elif failure.check(TimeoutError):
140 | # request = failure.request
141 | # self.logger.error('TimeoutError on url:%s', request.url)
142 |
143 | def save_page(self, ip, data):
144 | filename = '{time} {ip}'.format(time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'), ip = ip)
145 |
146 | if self.is_record_web_page:
147 | with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f:
148 | f.write(data)
149 | f.close()
150 |
151 | def close(spider, reason):
152 | spider.sql.commit()
153 |
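
Note (editorial): the concrete validators in this directory all follow the same pattern built on this base class: subclass Validator, set urls/headers/timeout, optionally override success_content_parse, and call self.init(). A minimal sketch, not part of the repository (the target URL and marker string below are made up):

from .validator import Validator


class ExampleSpider(Validator):
    name = 'example'              # also used as the per-spider proxy table name
    concurrent_requests = 8

    def __init__(self, name=None, **kwargs):
        super(ExampleSpider, self).__init__(name, **kwargs)
        self.urls = ['https://example.com/']            # fetched through each candidate proxy
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        self.timeout = 10
        self.init()                                      # creates the log dir and the spider's table

    def success_content_parse(self, response):
        # keep the proxy only if the expected marker shows up in the page body
        return 'Example Domain' in response.text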
--------------------------------------------------------------------------------
/crawler/spiders/validator/zhilian.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | from .validator import Validator
4 |
5 |
6 | class ZhiLianSpider(Validator):
7 | name = 'zhilian'
8 | concurrent_requests = 8
9 |
10 | def __init__(self, name = None, **kwargs):
11 | super(ZhiLianSpider, self).__init__(name, **kwargs)
12 |
13 | self.urls = [
14 | 'http://www.zhaopin.com/'
15 | ]
16 |
17 | self.headers = {
18 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
19 | # 'Accept-Encoding': 'gzip, deflate, br',
20 | # 'Accept-Language': 'en-US,en;q=0.5',
21 | # 'Cache-Control': 'max-age=0',
22 | # 'Connection': 'keep-alive',
23 | # 'Upgrade-Insecure-Requests': '1',
24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36',
25 | }
26 |
27 | self.is_record_web_page = False
28 | self.init()
29 |
30 | def success_content_parse(self, response):
31 |         if '' in response.text:  # '' is in every string, so any page that downloads counts as a success
32 | return True
33 | return False
34 |
--------------------------------------------------------------------------------
/db.sql:
--------------------------------------------------------------------------------
1 | -- MySQL dump 10.13 Distrib 5.5.58, for Linux (x86_64)
2 | --
3 | -- Host: localhost Database: ipproxy
4 | -- ------------------------------------------------------
5 | -- Server version 5.5.58
6 |
7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
10 | /*!40101 SET NAMES utf8 */;
11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
12 | /*!40103 SET TIME_ZONE='+00:00' */;
13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
17 |
18 | --
19 | -- Table structure for table `free_ipproxy`
20 | --
21 |
22 | DROP TABLE IF EXISTS `free_ipproxy`;
23 | /*!40101 SET @saved_cs_client = @@character_set_client */;
24 | /*!40101 SET character_set_client = utf8 */;
25 | CREATE TABLE `free_ipproxy` (
26 | `id` int(8) NOT NULL AUTO_INCREMENT,
27 | `ip` char(25) NOT NULL,
28 | `port` int(4) NOT NULL,
29 | `country` text,
30 | `anonymity` int(2) DEFAULT NULL,
31 | `https` char(4) DEFAULT NULL,
32 | `speed` float DEFAULT NULL,
33 | `source` char(20) DEFAULT NULL,
34 | `save_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
35 | `vali_count` int(5) DEFAULT '0',
36 | PRIMARY KEY (`id`),
37 | UNIQUE KEY `proxy_field` (`ip`,`port`)
38 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
39 | /*!40101 SET character_set_client = @saved_cs_client */;
40 |
41 | --
42 | -- Dumping data for table `free_ipproxy`
43 | --
44 |
45 | LOCK TABLES `free_ipproxy` WRITE;
46 | /*!40000 ALTER TABLE `free_ipproxy` DISABLE KEYS */;
47 | /*!40000 ALTER TABLE `free_ipproxy` ENABLE KEYS */;
48 | UNLOCK TABLES;
49 |
50 | --
51 | -- Table structure for table `httpbin`
52 | --
53 |
54 | DROP TABLE IF EXISTS `httpbin`;
55 | /*!40101 SET @saved_cs_client = @@character_set_client */;
56 | /*!40101 SET character_set_client = utf8 */;
57 | CREATE TABLE `httpbin` (
58 | `id` int(8) NOT NULL AUTO_INCREMENT,
59 | `ip` char(25) NOT NULL,
60 | `port` int(4) NOT NULL,
61 | `country` text,
62 | `anonymity` int(2) DEFAULT NULL,
63 | `https` char(4) DEFAULT NULL,
64 | `speed` float DEFAULT NULL,
65 | `source` char(20) DEFAULT NULL,
66 | `save_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
67 | `vali_count` int(5) DEFAULT '0',
68 | PRIMARY KEY (`id`),
69 | UNIQUE KEY `proxy_field` (`ip`,`port`)
70 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
71 | /*!40101 SET character_set_client = @saved_cs_client */;
72 |
73 | --
74 | -- Dumping data for table `httpbin`
75 | --
76 |
77 | LOCK TABLES `httpbin` WRITE;
78 | /*!40000 ALTER TABLE `httpbin` DISABLE KEYS */;
79 | /*!40000 ALTER TABLE `httpbin` ENABLE KEYS */;
80 | UNLOCK TABLES;
81 |
82 | --
83 | -- Dumping routines for database 'ipproxy'
84 | --
85 | /*!50003 DROP PROCEDURE IF EXISTS `drop_iptables` */;
86 | /*!50003 SET @saved_cs_client = @@character_set_client */ ;
87 | /*!50003 SET @saved_cs_results = @@character_set_results */ ;
88 | /*!50003 SET @saved_col_connection = @@collation_connection */ ;
89 | /*!50003 SET character_set_client = utf8 */ ;
90 | /*!50003 SET character_set_results = utf8 */ ;
91 | /*!50003 SET collation_connection = utf8_general_ci */ ;
92 | /*!50003 SET @saved_sql_mode = @@sql_mode */ ;
93 | /*!50003 SET sql_mode = '' */ ;
94 | DELIMITER ;;
95 | CREATE DEFINER=`root`@`localhost` PROCEDURE `drop_iptables`()
96 | BEGIN
97 | DELETE FROM ipproxy.free_ipproxy;
98 | DELETE FROM ipproxy.httpbin;
99 | TRUNCATE TABLE ipproxy.free_ipproxy;
100 | TRUNCATE TABLE ipproxy.httpbin;
101 | END ;;
102 | DELIMITER ;
103 | /*!50003 SET sql_mode = @saved_sql_mode */ ;
104 | /*!50003 SET character_set_client = @saved_cs_client */ ;
105 | /*!50003 SET character_set_results = @saved_cs_results */ ;
106 | /*!50003 SET collation_connection = @saved_col_connection */ ;
107 | /*!50003 DROP PROCEDURE IF EXISTS `ip_transfer` */;
108 | /*!50003 SET @saved_cs_client = @@character_set_client */ ;
109 | /*!50003 SET @saved_cs_results = @@character_set_results */ ;
110 | /*!50003 SET @saved_col_connection = @@collation_connection */ ;
111 | /*!50003 SET character_set_client = utf8 */ ;
112 | /*!50003 SET character_set_results = utf8 */ ;
113 | /*!50003 SET collation_connection = utf8_general_ci */ ;
114 | /*!50003 SET @saved_sql_mode = @@sql_mode */ ;
115 | /*!50003 SET sql_mode = '' */ ;
116 | DELIMITER ;;
117 | CREATE DEFINER=`root`@`localhost` PROCEDURE `ip_transfer`(IN valid_id INT)
118 | BEGIN DECLARE cur_ip char(25); DECLARE cur_port int(4); SELECT ip,port INTO cur_ip,cur_port FROM free_ipproxy WHERE id = valid_id; DELETE FROM httpbin WHERE ip =cur_ip AND port = cur_port; INSERT INTO httpbin(ip,port,country,anonymity,https,speed,source) SELECT ip,port,country,anonymity,https,speed,source FROM free_ipproxy WHERE id = valid_id; DELETE FROM free_ipproxy where id = valid_id; END ;;
119 | DELIMITER ;
120 | /*!50003 SET sql_mode = @saved_sql_mode */ ;
121 | /*!50003 SET character_set_client = @saved_cs_client */ ;
122 | /*!50003 SET character_set_results = @saved_cs_results */ ;
123 | /*!50003 SET collation_connection = @saved_col_connection */ ;
124 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
125 |
126 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
127 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
128 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
129 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
130 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
131 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
132 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
133 |
134 | -- Dump completed on 2018-01-25 4:01:20
135 |
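
Note (editorial): besides the two proxy tables, the dump defines two helper routines. ip_transfer(valid_id) copies a row that has passed validation from free_ipproxy into httpbin and removes the original (this is what MySql.insert_valid_proxy calls), and drop_iptables() empties both tables.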
--------------------------------------------------------------------------------
/ipproxytool.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import logging
4 | import os
5 | import sys
6 | import subprocess
7 | import run_validator
8 | import run_validator_async
9 |
10 | if __name__ == '__main__':
11 |
12 |     # switch to the project directory
13 | os.chdir(sys.path[0])
14 |
15 | if not os.path.exists('log'):
16 | os.makedirs('log')
17 |
18 | logging.basicConfig(
19 | filename = 'log/ipproxy.log',
20 | format = '%(asctime)s: %(message)s',
21 | level = logging.DEBUG
22 | )
23 |
24 | subprocess.Popen(['python', 'run_crawl_proxy.py'])
25 | subprocess.Popen(['python', 'run_server.py'])
26 |
27 | if 'async' in sys.argv:
28 | run_validator_async.async_validator()
29 | else:
30 | run_validator.validator()
31 |
32 |
33 |
34 |
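
Note (editorial): this script is the all-in-one entry point. It spawns the crawl loop (run_crawl_proxy.py) and the HTTP data server (run_server.py) as subprocesses, then runs the validator in the foreground. Passing async on the command line (python ipproxytool.py async) switches to the aiohttp-based validator in run_validator_async.py; otherwise the Scrapy-based validators from run_validator.py are used.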
--------------------------------------------------------------------------------
/proxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | class Proxy(object):
5 | def __init__(self):
6 | self.id = 1
7 | self.ip = ''
8 | self.port = ''
9 | self.country = ''
10 | self.anonymity = ''
11 | self.https = ''
12 | self.speed = ''
13 | self.source = ''
14 | self.vali_count = 0
15 |
16 | def __str__(self):
17 | data = {
18 | 'ip': self.ip,
19 | 'port': self.port,
20 | 'country': self.country,
21 | 'anonymity': self.anonymity,
22 | 'https': self.https,
23 | 'speed': self.speed,
24 | 'source': self.source,
25 | 'vali_count': self.vali_count,
26 | }
27 |
28 | return str(data)
29 |
30 | def __dict__(self):
31 | data = {
32 | 'ip': self.ip,
33 | 'port': self.port,
34 | 'country': self.country,
35 | 'anonymity': self.anonymity,
36 | 'https': self.https,
37 | 'speed': self.speed,
38 | 'source': self.source,
39 | 'vali_count': self.vali_count,
40 | }
41 |
42 | return data
43 |
44 | def get_dict(self):
45 | data = {
46 | 'ip': self.ip,
47 | 'port': self.port,
48 | 'country': self.country,
49 | 'anonymity': self.anonymity,
50 | 'https': self.https,
51 | 'speed': self.speed,
52 | 'source': self.source,
53 | 'vali_count': self.vali_count,
54 | }
55 |
56 | return data
57 |
58 |     def set_value(self, ip, port, country, anonymity, source='unknown', https='no', speed=-1, vali_count=0):
59 | self.ip = ip
60 | self.port = port
61 | self.country = country
62 | self.anonymity = self.get_anonymity_type(anonymity)
63 | self.https = https
64 | self.speed = speed
65 | self.source = source
66 | self.vali_count = vali_count
67 |
68 | def get_anonymity_type(self, anonymity):
69 | '''There are 3 levels of proxies according to their anonymity.
70 |
71 | Level 1 - Elite Proxy / Highly Anonymous Proxy: The web server can't detect whether you are using a proxy.
72 | Level 2 - Anonymous Proxy: The web server can know you are using a proxy, but it can't know your real IP.
73 | Level 3 - Transparent Proxy: The web server can know you are using a proxy and it can also know your real
74 | IP.
75 | '''
76 |
77 | if anonymity == u'高匿代理' or anonymity == u'高匿名' or anonymity == 'elite proxy' or \
78 | anonymity == u'超级匿名' or anonymity == u'High':
79 | return '1'
80 | elif anonymity == u'匿名' or anonymity == 'anonymous' or anonymity == u'普通匿名' or anonymity == u'Medium':
81 | return '2'
82 | elif anonymity == u'透明' or anonymity == 'transparent' or anonymity == u'No':
83 | return '3'
84 | else:
85 | return '3'
86 |
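
A quick usage sketch (illustrative only, with made-up values) showing how a Proxy record is assembled and stored; the /insert endpoint in server/dataserver.py follows the same pattern before calling SqlManager.insert_proxy:

from proxy import Proxy

p = Proxy()
p.set_value(ip='1.2.3.4', port='8080', country='cn',
            anonymity='elite proxy',   # mapped to '1' by get_anonymity_type
            https='no', source='example')
print(p)             # __str__ renders the field dict
print(p.get_dict())  # plain dict, handy for JSON serialization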
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.7.4
2 | async-timeout==3.0.1
3 | attrs==20.3.0
4 | Automat==20.2.0
5 | beautifulsoup4==4.9.3
6 | bs4==0.0.1
7 | certifi==2020.12.5
8 | cffi==1.14.4
9 | chardet==3.0.4
10 | click==7.1.2
11 | constantly==15.1.0
12 | crochet==1.12.0
13 | cryptography==3.3.1
14 | cssselect==1.1.0
15 | Flask==1.1.2
16 | hyperlink==20.0.1
17 | idna==2.10
18 | incremental==17.5.0
19 | itemadapter==0.2.0
20 | itemloaders==1.0.4
21 | itsdangerous==1.1.0
22 | Jinja2==2.11.2
23 | jmespath==0.10.0
24 | logzero==1.6.3
25 | lxml==4.6.2
26 | MarkupSafe==1.1.1
27 | multidict==5.1.0
28 | parsel==1.6.0
29 | Protego==0.1.16
30 | pyasn1==0.4.8
31 | pyasn1-modules==0.2.8
32 | pycparser==2.20
33 | PyDispatcher==2.0.5
34 | Pygments==2.7.3
35 | PyHamcrest==2.0.2
36 | pymongo==3.11.2
37 | PyMySQL==0.10.1
38 | pyOpenSSL==20.0.1
39 | queuelib==1.5.0
40 | requests==2.25.1
41 | Scrapy==2.4.1
42 | scrapydo==0.2.2
43 | service-identity==18.1.0
44 | six==1.15.0
45 | soupsieve==2.1
46 | Twisted==20.3.0
47 | typing-extensions==3.7.4.3
48 | urllib3==1.26.2
49 | w3lib==1.22.0
50 | Werkzeug==1.0.1
51 | wrapt==1.12.1
52 | yarl==1.6.3
53 | zope.interface==5.2.0
54 |
--------------------------------------------------------------------------------
/run_crawl_proxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 | import os
5 | import sys
6 | import scrapydo
7 | import time
8 | import utils
9 | import config
10 |
11 | from sql import SqlManager
12 | from crawler.spiders.proxy.xicidaili import XiCiDaiLiSpider
13 | from crawler.spiders.proxy.sixsixip import SixSixIpSpider
14 | from crawler.spiders.proxy.ip181 import IpOneEightOneSpider
15 | from crawler.spiders.proxy.kuaidaili import KuaiDaiLiSpider
16 | from crawler.spiders.proxy.gatherproxy import GatherproxySpider
17 | from crawler.spiders.proxy.hidemy import HidemySpider
18 | from crawler.spiders.proxy.proxylistplus import ProxylistplusSpider
19 | from crawler.spiders.proxy.freeproxylists import FreeProxyListsSpider
20 | from crawler.spiders.proxy.usproxy import UsProxySpider
21 | from crawler.spiders.proxy.proxydb import ProxyDBSpider
22 | from crawler.spiders.proxy.data5u import Data5uSpider
23 |
24 |
25 | scrapydo.setup()
26 |
27 | if __name__ == '__main__':
28 | os.chdir(sys.path[0])
29 |
30 | if not os.path.exists('log'):
31 | os.makedirs('log')
32 |
33 | logging.basicConfig(
34 | filename = 'log/crawl_proxy.log',
35 | format = '%(levelname)s %(asctime)s: %(message)s',
36 | level = logging.DEBUG
37 | )
38 | sql = SqlManager()
39 |
40 | spiders = [
41 |         # XiCiDaiLiSpider,  # no longer available
42 |         SixSixIpSpider,
43 |         IpOneEightOneSpider,
44 |         KuaiDaiLiSpider,  # the site now runs a JS check before loading (anti-crawling)
45 |         GatherproxySpider,
46 |         # HidemySpider,  # no longer available
47 |         ProxylistplusSpider,
48 |         FreeProxyListsSpider,
49 |         # PeulandSpider,  # target site is gone
50 | UsProxySpider,
51 | ProxyDBSpider,
52 | Data5uSpider,
53 | ]
54 | while True:
55 | utils.log('*******************run spider start...*******************')
56 | #sql.delete_old(config.free_ipproxy_table, 0.5)
57 | try:
58 | for spider in spiders:
59 | scrapydo.run_spider(spider_cls = spider)
60 | except Exception as e:
61 |             utils.log('[Error]# spider went wrong. Return message: {}'.format(str(e)))
62 |
63 | utils.log('*******************run spider waiting...*******************')
64 | time.sleep(1200)
65 |
--------------------------------------------------------------------------------
/run_server.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import logging
5 | import config
6 | import utils
7 |
8 | from server import dataserver
9 |
10 | if __name__ == '__main__':
11 | if not os.path.exists('log'):
12 | os.makedirs('log')
13 |
14 | logging.basicConfig(
15 | filename='log/server.log',
16 | format='%(levelname)s %(asctime)s: %(message)s',
17 | level=logging.DEBUG
18 | )
19 |
20 | utils.kill_ports([config.data_port])
21 |
22 | dataserver.app.run(
23 | debug=False,
24 | host='127.0.0.1',
25 | port=config.data_port,
26 | )
27 |
--------------------------------------------------------------------------------
/run_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import os
4 | import logging
5 | import sys
6 |
7 | from scrapy.crawler import CrawlerProcess
8 | from scrapy.utils.log import configure_logging
9 | from scrapy.utils.project import get_project_settings
10 |
11 |
12 | def runspider(name):
13 | configure_logging(install_root_handler=False)
14 | logging.basicConfig(
15 | filename='log/%s.log' % name,
16 | format='%(levelname)s %(asctime)s: %(message)s',
17 | level=logging.DEBUG
18 | )
19 | process = CrawlerProcess(get_project_settings())
20 | try:
21 | logging.info('runspider start spider:%s' % name)
22 | process.crawl(name)
23 | process.start()
24 | except Exception as e:
25 | logging.exception('runspider spider:%s exception:%s' % (name, e))
26 |
27 | logging.debug('finish this spider:%s\n\n' % name)
28 |
29 |
30 | if __name__ == '__main__':
31 | try:
32 |         name = sys.argv[1] if len(sys.argv) > 1 else 'base'
33 | runspider(name)
34 | except Exception as e:
35 | logging.exception('run_spider main exception msg:%s' % e)
36 |
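
Note (editorial): the name passed on the command line is a Scrapy spider name attribute, so for example python run_spider.py httpbin runs HttpBinSpider; this is exactly how run_validator.py launches each validator in its own process.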
--------------------------------------------------------------------------------
/run_validator.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 | import os
5 | import subprocess
6 | import sys
7 | import time
8 | import scrapydo
9 | import utils
10 | from importlib import import_module
11 |
12 | VALIDATORS = {
13 | 'HttpBinSpider': 'crawler.spiders.validator.httpbin',
14 | # 'DoubanSpider':'ipproxytool.spiders.validator.douban',
15 | # 'AssetStoreSpider':'ipproxytool.spiders.validator.assetstore',
16 | # 'GatherSpider' :'ipproxytool.spiders.validator.gather',
17 | # 'HttpBinSpider' :'ipproxytool.spiders.validator.httpbin',
18 | # 'SteamSpider' :'ipproxytool.spiders.validator.steam',
19 | # 'BossSpider' :'ipproxytool.spiders.validator.boss',
20 | # 'LagouSpider' :'ipproxytool.spiders.validator.lagou',
21 | # 'LiepinSpider' :'ipproxytool.spiders.validator.liepin',
22 | # 'JDSpider' :'ipproxytool.spiders.validator.jd',
23 | # 'BBSSpider' :'ipproxytool.spiders.validator.bbs',
24 | # 'ZhiLianSpider' :'ipproxytool.spiders.validator.zhilian',
25 | # 'AmazonCnSpider' :'ipproxytool.spiders.validator.amazoncn',
26 | }
27 |
28 | scrapydo.setup()
29 |
30 |
31 | def validator():
32 | process_list = []
33 | for item, path in VALIDATORS.items():
34 | module = import_module(path)
35 | validator = getattr(module, item)
36 | popen = subprocess.Popen(['python', 'run_spider.py', validator.name], shell=False)
37 | data = {
38 | 'name': validator.name,
39 | 'popen': popen,
40 | }
41 | process_list.append(data)
42 |
43 | while True:
44 | time.sleep(60)
45 | for process in process_list:
46 | popen = process.get('popen', None)
47 | utils.log('name:%s poll:%s' % (process.get('name'), popen.poll()))
48 |
49 |             # check for finished processes and restart any spider that has exited
50 |             if popen is not None and popen.poll() == 0:
51 | name = process.get('name')
52 | utils.log('%(name)s spider finish...\n' % {'name': name})
53 | process_list.remove(process)
54 | p = subprocess.Popen(['python', 'run_spider.py', name], shell=False)
55 | data = {
56 | 'name': name,
57 | 'popen': p,
58 | }
59 | process_list.append(data)
60 | time.sleep(1)
61 | break
62 |
63 |
64 | if __name__ == '__main__':
65 | os.chdir(sys.path[0])
66 |
67 | if not os.path.exists('log'):
68 | os.makedirs('log')
69 |
70 | logging.basicConfig(
71 | filename='log/validator.log',
72 | format='%(asctime)s: %(message)s',
73 | level=logging.DEBUG
74 | )
75 |
76 | validator()
77 |
--------------------------------------------------------------------------------
/run_validator_async.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 | import os
5 | import sys
6 | import time
7 | import utils
8 | import aiohttp
9 | from aiohttp import ClientSession
10 | from sql.sql_manager import SqlManager
11 | import config
12 | import asyncio
13 |
14 | TEST_URL='http://httpbin.org/ip'
15 |
16 | async def test_connect(proxy,operator,mode=None):
17 | conn = aiohttp.TCPConnector(verify_ssl=False)
18 | async with ClientSession(connector=conn) as s:
19 | try:
20 | async with s.get(url=TEST_URL,proxy=proxy[2],
21 | timeout=10,allow_redirects=False) as resp:
22 | page = await resp.text()
23 | if (resp.status != 200 or str(resp.url) != TEST_URL):
24 | utils.log(('[INFO]#proxy:{ip} has been dropped\n'
25 | ' #Reason:Abnormal url or return Code').format(ip=proxy[1]))
26 | operator.del_proxy_with_id(config.free_ipproxy_table,proxy[0])
27 | operator.del_proxy_with_id(config.httpbin_table,proxy[0])
28 | elif mode == 'add':
29 | operator.insert_valid_proxy(id=proxy[0])
30 | else:
31 | operator.update_valid_proxy(id=proxy[0])
32 |
33 | except Exception as e:
34 | utils.log(('[INFO]#proxy:{ip} has been dropped\n'
35 | ' #Reason:{msg}').format(ip=proxy[1],msg=str(e)))
36 | operator.del_proxy_with_id(config.free_ipproxy_table,proxy[0])
37 | operator.del_proxy_with_id(config.httpbin_table,proxy[0])
38 | finally:
39 | operator.commit()
40 |
41 |
42 | def async_validator():
43 | utils.log('[INFO]#Loading ip proxies....60 sec left')
44 | time.sleep(60)
45 | proxy_factory = SqlManager()
46 | loop = asyncio.get_event_loop()
47 | def test_process(table_name,mode=None,limit=50):
48 | id_list = proxy_factory.get_proxy_ids(table_name)
49 | if len(id_list) > 0:
50 | task_len = len(id_list)
51 | cur_id = 0
52 | for sig in range(0,task_len,limit):
53 | proxies = proxy_factory.get_proxies_info(table_name=table_name,
54 | start_id=cur_id,
55 | limit=limit)
56 | if len(proxies) == 0:
57 | break
58 | cur_id = proxies[-1][0]
59 | proxies = [[proxy[0],proxy[1],'http://{}:{}'.format(proxy[1],proxy[2])] for proxy in proxies]
60 | tasks = [test_connect(proxy,proxy_factory,mode) for proxy in proxies]
61 | loop.run_until_complete(asyncio.wait(tasks))
62 | while True:
63 | utils.log('[INFO]Validator process started')
64 | utils.log('[INFO]Validator process:Verify mode start')
65 | test_process(config.httpbin_table)
66 | utils.log('[INFO]Validator process:Add mode start')
67 | test_process(config.free_ipproxy_table,mode='add')
68 | utils.log('[INFO]Validator process completed')
69 | time.sleep(300)
70 |
71 |
72 | if __name__ == '__main__':
73 | if not os.path.exists('log'):
74 | os.makedirs('log')
75 |
76 | logging.basicConfig(
77 | filename = 'log/validator.log',
78 | format = '%(asctime)s: %(message)s',
79 | level = logging.INFO
80 | )
81 | async_validator()
82 |
83 |
84 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = crawler.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = crawler
12 |
--------------------------------------------------------------------------------
/server/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 |
--------------------------------------------------------------------------------
/server/dataserver.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import json
4 | import logging
5 | import sys
6 | import config
7 |
8 | from proxy import Proxy
9 | from sql import SqlManager
10 | from flask import Flask
11 | from flask import request
12 |
13 | app = Flask(__name__)
14 |
15 |
16 | @app.route('/')
17 | def index():
18 | return 'Hello, World!'
19 |
20 |
21 | @app.route('/insert')
22 | def insert():
23 | sql = SqlManager()
24 | name = request.args.get('name')
25 | proxy = Proxy()
26 | proxy.set_value(
27 | ip=request.args.get('ip'),
28 | port=request.args.get('port'),
29 | country=request.args.get('country', None),
30 | anonymity=request.args.get('anonymity', None),
31 | https=request.args.get('https', 'no'),
32 | speed=request.args.get('speed', -1),
33 | source=request.args.get('source', name),
34 | )
35 |
36 | result = sql.insert_proxy(name, proxy)
37 | data = {
38 | 'result': result
39 | }
40 |
41 | return json.dumps(data, indent=4)
42 |
43 |
44 | @app.route('/select')
45 | def select():
46 | sql = SqlManager()
47 | name = request.args.get('name')
48 | anonymity = request.args.get('anonymity', '')
49 | https = request.args.get('https', '')
50 | order = request.args.get('order', 'speed')
51 | sort = request.args.get('sort', 'asc')
52 | count = request.args.get('count', 100)
53 |
54 | kwargs = {
55 | 'anonymity': anonymity,
56 | 'https': https,
57 | 'order': order,
58 | 'sort': sort,
59 | 'count': count
60 | }
61 | result = sql.select_proxy(name, **kwargs)
62 | data = [{
63 | 'ip': item.get('ip'), 'port': item.get('port'),
64 | 'anonymity': item.get('anonymity'), 'https': item.get('https'),
65 | 'speed': item.get('speed'), 'save_time': item.get('save_time', '')
66 | } for item in result]
67 | return json.dumps(data, indent=4)
68 |
69 |
70 | @app.route('/delete')
71 | def delete():
72 | sql = SqlManager()
73 | name = request.args.get('name')
74 | ip = request.args.get('ip')
75 | result = sql.del_proxy_with_ip(name, ip)
76 | data = {'result': result}
77 |
78 | return json.dumps(data, indent=4)
79 |
80 | @app.route('/query')
81 | def query():
82 | sql = SqlManager()
83 |     start_id = request.args.get('sid', '0')
84 | limit = int(request.args.get('limit','100'))
85 | proxies = sql.get_proxies_info(config.httpbin_table,start_id=start_id,limit=limit)
86 | data = [{'id':proxy[0],'ip':proxy[1],'port':proxy[2],'https':proxy[3]}
87 | for proxy in proxies]
88 | return json.dumps(data,indent=4)
89 |
90 |
91 |
92 |
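
A client-side sketch (not part of the repository) of how these endpoints can be called with requests. The server listens on config.data_port, whose value is defined in config.py and not shown here; the port below is only a placeholder:

import requests

BASE = 'http://127.0.0.1:8000'  # placeholder; use the real config.data_port

# up to 20 validated proxies from the httpbin table, HTTPS-capable, fastest first
resp = requests.get(BASE + '/select', params={
    'name': 'httpbin',
    'https': 'yes',
    'order': 'speed',
    'sort': 'asc',
    'count': 20,
})
print(resp.json())

# remove a dead proxy by ip
requests.get(BASE + '/delete', params={'name': 'httpbin', 'ip': '1.2.3.4'})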
--------------------------------------------------------------------------------
/sql/__init__.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | import sql.sql_base
4 | from sql.mysql import MySql
5 | from sql.mongodb import Mongodb
6 | from sql.sql_manager import SqlManager
--------------------------------------------------------------------------------
/sql/mongodb.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 | import pymongo
5 | import config
6 | import json
7 | import datetime
8 |
9 | from proxy import Proxy
10 | from sql.sql_base import SqlBase
11 |
12 |
13 | class Mongodb(SqlBase):
14 | def __init__(self, **kwargs):
15 | super(Mongodb, self).__init__(**kwargs)
16 | self.client = pymongo.MongoClient(**kwargs)
17 | self.db = self.client[config.database]
18 |
19 | def init_database(self, database_name):
20 | pass
21 |
22 | def init_proxy_table(self, table_name):
23 | pass
24 |
25 | def insert_proxy(self, table_name, proxy):
26 | data = proxy.get_dict()
27 | data['save_time'] = str(datetime.datetime.now())
28 | self.db[table_name].insert(data)
29 |
30 | def select_proxy(self, table_name, **kwargs):
31 | filter = {}
32 | if kwargs.get('anonymity') != '':
33 | filter['anonymity'] = kwargs.get('anonymity')
34 | if kwargs.get('https') != '':
35 | filter['https'] = kwargs.get('https')
36 |
37 | data = [item for item in self.db[table_name].find(filter).limit(int(kwargs.get('count')))]
38 | return data
39 |
40 | def update_proxy(self, table_name, proxy):
41 | self.db[table_name].update_one(
42 | {'_id': proxy.id},
43 | {'$set':
44 | {'https': proxy.https, 'speed': proxy.speed, 'vali_count': proxy.vali_count,
45 | 'anonymity': proxy.anonymity, 'save_time': str(datetime.datetime.now())}})
46 |
47 | def delete_proxy(self, table_name, proxy):
48 | return self.del_proxy_with_id(table_name, proxy.id)
49 |
50 | def delete_old(self, table_name, day):
51 | start = datetime.datetime.now()
52 | end = datetime.datetime.now()
53 | pass
54 |
55 | def get_proxy_count(self, table_name):
56 | count = self.db[table_name].find().count()
57 | logging.debug('count:%s' % count)
58 | return count
59 |
60 | def get_proxy_ids(self, table_name):
61 | ids = self.db[table_name].distinct('_id')
62 | logging.debug('ids:%s' % ids)
63 | return ids
64 |
65 | def get_proxy_with_id(self, table_name, id):
66 | data = self.db[table_name].find_one({'_id': id})
67 | logging.debug(data)
68 | proxy = Proxy()
69 | proxy.set_value(
70 | ip=data.get('ip'),
71 | port=data.get('port'),
72 | country=data.get('country'),
73 |             anonymity=data.get('anonymity'),
74 | https=data.get('https'),
75 | speed=data.get('speed'),
76 | source=data.get('source'),
77 | vali_count=data.get('vali_count')
78 | )
79 | proxy.id = data.get('_id')
80 | return proxy
81 |
82 | def del_proxy_with_id(self, table_name, id):
83 | self.db[table_name].delete_one({'_id': id})
84 | return True
85 |
86 | def del_proxy_with_ip(self, table_name, ip):
87 | self.db[table_name].delete_one({'ip': ip})
88 | return True
89 |
90 | def commit(self):
91 | pass
92 |
--------------------------------------------------------------------------------
/sql/mysql.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 | import utils
5 | import config
6 | import pymysql
7 |
8 | from proxy import Proxy
9 | from sql.sql_base import SqlBase
10 |
11 |
12 | class MySql(SqlBase):
13 | def __init__(self, **kwargs):
14 | super(MySql, self).__init__(**kwargs)
15 |
16 | self.conn = pymysql.connect(**kwargs)
17 | self.cursor = self.conn.cursor()
18 |
19 | try:
20 | self.conn.select_db(config.database)
21 | except:
22 | self.create_database(config.database)
23 | self.conn.select_db(config.database)
24 |
25 | def create_database(self, database_name):
26 | try:
27 | command = 'CREATE DATABASE IF NOT EXISTS %s DEFAULT CHARACTER SET \'utf8\' ' % database_name
28 | logging.debug('mysql create_database command:%s' % command)
29 | self.cursor.execute(command)
30 | self.conn.commit()
31 | except Exception as e:
32 | logging.exception('mysql create_database exception:%s' % e)
33 |
34 | def init_database(self, database_name):
35 | try:
36 | command = 'CREATE DATABASE IF NOT EXISTS %s DEFAULT CHARACTER SET \'utf8\' ' % database_name
37 | logging.debug('mysql create_database command:%s' % command)
38 | self.cursor.execute(command)
39 | self.conn.commit()
40 | except Exception as e:
41 | logging.exception('mysql create_database exception:%s' % e)
42 |
43 | def init_proxy_table(self, table_name):
44 | command = (
45 | "CREATE TABLE IF NOT EXISTS {} ("
46 | "`id` INT(8) NOT NULL AUTO_INCREMENT,"
47 | "`ip` CHAR(25) NOT NULL UNIQUE,"
48 | "`port` INT(4) NOT NULL,"
49 | "`country` TEXT DEFAULT NULL,"
50 | "`anonymity` INT(2) DEFAULT NULL,"
51 | "`https` CHAR(4) DEFAULT NULL ,"
52 | "`speed` FLOAT DEFAULT NULL,"
53 | "`source` CHAR(20) DEFAULT NULL,"
54 | "`vali_count` INT(5) DEFAULT 0,"
55 | "`created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP,"
56 | "`updated_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,"
57 | "PRIMARY KEY(id),"
58 | "UNIQUE KEY `uniq_ip` (`ip`)"
59 | ") ENGINE=InnoDB".format(table_name))
60 |
61 | self.cursor.execute(command)
62 | self.conn.commit()
63 |
64 | def insert_proxy(self, table_name, proxy):
65 | try:
66 | command = ("INSERT IGNORE INTO {} "
67 | "(id, ip, port, country, anonymity, https, speed, source, vali_count)"
68 | "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)".format(table_name))
69 |
70 | data = (None, proxy.ip, proxy.port, proxy.country, proxy.anonymity,
71 | proxy.https, proxy.speed, proxy.source, proxy.vali_count)
72 |
73 | self.cursor.execute(command, data)
74 | return True
75 | except Exception as e:
76 | logging.exception('mysql insert_proxy exception msg:%s' % e)
77 | return False
78 |
79 | def insert_valid_proxy(self, id):
80 | try:
81 | command = "CALL ip_transfer({id})".format(id=id)
82 | self.cursor.execute(command)
83 | except Exception as e:
84 | logging.exception('[Error]mysql#insert_valid_proxy Exception msg:{}'.format(str(e)))
85 | raise e
86 |
87 | def select_proxy(self, table_name, **kwargs):
88 | filter = {}
89 | for k, v in kwargs.items():
90 | if v != '':
91 | filter[k] = v
92 |
93 | table_name = table_name if table_name else 'free_ipproxy'
94 |
95 | try:
96 | command = "SELECT * FROM {name} WHERE anonymity LIKE '{anonymity}' AND https LIKE '{https}' ORDER BY " \
97 | "{order} {sort} limit {count}". \
98 | format(name=table_name, anonymity=filter.get('anonymity', '%'),
99 | https=filter.get('https', '%'), order=filter.get('order', 'save_time'),
100 | sort=filter.get('sort', 'desc'), count=filter.get('count', 100))
101 | result = self.query(command)
102 | data = [{
103 | 'ip': item[1], 'port': item[2], 'anonymity': item[4], 'https': item[5],
104 | 'speed': item[6], 'save_time': str(item[8])
105 | } for item in result]
106 | return data
107 | except Exception as e:
108 | logging.exception('mysql select_proxy exception msg:%s' % e)
109 | return []
110 |
111 | def update_proxy(self, table_name, proxy):
112 | try:
113 | command = "UPDATE {table_name} set https='{https}', speed={speed}, " \
114 | "vali_count={vali_count}, anonymity = {anonymity},save_time={save_time} " \
115 | "where id={id};".format(
116 | table_name=table_name, https=proxy.https,
117 | speed=proxy.speed, id=proxy.id, vali_count=proxy.vali_count, anonymity=proxy.anonymity,
118 | save_time='NOW()')
119 | logging.debug('mysql update_proxy command:%s' % command)
120 | self.cursor.execute(command)
121 | except Exception as e:
122 | logging.exception('mysql update_proxy exception msg:%s' % e)
123 |
124 | def update_valid_proxy(self, id=0):
125 | try:
126 | command = "UPDATE httpbin SET vali_count=vali_count+1 WHERE id={id}".format(id=id)
127 | affected_row = self.cursor.execute(command)
128 | self.commit()
129 | return affected_row
130 | except Exception as e:
131 |             logging.exception('[mysql] update_valid_proxy exception:{msg}'.format(msg=str(e)))
132 |
133 | def delete_proxy(self, table_name, proxy):
134 | self.del_proxy_with_id(table_name=table_name, id=proxy.id)
135 |
136 | def delete_old(self, table_name, day):
137 | try:
138 | command = "DELETE FROM {table} where save_time < SUBDATE(NOW(), INTERVAL {day} DAY)".format(
139 | table=config.free_ipproxy_table, day=day)
140 |
141 | self.cursor.execute(command)
142 | self.commit()
143 | except Exception as e:
144 | logging.exception('mysql delete_old exception msg:%s' % e)
145 |
146 | def get_proxy_count(self, table_name):
147 | try:
148 | command = "SELECT COUNT(*) from {}".format(table_name)
149 | count, = self.query_one(command)
150 | logging.debug('mysql get_proxy_count count:%s' % count)
151 | return count
152 | except Exception as e:
153 | logging.exception('mysql get_proxy_count exception msg:%s' % e)
154 |
155 | return 0
156 |
157 | def get_proxy_ids(self, table_name):
158 | ids = []
159 | try:
160 | command = "SELECT id from {}".format(table_name)
161 | result = self.query(command)
162 | ids = [item[0] for item in result]
163 | except Exception as e:
164 | logging.exception('mysql get_proxy_ids exception msg:%s' % e)
165 |
166 | return ids
167 |
168 | def get_proxy_with_id(self, table_name, id):
169 | proxy = Proxy()
170 | try:
171 | command = "SELECT * FROM {0} WHERE id=\'{1}\'".format(table_name, id)
172 | result = self.query_one(command)
173 |             if result is not None:
174 | # data = {
175 | # 'id': result[0],
176 | # 'ip': result[1],
177 | # 'port': result[2],
178 | # 'country': result[3],
179 | # 'anonymity': result[4],
180 | # 'https': result[5],
181 | # 'speed': result[6],
182 | # 'source': result[7],
183 | # 'save_time': result[8],
184 | # 'vali_count': result[9],
185 | # }
186 | proxy = Proxy()
187 | proxy.set_value(
188 | ip=result[1],
189 | port=result[2],
190 | country=result[3],
191 | anonymity=result[4],
192 | https=result[5],
193 | speed=result[6],
194 | source=result[7],
195 | vali_count=result[9])
196 | proxy.id = result[0]
197 | proxy.save_time = result[8]
198 | except Exception as e:
199 |             logging.exception('mysql get_proxy_with_id exception msg:%s' % e)
200 |
201 | return proxy
202 |
203 | def get_proxies_info(self, table_name, start_id=0, limit=100):
204 |         '''Fetch id, ip and port info from a proxy table in batches.
205 |         Args:
206 |             @table_name  table name
207 |             @start_id    starting id
208 |             @limit       maximum number of records per batch
209 |
210 |         Return
211 |             a list of (id, ip, port, https) tuples
212 |
213 | '''
214 | command = ('SELECT id,ip,port,https from {table} where id >={start_id}'
215 | ' order by id asc limit {limit}')
216 | command = command.format(table=table_name, start_id=start_id, limit=limit)
217 | proxies_info = []
218 | try:
219 | result = self.query(command)
220 | proxies_info = [proxy for proxy in result]
221 | except Exception as e:
222 | logging.exception('[ERROR]#mysql get_proxies_info: {msg}'.format(msg=e))
223 |
224 | return proxies_info
225 |
226 | def del_proxy_with_id(self, table_name, id):
227 | res = False
228 | try:
229 | command = "DELETE FROM {0} WHERE id={1}".format(table_name, id)
230 | self.cursor.execute(command)
231 | res = True
232 | except Exception as e:
233 |             logging.exception('mysql del_proxy_with_id exception msg:%s' % e)
234 |
235 | return res
236 |
237 | def del_proxy_with_ip(self, table_name, ip):
238 | res = False
239 | try:
240 | command = "DELETE FROM {0} WHERE ip='{1}'".format(table_name, ip)
241 | self.cursor.execute(command)
242 | self.commit()
243 | res = True
244 | except Exception as e:
245 | logging.exception('mysql del_proxy_with_ip exception msg:%s' % e)
246 |
247 | return res
248 |
249 | def create_table(self, command):
250 | try:
251 | logging.debug('mysql create_table command:%s' % command)
252 | x = self.cursor.execute(command)
253 | self.conn.commit()
254 | return x
255 | except Exception as e:
256 | logging.exception('mysql create_table exception:%s' % e)
257 |
258 | def insert_data(self, command, data, commit=False):
259 | try:
260 | logging.debug('mysql insert_data command:%s, data:%s' % (command, data))
261 | x = self.cursor.execute(command, data)
262 | if commit:
263 | self.conn.commit()
264 | return x
265 | except Exception as e:
266 |             logging.exception('mysql insert_data exception msg:%s' % e)
267 |
268 | def commit(self):
269 | self.conn.commit()
270 |
271 | def execute(self, command, commit=True):
272 | try:
273 | logging.debug('mysql execute command:%s' % command)
274 | data = self.cursor.execute(command)
275 | if commit:
276 | self.conn.commit()
277 | return data
278 | except Exception as e:
279 | logging.exception('mysql execute exception msg:%s' % e)
280 | return None
281 |
282 | def query(self, command, commit=False):
283 | try:
284 |             logging.debug('mysql query command:%s' % command)
285 |
286 | self.cursor.execute(command)
287 | data = self.cursor.fetchall()
288 | if commit:
289 | self.conn.commit()
290 | return data
291 | except Exception as e:
292 |             logging.exception('mysql query exception msg:%s' % e)
293 | return None
294 |
295 | def query_one(self, command, commit=False):
296 | try:
297 |             logging.debug('mysql query_one command:%s' % command)
298 |
299 | self.cursor.execute(command)
300 | data = self.cursor.fetchone()
301 | if commit:
302 | self.conn.commit()
303 |
304 | return data
305 | except Exception as e:
306 |             logging.exception('mysql query_one exception msg:%s' % str(e))
307 | return None
308 |
--------------------------------------------------------------------------------
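Note on usage: get_proxies_info above pages through a proxy table by id in fixed-size batches. A minimal sketch of a caller that walks a whole table this way, assuming MySql can be built from the 'mysql' entry of config.DB_config and using an illustrative table name (neither is confirmed by this file):

    # sketch: iterate over a proxy table in batches via get_proxies_info
    import config
    from sql.mysql import MySql

    sql = MySql(**config.DB_config.get('mysql'))   # assumed constructor usage
    table = 'free_ipproxy'                         # illustrative table name

    start_id = 0
    while True:
        batch = sql.get_proxies_info(table, start_id=start_id, limit=100)
        if not batch:
            break
        for proxy_id, ip, port, https in batch:    # columns selected by the query
            print(proxy_id, ip, port, https)
        start_id = batch[-1][0] + 1                # continue after the last id seen

Because the query filters on "id >= start_id", bumping start_id past the last id returned avoids re-reading the final row of each batch.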
/sql/sql_base.py:
--------------------------------------------------------------------------------
1 | #-*- coding: utf-8 -*-
2 |
3 | class SqlBase(object):
4 | def __init__(self, **kwargs):
5 | pass
6 |
7 | def init_database(self, database_name):
8 | pass
9 |
10 | def init_proxy_table(self, table_name):
11 | pass
12 |
13 | def insert_proxy(self, table_name, proxy):
14 | pass
15 |
16 | def select_proxy(self, table_name, **kwargs):
17 | pass
18 |
19 | def update_proxy(self, table_name, proxy):
20 | pass
21 |
22 | def delete_proxy(self, table_name, proxy):
23 | pass
24 |
25 | def delete_old(self, table_name, day):
26 | pass
27 |
28 | def get_proxy_count(self, table_name):
29 | pass
30 |
31 | def get_proxy_ids(self, table_name):
32 | pass
33 |
34 | def get_proxy_with_id(self, table_name, id):
35 | pass
36 |
37 | def del_proxy_with_id(self, table_name, id):
38 | pass
39 |
40 | def del_proxy_with_ip(self, table_name, ip):
41 | pass
42 |
43 | def commit(self):
44 | pass
45 |
--------------------------------------------------------------------------------
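Note on usage: SqlBase is the interface every storage backend implements; SqlManager (next file) only calls these methods, so backends are interchangeable. A toy, purely illustrative backend to show the shape of a subclass (not part of the project):

    # sketch: a minimal in-memory backend implementing part of the SqlBase interface
    from sql.sql_base import SqlBase


    class MemorySql(SqlBase):
        def __init__(self, **kwargs):
            super(MemorySql, self).__init__(**kwargs)
            self.tables = {}

        def init_proxy_table(self, table_name):
            self.tables.setdefault(table_name, [])

        def insert_proxy(self, table_name, proxy):
            self.tables[table_name].append(proxy)

        def get_proxy_count(self, table_name):
            return len(self.tables.get(table_name, []))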
/sql/sql_manager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import config
4 |
5 | from sql.sql_base import SqlBase
6 |
7 |
8 | class SqlManager(object):
9 | def __init__(self):
10 | db_type = config.DB_config.get('db_type', 'mysql')
11 | db_config = config.DB_config.get(db_type)
12 |
13 | if db_type == 'mysql':
14 | from sql.mysql import MySql
15 | self.sql = MySql(**db_config)
16 | elif db_type == 'redis':
17 | pass
18 | elif db_type == 'sqlite':
19 | pass
20 | elif db_type == 'mongodb':
21 | from sql.mongodb import Mongodb
22 | self.sql = Mongodb(**db_config)
23 | else: # default mysql
24 | from sql.mysql import MySql
25 |             self.sql = MySql(**config.DB_config.get('mysql'))
26 |
27 | def init_database(self, database_name):
28 | pass
29 |
30 | def init_proxy_table(self, table_name):
31 | return self.sql.init_proxy_table(table_name)
32 |
33 | def insert_proxy(self, table_name, proxy):
34 | return self.sql.insert_proxy(table_name, proxy)
35 |
36 |     def insert_valid_proxy(self, id=0):
37 | return self.sql.insert_valid_proxy(id)
38 |
39 | def select_proxy(self, table_name, **kwargs):
40 | return self.sql.select_proxy(table_name, **kwargs)
41 |
42 | def update_proxy(self, table_name, proxy):
43 | return self.sql.update_proxy(table_name, proxy)
44 |
45 |     def update_valid_proxy(self, id=0):
46 | return self.sql.update_valid_proxy(id=id)
47 |
48 | def delete_proxy(self, table_name, proxy):
49 | return self.sql.delete_proxy(table_name, proxy)
50 |
51 | def delete_old(self, table_name, day):
52 | return self.sql.delete_old(table_name, day)
53 |
54 |     def get_proxy_count(self, table_name):
55 |         return self.sql.get_proxy_count(table_name=table_name)
56 | 
57 |     def get_proxy_ids(self, table_name):
58 |         return self.sql.get_proxy_ids(table_name=table_name)
59 | 
60 |     def get_proxy_with_id(self, table_name, id):
61 |         return self.sql.get_proxy_with_id(table_name=table_name, id=id)
62 | 
63 |     def del_proxy_with_id(self, table_name, id):
64 |         return self.sql.del_proxy_with_id(table_name=table_name, id=id)
65 | 
66 |     def del_proxy_with_ip(self, table_name, ip):
67 |         return self.sql.del_proxy_with_ip(table_name=table_name, ip=ip)
68 | 
69 |     def get_proxies_info(self, table_name, start_id=0, limit=10):
70 |         return self.sql.get_proxies_info(table_name=table_name, start_id=start_id, limit=limit)
71 |
72 | def commit(self):
73 | return self.sql.commit()
74 |
--------------------------------------------------------------------------------
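Note on usage: SqlManager reads db_type from config.DB_config, builds the matching backend and forwards every call to it, so callers never import sql.mysql or sql.mongodb directly. A sketch of the expected config shape and a typical call; the concrete keys and the table name are illustrative, the real values live in config.py:

    # sketch: illustrative config.DB_config shape and typical SqlManager usage
    #
    # DB_config = {
    #     'db_type': 'mysql',        # 'mysql' or 'mongodb'
    #     'mysql': {'host': '127.0.0.1', 'port': 3306, 'user': 'root',
    #               'password': '123456', 'charset': 'utf8'},
    # }

    from sql.sql_manager import SqlManager

    manager = SqlManager()                       # backend chosen from config.DB_config
    manager.init_proxy_table('free_ipproxy')     # table name chosen for illustration
    print(manager.get_proxy_count('free_ipproxy'))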
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import logging
4 | import os
5 | import re
6 | import subprocess
7 | import traceback
8 | import time
9 | import datetime
10 |
11 |
12 | # Custom log output: writes to the logging module and echoes to stdout
13 | def log(msg, level=logging.DEBUG):
14 | logging.log(level, msg)
15 | print('%s [%s], msg:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), level, msg))
16 |
17 | if level == logging.WARNING or level == logging.ERROR:
18 | for line in traceback.format_stack():
19 | print(line.strip())
20 |
21 | for line in traceback.format_stack():
22 | logging.log(level, line.strip())
23 |
24 |
25 | # Used on the server: kill the processes occupying the given ports
26 | def kill_ports(ports):
27 | for port in ports:
28 | log('kill %s start' % port)
29 | popen = subprocess.Popen('lsof -i:%s' % port, shell=True, stdout=subprocess.PIPE)
30 | (data, err) = popen.communicate()
31 | log('data:\n%s \nerr:\n%s' % (data, err))
32 |
33 |         pattern = re.compile(r'\b\d+\b', re.S)  # matches every numeric token in the lsof output, not only the PID column
34 | pids = re.findall(pattern, data.decode())
35 |
36 | log('pids:%s' % str(pids))
37 |
38 | for pid in pids:
39 |             if pid:
40 | try:
41 | log('pid:%s' % pid)
42 | popen = subprocess.Popen('kill -9 %s' % pid, shell=True, stdout=subprocess.PIPE)
43 | (data, err) = popen.communicate()
44 | log('data:\n%s \nerr:\n%s' % (data, err))
45 | except Exception as e:
46 | log('kill_ports exception:%s' % e)
47 |
48 | log('kill %s finish' % port)
49 |
50 | time.sleep(1)
51 |
52 |
53 | # Create the directory if it does not already exist
54 | def make_dir(dir):
55 | log('make dir:%s' % dir)
56 | if not os.path.exists(dir):
57 | os.makedirs(dir)
58 |
--------------------------------------------------------------------------------
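Note on usage: the helpers above are thin wrappers around logging, lsof/kill and os.makedirs. A short, illustrative sketch (port and paths are placeholders; kill_ports relies on lsof and kill, so it is Unix-only):

    # sketch: typical use of the helpers in utils.py
    import logging
    import utils

    utils.make_dir('log')                        # ensure the log directory exists
    logging.basicConfig(filename='log/run.log', level=logging.DEBUG)

    utils.log('starting server', level=logging.INFO)
    utils.kill_ports([8000])                     # free the port before binding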
/weixin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/awolfly9/IPProxyTool/4e4e3aadd30a75f74393b54e8077568b6a58a813/weixin.png
--------------------------------------------------------------------------------