├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── __init__.py ├── config.py ├── crawler ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ ├── proxy │ ├── __init__.py │ ├── basespider.py │ ├── data5u.py │ ├── freeproxylists.py │ ├── gatherproxy.py │ ├── hidemy.py │ ├── ip181.py │ ├── kuaidaili.py │ ├── proxydb.py │ ├── proxylistplus.py │ ├── sixsixip.py │ ├── usproxy.py │ └── xicidaili.py │ └── validator │ ├── __init__.py │ ├── amazoncn.py │ ├── anjuke.py │ ├── assetstore.py │ ├── baidu.py │ ├── bbs.py │ ├── boss.py │ ├── douban.py │ ├── gather.py │ ├── httpbin.py │ ├── jd.py │ ├── lagou.py │ ├── liepin.py │ ├── steam.py │ ├── validator.py │ └── zhilian.py ├── db.sql ├── ipproxytool.py ├── proxy.py ├── requirements.txt ├── run_crawl_proxy.py ├── run_server.py ├── run_spider.py ├── run_validator.py ├── run_validator_async.py ├── scrapy.cfg ├── server ├── __init__.py └── dataserver.py ├── sql ├── __init__.py ├── mongodb.py ├── mysql.py ├── sql_base.py └── sql_manager.py ├── utils.py └── weixin.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | # custom file 92 | *.json 93 | *.idea 94 | *.DS_Store 95 | *.pyc 96 | test* 97 | headers.py 98 | 99 | # custom dir 100 | log/ 101 | 102 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM docker.io/mrjogo/scrapy 2 | ENV PATH /usr/local/bin:$PATH 3 | ENV PATH /home:$PATH 4 | ADD . 
/home 5 | WORKDIR /home 6 | RUN pip install -i https://mirrors.aliyun.com/pypi/simple -r requirements.txt 7 | CMD python ipproxytool.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IPProxyTool 2 | 使用 scrapy 爬虫抓取代理网站,获取大量的免费代理 ip。过滤出所有可用的 ip,存入数据库以备使用。 3 | 可以访问我的个人站点,查看我的更多有趣项目 [西瓜](http://xigua233.com/) 4 | 5 | 感谢 [youngjeff](https://github.com/youngjeff) 和我一起维护该项目 6 | 7 | ## 运行环境 8 | 安装 python3 and mysql 数据库 9 | 10 | cryptography模块安装环境: 11 | ``` 12 | sudo yum install gcc libffi-devel python-devel openssl-devel 13 | ``` 14 | 15 | 16 | ``` 17 | $ pip install -r requirements.txt 18 | ``` 19 | 20 | 21 | 22 | ## 下载使用 23 | 将项目克隆到本地 24 | 25 | ``` 26 | $ git clone https://github.com/awolfly9/IPProxyTool.git 27 | ``` 28 | 29 | 进入工程目录 30 | 31 | ``` 32 | $ cd IPProxyTool 33 | ``` 34 | 修改 mysql 数据库配置 [config.py](https://github.com/awolfly9/IPProxyTool/blob/master/config.py) 中 database_config 的用户名和密码为数据库的用户名和密码 35 | 36 | ``` 37 | $ vim config.py 38 | --------------- 39 | 40 | database_config = { 41 | 'host': 'localhost', 42 | 'port': 3306, 43 | 'user': 'root', 44 | 'password': '123456', 45 | 'charset': 'utf8', 46 | } 47 | ``` 48 | 49 | MYSQL: 导入数据表结构 50 | ``` 51 | $ mysql> create database ipproxy; 52 | Query OK, 1 row affected (0.00 sec) 53 | $ mysql> use ipproxy; 54 | Database changed 55 | $ mysql> source '/你的项目目录/db.sql' 56 | 57 | ``` 58 | 59 | 60 | 运行启动脚本 ipproxytool.py 也可以分别运行抓取,验证,服务器接口脚本,运行方法参考项目说明 61 | 62 | ``` 63 | $ python ipproxytool.py 64 | ``` 65 | 66 | 新增异步验证方式,运行方法如下 67 | 68 | ``` 69 | $ python ipproxytool.py async 70 | ``` 71 |
72 | 73 | ## 项目说明 74 | #### 抓取代理网站 75 | 所有抓取代理网站的代码都在 [proxy](https://github.com/awolfly9/IPProxyTool/tree/master/ipproxytool/spiders/proxy)
76 | ##### 扩展抓取其他的代理网站 77 | 1.在 proxy 目录下新建脚本并继承自 BaseSpider
78 | 2.设置 name、urls、headers
79 | 3.重写 parse_page 方法,提取代理数据
80 | 4.将数据存入数据库,具体可以参考 [ip181](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/ip181.py) 和 [kuaidaili](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/kuaidaili.py),下方附有一个最简示例
81 | 5.如果需要抓取特别复杂的代理网站,可以参考 [peuland](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/proxy/peuland.py)
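
按照上面的步骤,一个最简的抓取脚本示例如下(仅作参考:其中的站点 example.com 与 XPath 均为假设的占位,需替换为实际要抓取的代理网站;整体写法仿照项目中的 ip181.py / hidemy.py):

```
# coding=utf-8
# crawler/spiders/proxy/example.py —— 示例脚本,example.com 与其中的 XPath 均为假设

from scrapy import Selector

from proxy import Proxy
from .basespider import BaseSpider


class ExampleSpider(BaseSpider):
    name = 'example'

    def __init__(self, *a, **kw):
        super(ExampleSpider, self).__init__(*a, **kw)

        # 步骤 2:设置 name、urls、headers
        self.urls = ['http://www.example.com/free-proxy-list/%s' % n for n in range(1, 5)]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) '
                          'Gecko/20100101 Firefox/50.0',
        }

        self.init()

    # 步骤 3、4:重写 parse_page,提取代理数据并存入数据库
    def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        # 跳过表头行,逐行解析代理信息(XPath 需按目标页面结构调整)
        for info in sel.xpath('//table//tr').extract()[1:]:
            val = Selector(text=info)

            proxy = Proxy()
            proxy.set_value(
                ip=val.xpath('//td[1]/text()').extract_first(),
                port=val.xpath('//td[2]/text()').extract_first(),
                country=val.xpath('//td[3]/text()').extract_first(),
                anonymity=val.xpath('//td[4]/text()').extract_first(),
                source=self.name,
            )
            self.add_proxy(proxy=proxy)
```

BaseSpider 已经实现了 start_requests、网页落盘以及写入 free_ipproxy 表的逻辑,子类通常只需要提供 urls、headers 和 parse_page。
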
82 | 83 | ##### 修改 run_crawl_proxy.py 导入抓取库,添加到抓取队列 84 | 85 | 可以单独运行 run_crawl_proxy.py 脚本开始抓取代理网站 86 | 87 | ``` 88 | $ python run_crawl_proxy.py 89 | ``` 90 | 91 | #### 验证代理 ip 是否有效 92 | 目前验证方式:
93 | 1.从上一步抓取并存储的数据库中取出所有的代理 IP
94 | 2.利用取出的代理 IP 去请求 [httpbin](http://httpbin.org/get?show_env=1)
95 | 3.根据请求结果判断出代理 IP 的有效性,是否支持 HTTPS 以及匿名度,并存储到表 httpbin 中(判断逻辑的示意见下方)
96 | 4.从 httpbin 表中取出代理去访问目标网站,例如 [豆瓣](https://www.douban.com/)
97 | 5.如果请求在合适的时间返回成功的数据,则认为这个代理 IP 有效,并存入相应的表中
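
其中第 2、3 步对匿名度的判断,可以用下面的最简示意来理解(判断分支与项目中 validator/httpbin.py 的实现一致;示例里的代理 ip、端口为假设值,实际验证由 scrapy 并发完成,这里只演示单个代理的流程):

```
# coding=utf-8
# 单个代理匿名度判断的最简示意,分支逻辑参照 validator/httpbin.py(ip、port 为假设值)

import requests

URL = 'http://httpbin.org/get?show_env=1'

# 先不走代理,取得本机出口 ip
origin_ip = requests.get(URL, timeout=10).json().get('origin', '')

ip, port = '1.2.3.4', '8080'                       # 假设的待验证代理
proxies = {'http': 'http://%s:%s' % (ip, port)}

try:
    data = requests.get(URL, proxies=proxies, timeout=10).json()
    headers = data.get('headers', {})

    anonymity = 0                                  # 0 表示未能判断
    if origin_ip in data.get('origin', ''):
        anonymity = 3                              # 透明:目标站点仍能看到真实 ip
    elif headers.get('Via') is not None:
        anonymity = 2                              # 匿名:带有代理特征头
    elif headers.get('X-Forwarded-For') is not None and headers.get('X-Real-Ip') is not None:
        anonymity = 1                              # 高匿
    print('proxy ok, anonymity = %s' % anonymity)
except (requests.RequestException, ValueError):
    print('proxy invalid')
```
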
98 | 99 | 一个目标网站对应一个脚本,所有验证代理 ip 的代码都在 [validator](https://github.com/awolfly9/IPProxyTool/tree/master/ipproxytool/spiders/validator) 100 | ##### 扩展验证其他网站 101 | 1.在 validator 目录下新建脚本并继承 Validator
102 | 2.设置 name、timeout、urls、headers
103 | 3.然后调用 init 方法,可以参考 [baidu](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/baidu.py) 和 [douban](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/douban.py),下方附有一个最简示例
104 | 4.如果需要特别复杂的验证方式,可以参考 [assetstore](https://github.com/awolfly9/IPProxyTool/blob/master/ipproxytool/spiders/validator/assetstore.py)
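
一个最简的验证脚本示例如下(写法仿照 baidu.py / douban.py,并参考 bbs.py 重写 success_content_parse;示例中的目标站点 example.org 与判断关键字均为假设):

```
# coding=utf-8
# crawler/spiders/validator/examplesite.py —— 示例脚本,目标站点与关键字均为假设

from .validator import Validator


class ExampleSiteSpider(Validator):
    name = 'examplesite'

    def __init__(self, name=None, **kwargs):
        super(ExampleSiteSpider, self).__init__(name, **kwargs)

        # 步骤 2:设置 name、timeout、urls、headers
        self.timeout = 5
        self.urls = [
            'https://www.example.org/',
        ]
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) '
                          'Gecko/20100101 Firefox/50.0',
        }

        # 步骤 3:调用 init 方法
        self.init()

    # 可选:根据返回内容进一步确认代理真正可用(参考 bbs.py、amazoncn.py 的写法)
    def success_content_parse(self, response):
        return 'keyword' in response.text          # 'keyword' 为假设的判断关键字
```

通常只需设置上述字段即可;如果请求流程更复杂(例如需要先取版本号再发请求),可以像 assetstore.py 那样重写 start_requests。
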
105 | ##### 修改 run_validator.py 导入验证库,添加到验证队列 106 | 可以单独运行 run_validator.py 开始验证代理ip的有效性 107 | 108 | ``` 109 | $ python run_validator.py 110 | ``` 111 | 112 | ### 获取代理 ip 数据服务器接口 113 | 在 config.py 中修改启动服务器端口配置 data_port,默认为 8000 114 | 启动服务器 115 | 116 | ``` 117 | $ python run_server.py 118 | ``` 119 | 120 | 服务器提供接口 121 | #### 获取 122 | 123 | 124 | 参数 125 | 126 | | Name | Type | Description | must | 127 | | ---- | ---- | ---- | ---- | 128 | | name | str | 数据库名称 | 是 | 129 | | anonymity | int | 1:高匿 2:匿名 3:透明 | 否 | 130 | | https | str | https:yes http:no | 否 | 131 | | order | str | table 字段 | 否 | 132 | | sort | str | asc 升序,desc 降序 | 否 | 133 | | count | int | 获取代理数量,默认 100 | 否 | 134 | 135 | 136 | 137 | 138 | #### 删除 139 | 140 | 141 | 参数 142 | 143 | | Name | Type | Description | 是否必须| 144 | | ----| ---- | ---- | --- | 145 | | name | str | 数据库名称 | 是 | 146 | | ip | str | 需要删除的 ip | 是 | 147 | 148 | #### 插入 149 | 150 | 151 | 参数 152 | 153 | | Name | Type | Description | 是否必须| 154 | | ----| ---- | ---- | ----| 155 | | name | str | 数据库名称 |是 | 156 | | ip | str | ip 地址 | 是| 157 | | port | str | 端口 |是| 158 | | country | str | 国家 |否| 159 | | anonymity | int | 1:高匿,2:匿名,3:透明 |否| 160 | | https | str | yes:https,no:http |否| 161 | | speed | float | 访问速度 |否| 162 | | source | str | ip 来源 |否| 163 | 164 | 165 | ## TODO 166 | * 添加多数据库支持 167 | * mysql 168 | * redis TODO... 169 | * sqlite TODO... 170 | * 添加抓取更多免费代理网站,目前支持的抓取的免费代理 IP 站点,目前有一些国外的站点连接不稳定 171 | * (国外) 172 | * (国外) 173 | * (国内) 174 | * (国内) 175 | * (国内) 176 | * (国外) 177 | * (国外) 178 | * (国内) 179 | * (国外) 180 | * (国内) 181 | * 分布式部署项目 182 | * ~~添加服务器获取接口更多筛选条件~~ 183 | * ~~多进程验证代理 IP~~ 184 | * ~~添加 https 支持~~ 185 | * ~~添加检测 ip 的匿名度~~ 186 | 187 | 188 | ## 参考 189 | * [IPProxyPool](https://github.com/qiyeboy/IPProxyPool) 190 | 191 | 192 | ## 项目更新 193 | -----------------------------2020-12-29----------------------------
194 | 1. 修改之前错误的路径命名 195 | 2. 修改mysql 表结构 196 |
197 | -----------------------------2017-6-23----------------------------
198 | 1.python2 -> python3
199 | 2.web.py -> flask
200 |
201 | -----------------------------2017-5-17----------------------------
202 | 1.本系统在原来的基础上加入了 docker,操作见下方;关于 docker 的相关知识可以参考官网 http://www.docker.com
203 |
204 | -----------------------------2017-3-30----------------------------
205 | 1.修改完善 readme
206 | 2.数据插入支持事务
207 |
208 | -----------------------------2017-3-14----------------------------
209 | 1.更改服务器接口,添加排序方式
210 | 2.添加多进程方式验证代理 ip 的有效性
211 |
212 | -----------------------------2017-2-20----------------------------
213 | 1.添加服务器获取接口更多筛选条件
214 |
215 | 216 | -----------------------------2017-2-16----------------------------
217 | 1.验证代理 IP 的匿名度
218 | 2.验证代理 IP HTTPS 支持
219 | 3.添加 httpbin 验证并发数设置,默认为 4 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | ## 在系统中安装docker就可以使用本程序: 233 | 234 | 下载本程序 235 | ``` 236 | git clone https://github.com/awolfly9/IPProxyTool 237 | ``` 238 | 239 | 然后进入目录: 240 | ``` 241 | cd IPProxyTool 242 | ``` 243 | 244 | 创建镜像: 245 | ``` 246 | docker build -t proxy . 247 | ``` 248 | 249 | 运行容器: 250 | ``` 251 | docker run -it proxy 252 | ``` 253 | 254 | ## 在config.py中按照自己的需求修改配置信息 255 | ``` 256 | database_config = { 257 | 'host': 'localhost', 258 | 'port': 3306, 259 | 'user': 'root', 260 | 'password': 'root', 261 | 'charset': 'utf8', 262 | } 263 | ``` 264 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | DB_config = { 4 | # 'db_type': 'mongodb', 5 | 'db_type': 'mysql', 6 | 7 | 'mysql': { 8 | 'host': 'localhost', 9 | 'port': 3306, 10 | 'user': 'root', 11 | 'password': '123456', 12 | 'charset': 'utf8', 13 | }, 14 | 'redis': { 15 | 'host': 'localhost', 16 | 'port': 6379, 17 | 'password': '123456', 18 | 'db': 1, 19 | }, 20 | 'mongodb':{ 21 | 'host': 'localhost', 22 | 'port': 27017, 23 | 'username': '', 24 | 'password': '', 25 | } 26 | } 27 | 28 | database = 'ipproxy' 29 | free_ipproxy_table = 'free_ipproxy' 30 | httpbin_table = 'httpbin' 31 | 32 | data_port = 8000 33 | -------------------------------------------------------------------------------- /crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awolfly9/IPProxyTool/4e4e3aadd30a75f74393b54e8077568b6a58a813/crawler/__init__.py -------------------------------------------------------------------------------- /crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class CrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'crawler' 13 | 14 | SPIDER_MODULES = ['crawler.spiders','crawler.spiders.proxy'] 15 | NEWSPIDER_MODULE = 'crawler.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | #USER_AGENT = 'crawler (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | #CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 0.5 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'crawler.middlewares.MyCustomSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 54 | #DOWNLOADER_MIDDLEWARES = { 55 | # 'crawler.middlewares.MyCustomDownloaderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable extensions 59 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 60 | EXTENSIONS = { 61 | 'scrapy.extensions.telnet.TelnetConsole': None, 62 | } 63 | 64 | # Configure item pipelines 65 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 66 | #ITEM_PIPELINES = { 67 | # 'crawler.pipelines.SomePipeline': 300, 68 | #} 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | 91 | # RETRY_ENABLED = False 92 | 93 | LOG_ENABLED = True 94 | -------------------------------------------------------------------------------- /crawler/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/basespider.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import sys 4 | import config 5 | import utils 6 | import datetime 7 | 8 | from scrapy.spiders import Spider 9 | from scrapy.http import Request 10 | from sql import SqlManager 11 | 12 | 13 | class BaseSpider(Spider): 14 | name = 'basespider' 15 | 16 | def __init__(self, *a, **kw): 17 | super(BaseSpider, self).__init__(*a, **kw) 18 | 19 | self.urls = [] 20 | self.headers = {} 21 | self.timeout = 10 22 | self.is_record_web_page = True 23 | 24 | self.sql = SqlManager() 25 | 26 | def init(self): 27 | self.meta = { 28 | 'download_timeout': self.timeout, 29 | } 30 | 31 | self.dir_log = 'log/proxy/%s' % self.name 32 | utils.make_dir(self.dir_log) 33 | self.sql.init_proxy_table(config.free_ipproxy_table) 34 | 35 | def start_requests(self): 36 | for i, url in enumerate(self.urls): 37 | yield Request( 38 | url=url, 39 | headers=self.headers, 40 | meta=self.meta, 41 | dont_filter=True, 42 | callback=self.parse_page, 43 | errback=self.error_parse, 44 | ) 45 | 46 | def parse_page(self, response): 47 | self.write(response.body) 48 | pass 49 | 50 | def error_parse(self, failure): 51 | request = failure.request 52 | pass 53 | 54 | def add_proxy(self, proxy): 55 | self.sql.insert_proxy(config.free_ipproxy_table, proxy) 56 | 57 | def write(self, data): 58 | if self.is_record_web_page: 59 | with open('%s/%s.html' % (self.dir_log, datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f')), 60 | 'wb') as f: 61 | f.write(data) 62 | f.close() 63 | 64 | def close(spider, reason): 65 | spider.sql.commit() 66 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/data5u.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy import Selector 4 | from .basespider import BaseSpider 5 | from proxy import Proxy 6 | 7 | 8 | class Data5uSpider(BaseSpider): 9 | name = 'data5u' 10 | 11 | def __init__(self, *a, **kw): 12 | # 在类的继承中,如果重定义某个方法,该方法会覆盖父类的同名方法 13 | # 但有时,我们希望能同时实现父类的功能,这时,我们就需要调用父类的方法了,可通过使用 super 来实现,比如: 14 | super(Data5uSpider, self).__init__(*a, **kw) 15 | 16 | self.urls = [ 17 | 'http://www.data5u.com/' 18 | ] 19 | self.headers = { 20 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 21 | # 'Accept-Encoding': 'gzip, deflate, sdch', 22 | # 'Accept-Language': 'zh-CN,zh;q=0.8', 23 | # 'Connection': 'keep-alive', 24 | 'Host': 'www.data5u.com', 25 | 'Upgrade-Insecure-Requests': 1, 26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', 27 | } 28 | 29 | self.init() 30 | 31 | def parse_page(self, response): 32 | self.write(response.body) 33 | 34 | sel = Selector(response) 35 | infos = sel.xpath('//ul[@class="l2"]').extract() 36 | for i, info in enumerate(infos): 37 | val 
= Selector(text=info) 38 | ip = val.xpath('//ul[@class="l2"]/span[1]/li/text()').extract_first() 39 | port = val.xpath('//ul[@class="l2"]/span[2]/li/text()').extract_first() 40 | anonymity = val.xpath('//ul[@class="l2"]/span[3]/li/text()').extract_first() 41 | https = val.xpath('//ul[@class="l2"]/span[4]/li/text()').extract_first() 42 | country = val.xpath('//ul[@class="l2"]/span[5]/li/a/text()').extract_first() 43 | 44 | proxy = Proxy() 45 | proxy.set_value( 46 | ip=ip, 47 | port=port, 48 | country=country, 49 | anonymity=anonymity, 50 | source=self.name, 51 | ) 52 | self.add_proxy(proxy=proxy) 53 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/freeproxylists.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import urllib 4 | import re 5 | 6 | from proxy import Proxy 7 | from .basespider import BaseSpider 8 | from bs4 import BeautifulSoup 9 | 10 | 11 | class FreeProxyListsSpider(BaseSpider): 12 | name = 'freeproxylists' 13 | 14 | def __init__(self, *a, **kwargs): 15 | super(FreeProxyListsSpider, self).__init__(*a, **kwargs) 16 | self.urls = [ 17 | 'http://www.freeproxylists.net/' 18 | ] 19 | self.headers = { 20 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 21 | 'Accept-Encoding': 'gzip, deflate', 22 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 23 | 'Connection': 'keep-alive', 24 | 'Host': 'www.freeproxylists.net', 25 | 'Upgrade-Insecure-Requests': '1', 26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0', 27 | } 28 | 29 | self.init() 30 | 31 | def parse_page(self, response): 32 | pattern = re.compile('', re.S) 33 | items = re.findall(pattern = pattern, string = response.body) 34 | for i, item in enumerate(items): 35 | if i > 0: 36 | if 'async' in item: 37 | continue 38 | 39 | ip_pattern = re.compile('IPDecode\(\"(.*?)\"\)', re.S) 40 | ip_decode = re.findall(ip_pattern, item)[0] 41 | ip_url = urllib.unquote(ip_decode) 42 | ip_soup = BeautifulSoup(ip_url, 'lxml') 43 | ip = ip_soup.text.encode() 44 | 45 | item = '' 46 | soup = BeautifulSoup(item, 'lxml') 47 | tbodys = soup.find_all('td') 48 | 49 | proxy = Proxy() 50 | proxy.set_value( 51 | ip = ip, 52 | port = tbodys[1].text.encode(), 53 | country = tbodys[4].text.encode(), 54 | anonymity = tbodys[3].text.encode(), 55 | source = self.name, 56 | ) 57 | 58 | self.add_proxy(proxy = proxy) 59 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/gatherproxy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import json 4 | import random 5 | import re 6 | import requests 7 | 8 | from proxy import Proxy 9 | from .basespider import BaseSpider 10 | 11 | 12 | class GatherproxySpider(BaseSpider): 13 | name = 'gatherproxy' 14 | 15 | def __init__(self, *a, **kwargs): 16 | super(GatherproxySpider, self).__init__(*a, **kwargs) 17 | self.urls = [ 18 | 'http://gatherproxy.com/', 19 | 'http://www.gatherproxy.com/proxylist/anonymity/?t=Anonymous', 20 | 'http://gatherproxy.com/proxylist/country/?c=China', 21 | ] 22 | 23 | self.headers = { 24 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 25 | 'Accept-Encoding': 'gzip, deflate', 26 | 'Accept-Language': 'en-US,en;q=0.5', 27 | 'Connection': 'keep-alive', 28 | 'Host': 'www.gatherproxy.com', 29 | 'Upgrade-Insecure-Requests': '1', 30 | 'User-Agent': 
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0' 31 | } 32 | 33 | # self.proxies = self.get_proxy() 34 | self.init() 35 | 36 | def parse_page(self, response): 37 | pattern = re.compile('gp.insertPrx\((.*?)\)', re.S) 38 | items = re.findall(pattern, response.body.decode()) 39 | for item in items: 40 | data = json.loads(item) 41 | #端口用的是十六进制 42 | port = data.get('PROXY_PORT') 43 | port = str(int(port, 16)) 44 | 45 | proxy = Proxy() 46 | proxy.set_value( 47 | ip = data.get('PROXY_IP'), 48 | port = port, 49 | country = data.get('PROXY_COUNTRY'), 50 | anonymity = data.get('PROXY_TYPE'), 51 | source = self.name, 52 | ) 53 | 54 | self.add_proxy(proxy = proxy) 55 | 56 | def get_proxy(self): 57 | try: 58 | url = 'http://127.0.0.1:8000/?name={0}'.format(self.name) 59 | r = requests.get(url = url) 60 | if r.text != None and r.text != '': 61 | data = json.loads(r.text) 62 | if len(data) > 0: 63 | proxy = random.choice(data) 64 | ip = proxy.get('ip') 65 | port = proxy.get('port') 66 | address = '%s:%s' % (ip, port) 67 | 68 | proxies = { 69 | 'http': 'http://%s' % address 70 | } 71 | return proxies 72 | except: 73 | return None 74 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/hidemy.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import utils 4 | 5 | from scrapy import Selector 6 | from .basespider import BaseSpider 7 | from proxy import Proxy 8 | 9 | 10 | class HidemySpider(BaseSpider): 11 | name = 'hidemy' 12 | 13 | def __init__(self, *a, **kw): 14 | super(HidemySpider, self).__init__(*a, **kw) 15 | 16 | self.urls = ['https://hidemy.name/en/proxy-list/?start=%s' % n for n in range(0, 5 * 64, 64)] 17 | self.headers = { 18 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 19 | 'Accept-Encoding': 'gzip, deflate, br', 20 | 'Accept-Language': 'en-US,en;q=0.5', 21 | 'Connection': 'keep-alive', 22 | 'Host': 'hidemy.name', 23 | 'Referer': 'https://hidemy.name/en/proxy-list/?start=0', 24 | 'Upgrade-Insecure-Requests': '1', 25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0', 26 | } 27 | 28 | self.init() 29 | 30 | def parse_page(self, response): 31 | self.write(response.body) 32 | 33 | sel = Selector(response) 34 | infos = sel.xpath('//tbody/tr').extract() 35 | for i, info in enumerate(infos): 36 | if i == 0: 37 | continue 38 | 39 | val = Selector(text = info) 40 | ip = val.xpath('//td[1]/text()').extract_first() 41 | port = val.xpath('//td[2]/text()').extract_first() 42 | country = val.xpath('//td[3]/div/text()').extract_first() 43 | anonymity = val.xpath('//td[6]/text()').extract_first() 44 | 45 | proxy = Proxy() 46 | proxy.set_value( 47 | ip = ip, 48 | port = port, 49 | country = country, 50 | anonymity = anonymity, 51 | source = self.name, 52 | ) 53 | 54 | self.add_proxy(proxy = proxy) 55 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/ip181.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from scrapy import Selector 4 | from .basespider import BaseSpider 5 | from proxy import Proxy 6 | 7 | 8 | class IpOneEightOneSpider(BaseSpider): 9 | name = 'ip181' 10 | 11 | def __init__(self, *a, **kw): 12 | super(IpOneEightOneSpider, self).__init__(*a, **kw) 13 | 14 | self.urls = ['http://www.ip181.com/'] 15 | self.headers = { 16 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 17 | 'Accept-Encoding': 'gzip, deflate', 18 | 'Accept-Language': 'en-US,en;q=0.5', 19 | 'Connection': 'keep-alive', 20 | 'Host': 'www.ip181.com', 21 | 'Upgrade-Insecure-Requests': '1', 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0', 23 | } 24 | 25 | self.init() 26 | 27 | def parse_page(self, response): 28 | self.write(response.body) 29 | 30 | sel = Selector(response) 31 | infos = sel.xpath('//tbody/tr').extract() 32 | for i, info in enumerate(infos): 33 | if i == 0: 34 | continue 35 | 36 | val = Selector(text = info) 37 | ip = val.xpath('//td[1]/text()').extract_first() 38 | port = val.xpath('//td[2]/text()').extract_first() 39 | country = val.xpath('//td[6]/text()').extract_first() 40 | anonymity = val.xpath('//td[3]/text()').extract_first() 41 | https = val.xpath('//td[4]/text()').extract_first() 42 | 43 | proxy = Proxy() 44 | proxy.set_value( 45 | ip = ip, 46 | port = port, 47 | country = country, 48 | anonymity = anonymity, 49 | source = self.name, 50 | ) 51 | 52 | self.add_proxy(proxy = proxy) 53 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/kuaidaili.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | from proxy import Proxy 6 | from .basespider import BaseSpider 7 | 8 | 9 | class KuaiDaiLiSpider(BaseSpider): 10 | name = 'kuaidaili' 11 | 12 | def __init__(self, *a, **kwargs): 13 | super(KuaiDaiLiSpider, self).__init__(*a, **kwargs) 14 | 15 | self.urls = ['https://www.kuaidaili.com/free/inha/%s/' % i for i in range(1, 5)] 16 | 17 | self.headers = { 18 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 19 | 'Accept-Encoding':'gzip, deflate', 20 | 'Accept-Language':'zh-CN,zh;q=0.9', 21 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36', 22 | } 23 | 24 | self.is_record_web_page = False 25 | self.init() 26 | 27 | def parse_page(self, response): 28 | pattern = re.compile( 29 | '\s.*?(.*?)\s.*?(.*?)\s.*?(.*?)\s.*?(' 30 | '.*?)\s.*?(.*?)\s.*?(.*?)\s.*?(.*?)\s.*?', 31 | re.S) 32 | items = re.findall(pattern, response.body.decode()) 33 | 34 | for item in items: 35 | proxy = Proxy() 36 | proxy.set_value( 37 | ip = item[0], 38 | port = item[1], 39 | country = item[4], 40 | anonymity = item[2], 41 | source = self.name, 42 | ) 43 | 44 | self.add_proxy(proxy) 45 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/proxydb.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from proxy import Proxy 4 | from .basespider import BaseSpider 5 | from scrapy.selector import Selector 6 | import re 7 | from base64 import b64decode 8 | 9 | class ProxyDBSpider(BaseSpider): 10 | name = 'proxydb' 11 | 12 | def __init__(self, *a, **kwargs): 13 | super(ProxyDBSpider, self).__init__(*a, **kwargs) 14 | self.urls = ['http://proxydb.net/?protocol=http&protocol=https&offset=%s' % n for n in range(1, 500, 50)] 15 | self.headers = { 16 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 17 | 'Accept-Encoding':'gzip, deflate', 18 | 'Accept-Language':'zh-CN,zh;q=0.9', 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 
Safari/537.36', 20 | } 21 | 22 | self.is_record_web_page = False 23 | self.init() 24 | 25 | def parse_page(self, response): 26 | super(ProxyDBSpider, self).parse_page(response) 27 | for table_item in response.xpath('//tbody/tr'): 28 | ip,port = self.parse_ip(table_item.xpath('.//td[1]/script/text()').extract_first()) 29 | country = table_item.xpath('.//td/img/@title').extract_first().strip() 30 | anonymity = table_item.xpath('.//td/span/text()').extract_first().strip() 31 | proxy = Proxy() 32 | proxy.set_value( 33 | ip = ip, 34 | port = port, 35 | country = country, 36 | anonymity = anonymity, 37 | source = self.name 38 | ) 39 | self.add_proxy(proxy = proxy) 40 | 41 | def parse_ip(self, page): 42 | ip_part1 = re.search(r'\'(.*)\'\.split',page).group(1)[::-1] 43 | ip_part2= ''.join([chr(int(x,16)) for x in re.findall(r'\\x([0-9A-Fa-f]{2})', page)]) 44 | ip_part2= b64decode(ip_part2).decode('utf-8') 45 | port = re.search(r'pp = -(\d+) \+ (\d+);',page).groups() 46 | port = -int(port[0]) + int(port[1]) 47 | return [''.join([ip_part1,ip_part2]),port] 48 | 49 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/proxylistplus.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from scrapy import Selector 4 | from .basespider import BaseSpider 5 | from proxy import Proxy 6 | 7 | 8 | class ProxylistplusSpider(BaseSpider): 9 | name = 'proxylistplus' 10 | 11 | def __init__(self, *a, **kw): 12 | super(ProxylistplusSpider, self).__init__(*a, **kw) 13 | 14 | self.urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-%s' % n for n in range(1, 3)] 15 | self.headers = { 16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 17 | 'Accept-Encoding': 'gzip, deflate, br', 18 | 'Accept-Language': 'en-US,en;q=0.5', 19 | 'Cache-Control': 'max-age=0', 20 | 'Connection': 'keep-alive', 21 | 'Host': 'list.proxylistplus.com', 22 | 'If-Modified-Since': 'Mon, 20 Feb 2017 07:47:35 GMT', 23 | 'If-None-Match': 'list381487576865', 24 | 'Referer': 'https://list.proxylistplus.com/Fresh-HTTP-Proxy', 25 | 'Upgrade-Insecure-Requests': '1', 26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0', 27 | } 28 | 29 | self.is_record_web_page = False 30 | self.init() 31 | 32 | def parse_page(self, response): 33 | self.write(response.body) 34 | 35 | sel = Selector(response) 36 | infos = sel.xpath('//tr[@class="cells"]').extract() 37 | for i, info in enumerate(infos): 38 | self.log(info) 39 | val = Selector(text = info) 40 | 41 | ip = val.xpath('//td[2]/text()').extract_first() 42 | port = val.xpath('//td[3]/text()').extract_first() 43 | country = val.xpath('//td[5]/text()').extract_first() 44 | anonymity = val.xpath('//td[4]/text()').extract_first() 45 | 46 | proxy = Proxy() 47 | proxy.set_value( 48 | ip = ip, 49 | port = port, 50 | country = country, 51 | anonymity = anonymity, 52 | source = self.name, 53 | ) 54 | 55 | self.add_proxy(proxy = proxy) 56 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/sixsixip.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import re 4 | 5 | from proxy import Proxy 6 | from .basespider import BaseSpider 7 | 8 | 9 | class SixSixIpSpider(BaseSpider): 10 | name = 'sixsixip' 11 | 12 | def __init__(self, *a, **kwargs): 13 | super(SixSixIpSpider, self).__init__(*a, **kwargs) 14 | 15 | 
self.urls = ['http://m.66ip.cn/%s.html' % n for n in range(1, 10)] 16 | self.headers = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate', 19 | 'Accept-Language': 'en-US,en;q=0.5', 20 | 'Cache-Control': 'max-age=0', 21 | 'Connection': 'keep-alive', 22 | 'Host': 'm.66ip.cn', 23 | 'Upgrade-Insecure-Requests': '1', 24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0', 25 | } 26 | 27 | self.init() 28 | 29 | def parse_page(self, response): 30 | pattern = re.compile('(.*?)(.*?)(.*?)(.*?)(.*?)', 31 | re.S) 32 | items = re.findall(pattern, response.body.decode()) 33 | for i, item in enumerate(items): 34 | if i >= 1: 35 | proxy = Proxy() 36 | proxy.set_value( 37 | ip = item[0], 38 | port = item[1], 39 | country = item[2], 40 | anonymity = item[3], 41 | source = self.name 42 | ) 43 | 44 | self.add_proxy(proxy = proxy) 45 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/usproxy.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import re 4 | 5 | from proxy import Proxy 6 | from .basespider import BaseSpider 7 | 8 | 9 | class UsProxySpider(BaseSpider): 10 | name = 'usproxy' 11 | 12 | def __init__(self, *a, **kwargs): 13 | super(UsProxySpider, self).__init__(*a, **kwargs) 14 | 15 | self.urls = [ 16 | 'http://www.sslproxies.org/', 17 | 'http://www.us-proxy.org/', 18 | 'http://free-proxy-list.net/uk-proxy.html', 19 | 'http://www.socks-proxy.net/', 20 | ] 21 | self.headers = { 22 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 23 | 'Accept-Encoding': 'gzip, deflate', 24 | 'Accept-Language': 'en-US,en;q=0.5', 25 | 'Cache-Control': 'max-age=0', 26 | 'Connection': 'keep-alive', 27 | 'Host': 'www.us-proxy.org', 28 | 'If-Modified-Since': 'Tue, 24 Jan 2017 03:32:01 GMT', 29 | 'Referer': 'http://www.sslproxies.org/', 30 | 'Upgrade-Insecure-Requests': '1', 31 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0', 32 | } 33 | 34 | self.init() 35 | 36 | def parse_page(self, response): 37 | pattern = re.compile( 38 | '(.*?)(.*?)(.*?)(.*?)(.*?)(.*?)(.*?)(.*?)', 39 | re.S) 40 | items = re.findall(pattern, response.body.decode()) 41 | 42 | if items is not None: 43 | for item in items: 44 | proxy = Proxy() 45 | proxy.set_value( 46 | ip = item[0], 47 | port = item[1], 48 | country = item[3], 49 | anonymity = item[4], 50 | source = self.name, 51 | ) 52 | 53 | self.add_proxy(proxy) 54 | -------------------------------------------------------------------------------- /crawler/spiders/proxy/xicidaili.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from proxy import Proxy 4 | from .basespider import BaseSpider 5 | from scrapy.selector import Selector 6 | 7 | 8 | class XiCiDaiLiSpider(BaseSpider): 9 | name = 'xici' 10 | 11 | def __init__(self, *a, **kw): 12 | super(XiCiDaiLiSpider, self).__init__(*a, **kw) 13 | 14 | self.urls = ['http://www.xicidaili.com/nn/%s' % n for n in range(1, 10)] 15 | self.headers = { 16 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 17 | 'Accept-Encoding': 'gzip, deflate', 18 | 'Accept-Language': 'en-US,en;q=0.5', 19 | 'Cache-Control': 'max-age=0', 20 | 'Connection': 'keep-alive', 21 | 'Host': 'www.xicidaili.com', 22 | 'If-None-Match': 'W/"cb655e834a031d9237e3c33f3499bd34"', 23 | 
'Upgrade-Insecure-Requests': '1', 24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0', 25 | } 26 | 27 | self.init() 28 | 29 | def parse_page(self, response): 30 | sel = Selector(text = response.body) 31 | infos = sel.xpath('//tr[@class="odd"]').extract() 32 | for info in infos: 33 | val = Selector(text = info) 34 | ip = val.xpath('//td[2]/text()').extract_first() 35 | port = val.xpath('//td[3]/text()').extract_first() 36 | country = val.xpath('//td[4]/a/text()').extract_first() 37 | anonymity = val.xpath('//td[5]/text()').extract_first() 38 | 39 | proxy = Proxy() 40 | proxy.set_value( 41 | ip = ip, 42 | port = port, 43 | country = country, 44 | anonymity = anonymity, 45 | source = self.name, 46 | ) 47 | 48 | self.add_proxy(proxy = proxy) 49 | -------------------------------------------------------------------------------- /crawler/spiders/validator/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /crawler/spiders/validator/amazoncn.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class AmazonCnSpider(Validator): 7 | name = 'amazoncn' 8 | 9 | def __init__(self, name = None, **kwargs): 10 | super(AmazonCnSpider, self).__init__(name, **kwargs) 11 | 12 | self.timeout = 5 13 | 14 | self.urls = [ 15 | 'https://www.amazon.cn/dp/B00ID363S4', 16 | 'https://www.amazon.cn/gp/product/B01BDBJ71W', 17 | 'https://www.amazon.cn/gp/product/B06XBHPZNC', 18 | ] 19 | 20 | self.headers = { 21 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 22 | 'Accept-Encoding': 'gzip, deflate, br', 23 | 'Accept-Language': 'en-US,en;q=0.5', 24 | 'Connection': 'keep-alive', 25 | 'Host': 'www.amazon.cn', 26 | 'Upgrade-Insecure-Requests': '1', 27 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 28 | 'Firefox/50.0', 29 | } 30 | 31 | self.init() 32 | 33 | def success_content_parse(self, response): 34 | if 'Amazon CAPTCHA' in response.text: 35 | return False 36 | return True 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /crawler/spiders/validator/anjuke.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | 3 | import datetime 4 | import json 5 | import random 6 | import re 7 | import time 8 | import config 9 | 10 | from scrapy import Request 11 | from scrapy.selector import Selector 12 | from crawler.spiders.validator.validator import Validator 13 | 14 | 15 | class AJKSpider(Validator): 16 | name = 'ajk' 17 | 18 | concurrent_requests = 16 19 | 20 | def __init__(self, name = None, **kwargs): 21 | super(AJKSpider, self).__init__(name, **kwargs) 22 | 23 | self.region_urls = [ 24 | 'aolinpikegongyuan/', 25 | 'anzhen/', 26 | 'baiziwan/', 27 | 'beiyuan/', 28 | 'balizhuangb/', 29 | 'chaoyanggongyuandong/', 30 | 'chaowaidajie/', 31 | 'changying/', 32 | 'chaoyangjichang/', 33 | 'chaoqing/', 34 | 'chaoyanggongyuanxi/', 35 | 'dawanglu/', 36 | 'dongbaa/', 37 | 'dougezhuang/', 38 | 'dongdaqiao/', 39 | 'dingfuzhuang/', 40 | 'fatou/', 41 | 'gaobeidian/', 42 | 'guanzhuang/', 43 | 'guomao/', 
44 | 'hepinglibei/', 45 | 'huaweiqiaocy/', 46 | 'jinsongdong/', 47 | 'jianzhanxiang/', 48 | 'jianxiangqiao/', 49 | 'jianguomenwai/', 50 | 'jiuxianqiao/', 51 | 'jinsongxi/', 52 | 'laiguangying/', 53 | 'liufang/', 54 | 'nanshatan/', 55 | 'panjiayuan/', 56 | 'shilihe/', 57 | 'sanlitun/', 58 | 'sihui/', 59 | 'shuangqiaoc/', 60 | 'shifoying/', 61 | 'shibalidian/', 62 | 'shaoyaoju/', 63 | 'shuangjing/', 64 | 'sanyuanqiao/', 65 | 'taiyanggong/', 66 | 'tuanjiehu/', 67 | 'wangjingxi/', 68 | 'wangjingdong/', 69 | 'xiaohongmen/', 70 | 'yayuncun/', 71 | 'chaoyang/', 72 | 'haidian/', 73 | 'dongchenga/', 74 | 'xicheng/', 75 | 'fengtai/', 76 | 'tongzhou/', 77 | 'shijingshan/', 78 | 'changping/', 79 | 'daxing/', 80 | 'shunyi/', 81 | 'fangshan/', 82 | 'mentougou/', 83 | 'miyun/', 84 | 'huairou/', 85 | 'pinggua/', 86 | 'yanqing/', 87 | 'beijingzhoubiana/', 88 | 'baishiqiao/', 89 | 'chedaogou/', 90 | 'dinghuisi/', 91 | 'erlizhuang/', 92 | 'gongzhufenxi/', 93 | 'ganjiakou/', 94 | 'gongzhufendong/', 95 | 'haidianbeibu/', 96 | 'junbo/', 97 | 'madians/', 98 | 'malianwa/', 99 | 'mudanyuan/', 100 | 'qinghe/', 101 | 'shijicheng/', 102 | 'sijiqing/', 103 | 'suzhouqiao/', 104 | 'shangdi/', 105 | 'shuangyushu/', 106 | 'tiancun/', 107 | 'wenquand/', 108 | 'wanquanhe/', 109 | 'wanshoulu/', 110 | 'wanliu/', 111 | 'wudaokou/', 112 | 'weigongcun/', 113 | 'xiangshandong/', 114 | 'xibeiwang/', 115 | 'xierqi/', 116 | 'xiangshangxi/', 117 | 'xiaoxitian/', 118 | 'xisanqi/', 119 | 'xueyuanlu/', 120 | 'yuquanlu/', 121 | 'yiheyuan/', 122 | 'yuanmingyuan/', 123 | 'zaojunmiao/', 124 | 'zizhuqiao/', 125 | 'zhichunlu/', 126 | 'zhongguancun/', 127 | 'andingmen/', 128 | 'chongwenmens/', 129 | 'chaoyangmennei/', 130 | 'dongzhimenwai/', 131 | 'donghuashis/', 132 | 'dongdan/', 133 | 'dongsia/', 134 | 'dongzhimennei/', 135 | 'dengshikou/', 136 | 'guangqumen/', 137 | 'hepinglianan/', 138 | 'jiaodaokou/', 139 | 'jianguomennei/', 140 | 'longtanhus/', 141 | 'qianmens/', 142 | 'tiantans/', 143 | 'wangfujing/', 144 | 'yongdingmens/', 145 | 'yonghegong/', 146 | 'baizhifangs/', 147 | 'baiyunluxc/', 148 | 'changchunjiexc/', 149 | 'chegongzhuanga/', 150 | 'deshengmen/', 151 | 'fuchengmen/', 152 | 'guanganmenwai/', 153 | 'guanganmennei/', 154 | 'guanyuan/', 155 | 'hepingmen/', 156 | 'jinrongjie/', 157 | 'liupukang/', 158 | 'maliandaos/', 159 | 'shichahai/', 160 | 'tianningshi/', 161 | 'taorantings/', 162 | 'xuanwumens/', 163 | 'xizhimenwai/', 164 | 'xisi/', 165 | 'xizhimen/', 166 | 'xinjiekou/', 167 | 'xidan/', 168 | 'yuetan/', 169 | 'beidadi/', 170 | 'caoqiao/', 171 | 'chengshousi/', 172 | 'caihuying/', 173 | 'dahongmen/', 174 | 'fangzhuang/', 175 | 'heyi/', 176 | 'jiaomen/', 177 | 'kandanqiao/', 178 | 'kejiyuanquft/', 179 | 'lizeqiao/', 180 | 'liuliqiaoxi/', 181 | 'lugouqiao/', 182 | 'liujiayao/', 183 | 'liuliqiaodong/', 184 | 'majiabao/', 185 | 'muxiyuan/', 186 | 'puhuangyu/', 187 | 'qilizhuang/', 188 | 'qingta/', 189 | 'songjiazhuang/', 190 | 'xinfadi/', 191 | 'xiluoyuan/', 192 | 'youanmenwai/', 193 | 'yuquanying/', 194 | 'beiguan/', 195 | 'guoyuan/', 196 | 'jiukeshu/', 197 | 'luyuan/', 198 | 'liyuan/', 199 | 'majuqiao/', 200 | 'qiaozhuang/', 201 | 'tuqiao/', 202 | 'tongzhouquqita/', 203 | 'tongzhoubeiyuan/', 204 | 'wuyihuayuan/', 205 | 'xinhuadajie/', 206 | 'bajiao/', 207 | 'gucheng/', 208 | 'laoshan/', 209 | 'lugu/', 210 | 'pingguoyuan/', 211 | 'shijingshana/', 212 | 'yuquanluxi/', 213 | 'yangzhuang/', 214 | 'baishanzhen/', 215 | 'beiqijia/', 216 | 'changpingquqita/', 217 | 'changpingxiancheng/', 218 | 'dongxiaokouzhen/', 219 | 'huoying/', 220 | 
'huilongguan/', 221 | 'longze/', 222 | 'lishuiqiao/', 223 | 'nanshao/', 224 | 'nankou/', 225 | 'shahea/', 226 | 'tiantongyuan/', 227 | 'xingshouzhen/', 228 | 'xiaotangshanbei/', 229 | 'xiaotangshannan/', 230 | 'zhuxinzhuang/', 231 | 'daxingquqita/', 232 | 'guanyinsi/', 233 | 'gaomidian/', 234 | 'huangcun/', 235 | 'jiugong/', 236 | 'luchengxiang/', 237 | 'panggezhuang/', 238 | 'qingyundianzhen/', 239 | 'tiangongyuannan/', 240 | 'tiangongyuanbei/', 241 | 'xihongmen/', 242 | 'yinghaizhen/', 243 | 'yizhuang/', 244 | 'zaoyuans/', 245 | 'houshayu/', 246 | 'jichangfujin/', 247 | 'liqiao/', 248 | 'mapo/', 249 | 'shunyiquqita/', 250 | 'shunyicheng/', 251 | 'tianzhu/', 252 | 'yangzhen/', 253 | 'zhongyangbieshuqu/', 254 | 'changyang/', 255 | 'chengguanbj/', 256 | 'doudian/', 257 | 'fangshanquqita/', 258 | 'guandaozhen/', 259 | 'hancunhe/', 260 | 'liangxiang/', 261 | 'liulihe/', 262 | 'yancun/', 263 | 'yanshan/', 264 | 'binhexiqu/', 265 | 'chengzi/', 266 | 'dayu/', 267 | 'fengcun/', 268 | 'mentougouquqita/', 269 | 'shimenying/', 270 | 'yongdingzhen/', 271 | 'badaling/', 272 | 'dayushu/', 273 | 'kangzhuang/', 274 | 'yanqingquqita/', 275 | 'yanqingchengqu/', 276 | 'baodinga/', 277 | 'langfanga/', 278 | 'qinhuangdaoa/', 279 | 'tangshang/', 280 | 'weihaia/', 281 | 'yantaia/', 282 | 'yanjiao/', 283 | 'zhangjiakou/', 284 | ] 285 | 286 | self.price_urls = [ 287 | 'zj5332/', 288 | 'zj297/', 289 | 'zj298/', 290 | 'zj299/', 291 | 'zj300/', 292 | 'zj301/', 293 | 'zj33/', 294 | 'zj5333/', 295 | 'zj5334/', 296 | 'zj5335/', 297 | 'zj5336/', 298 | ] 299 | 300 | self.init() 301 | 302 | def start_requests(self): 303 | count = self.sql.get_proxy_count(self.name) 304 | count_free = self.sql.get_proxy_count(config.httpbin_table) 305 | 306 | ids = self.sql.get_proxy_ids(self.name) 307 | ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table) 308 | 309 | for i in range(0, count + count_free): 310 | table = self.name if (i < count) else config.httpbin_table 311 | id = ids[i] if i < count else ids_httpbin[i - len(ids)] 312 | 313 | proxy = self.sql.get_proxy_with_id(table, id) 314 | if proxy == None: 315 | continue 316 | 317 | full_url = 'https://bj.zu.anjuke.com/fangyuan/{region}p{page}-{price}'.format( 318 | region = random.choice(self.region_urls), price = random.choice(self.price_urls), page = 1) 319 | cur_time = time.time() 320 | yield Request( 321 | url = full_url, 322 | headers = { 323 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 324 | }, 325 | dont_filter = True, 326 | meta = { 327 | 'cur_time': cur_time, 328 | 'download_timeout': self.timeout, 329 | 'proxy_info': proxy, 330 | 'table': table, 331 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port), 332 | }, 333 | callback = self.success_parse, 334 | errback = self.error_parse, 335 | ) 336 | -------------------------------------------------------------------------------- /crawler/spiders/validator/assetstore.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import json 4 | import time 5 | import config 6 | 7 | from scrapy.http import Request 8 | from .validator import Validator 9 | 10 | 11 | class AssetStoreSpider(Validator): 12 | name = 'assetstore' 13 | 14 | def __init__(self, *a, **kwargs): 15 | super(AssetStoreSpider, self).__init__(*a, **kwargs) 16 | 17 | self.timeout = 10 18 | 19 | self.init() 20 | 21 | def start_requests(self): 22 | url = 'https://www.assetstore.unity3d.com/login' 23 | yield 
Request( 24 | url = url, 25 | headers = { 26 | 'Accept': 'application/json', 27 | 'Accept-Encoding': 'gzip, deflate, br', 28 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 29 | 'Connection': 'keep-alive', 30 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 31 | 'Host': 'www.assetstore.unity3d.com', 32 | 'Referer': 'https://www.assetstore.unity3d.com/en/', 33 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 34 | 'Firefox/50.0', 35 | 'X-Kharma-Version': '0', 36 | 'X-Requested-With': 'UnityAssetStore', 37 | 'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41', 38 | }, 39 | meta = { 40 | }, 41 | dont_filter = True, 42 | callback = self.get_unity_version, 43 | errback = self.error_parse, 44 | ) 45 | 46 | def get_unity_version(self, response): 47 | content = json.loads(response.body) 48 | self.log('unity content:%s' % response.body) 49 | 50 | unity_version = content.get('kharma_version', '') 51 | 52 | headers = { 53 | 'Accept': '*/*', 54 | 'Accept-Encoding': 'gzip, deflate, br', 55 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 56 | 'Connection': 'keep-alive', 57 | 'Host': 'www.assetstore.unity3d.com', 58 | 'Referer': 'https://www.assetstore.unity3d.com/en/', 59 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0', 60 | 'X-Kharma-Version': unity_version, 61 | 'X-Requested-With': 'UnityAssetStore', 62 | 'X-Unity-Session': '26c4202eb475d02864b40827dfff11a14657aa41', 63 | } 64 | 65 | count = self.sql.get_proxy_count(self.name) 66 | count_free = self.sql.get_proxy_count(config.httpbin_table) 67 | 68 | ids = self.sql.get_proxy_ids(self.name) 69 | ids_free = self.sql.get_proxy_ids(config.httpbin_table) 70 | 71 | for i in range(0, count + count_free): 72 | table = self.name if (i < count) else config.httpbin_table 73 | id = ids[i] if i < count else ids_free[i - len(ids)] 74 | 75 | proxy = self.sql.get_proxy_with_id(table, id) 76 | if proxy == None: 77 | continue 78 | 79 | url = 'https://www.assetstore.unity3d.com/api/en-US/content/overview/' + '368' + '.json' 80 | cur_time = time.time() 81 | yield Request( 82 | url = url, 83 | headers = headers, 84 | meta = { 85 | 'cur_time': cur_time, 86 | 'download_timeout': self.timeout, 87 | 'proxy_info': proxy, 88 | 'table': table, 89 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port), 90 | }, 91 | dont_filter = True, 92 | callback = self.success_parse, 93 | errback = self.error_parse, 94 | ) 95 | -------------------------------------------------------------------------------- /crawler/spiders/validator/baidu.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class BaiduSpider(Validator): 7 | name = 'baidu' 8 | 9 | def __init__(self, name = None, **kwargs): 10 | super(BaiduSpider, self).__init__(name, **kwargs) 11 | 12 | self.urls = [ 13 | 'https://www.baidu.com/' 14 | ] 15 | 16 | self.headers = { 17 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 18 | 'Accept-Encoding': 'gzip, deflate, br', 19 | 'Accept-Language': 'en-US,en;q=0.5', 20 | 'Cache-Control': 'max-age=0', 21 | 'Connection': 'keep-alive', 22 | 'Host': 'www.baidu.com', 23 | 'Upgrade-Insecure-Requests': '1', 24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 25 | 'Firefox/50.0', 26 | } 27 | 28 | self.init() 29 | 
-------------------------------------------------------------------------------- /crawler/spiders/validator/bbs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class BBSSpider(Validator): 7 | name = 'bbs' 8 | concurrent_requests = 8 9 | 10 | def __init__(self, name = None, **kwargs): 11 | super(BBSSpider, self).__init__(name, **kwargs) 12 | 13 | self.urls = [ 14 | 'http://www.autohome.com.cn/beijing/', 15 | 'http://club.autohome.com.cn/bbs/thread-c-2098-64053713-1.html', 16 | 'http://club.autohome.com.cn/bbs/thread-c-2098-61435076-1.html', 17 | 'http://club.autohome.com.cn/bbs/threadqa-c-4034-63834038-1.html', 18 | 'http://club.autohome.com.cn/bbs/threadqa-c-4034-63083758-1.html', 19 | 'http://club.autohome.com.cn/bbs/threadqa-c-4044-64310067-1.html', 20 | 'http://club.autohome.com.cn/bbs/threadqa-c-4044-64328047-1.html', 21 | 'http://club.autohome.com.cn/bbs/thread-c-4044-63233315-1.html', 22 | 'http://club.autohome.com.cn/bbs/threadqa-c-4044-62349867-1.html', 23 | 'http://club.autohome.com.cn/bbs/thread-c-4034-63846295-1.html', 24 | ] 25 | 26 | self.headers = { 27 | 'Host': 'club.autohome.com.cn', 28 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 29 | 'Firefox/50.0', 30 | } 31 | 32 | self.is_record_web_page = False 33 | self.init() 34 | 35 | def success_content_parse(self, response): 36 | if 'conmain' in response.text: 37 | return True 38 | return False 39 | -------------------------------------------------------------------------------- /crawler/spiders/validator/boss.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class BossSpider(Validator): 7 | name = 'boss' 8 | concurrent_requests = 8 9 | 10 | def __init__(self, name = None, **kwargs): 11 | super(BossSpider, self).__init__(name, **kwargs) 12 | 13 | self.urls = [ 14 | 'https://www.zhipin.com/c101010100/h_101010100/?query=java&page=1&ka=page-1' 15 | ] 16 | 17 | self.headers = { 18 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 19 | # 'Accept-Encoding': 'gzip, deflate, br', 20 | # 'Accept-Language': 'en-US,en;q=0.5', 21 | # 'Cache-Control': 'max-age=0', 22 | # 'Connection': 'keep-alive', 23 | # 'Upgrade-Insecure-Requests': '1', 24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 25 | 'Firefox/50.0', 26 | } 27 | 28 | self.is_record_web_page = False 29 | self.init() 30 | 31 | def success_content_parse(self, response): 32 | if '' in response.text: 33 | return True 34 | return False -------------------------------------------------------------------------------- /crawler/spiders/validator/douban.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class DoubanSpider(Validator): 7 | name = 'douban' 8 | 9 | def __init__(self, name = None, **kwargs): 10 | super(DoubanSpider, self).__init__(name, **kwargs) 11 | 12 | self.timeout = 5 13 | 14 | self.urls = [ 15 | 'https://movie.douban.com/subject/3434070/?from=subject-page' 16 | ] 17 | 18 | self.headers = { 19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 20 | 'Accept-Encoding': 'gzip, deflate, br', 21 | 'Accept-Language': 'en-US,en;q=0.5', 22 | 'Connection': 'keep-alive', 23 | 'Host': 'movie.douban.com', 24 
| 'Upgrade-Insecure-Requests': '1', 25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 26 | 'Firefox/50.0', 27 | } 28 | 29 | self.init() 30 | -------------------------------------------------------------------------------- /crawler/spiders/validator/gather.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class GatherSpider(Validator): 7 | name = 'gather' 8 | 9 | def __init__(self, name = None, **kwargs): 10 | super(GatherSpider, self).__init__(name, **kwargs) 11 | 12 | self.timeout = 10 13 | self.urls = [ 14 | 'http://gatherproxy.com/proxylist/anonymity/?t=Anonymous', 15 | 'http://gatherproxy.com/proxylist/country/?c=China' 16 | ] 17 | 18 | self.headers = { 19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 20 | 'Accept-Encoding': 'gzip, deflate', 21 | 'Accept-Language': 'en-US,en;q=0.5', 22 | 'Connection': 'keep-alive', 23 | 'Host': 'gatherproxy.com', 24 | 'Upgrade-Insecure-Requests': '1', 25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 26 | 'Firefox/50.0', 27 | } 28 | 29 | self.init() 30 | -------------------------------------------------------------------------------- /crawler/spiders/validator/httpbin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import time 5 | import requests 6 | import config 7 | 8 | from scrapy import Request 9 | from .validator import Validator 10 | 11 | 12 | class HttpBinSpider(Validator): 13 | name = 'httpbin' 14 | concurrent_requests = 16 15 | 16 | def __init__(self, name=None, **kwargs): 17 | super(HttpBinSpider, self).__init__(name, **kwargs) 18 | self.timeout = 20 19 | self.urls = [ 20 | 'http://httpbin.org/get?show_env=1', 21 | 'https://httpbin.org/get?show_env=1', 22 | ] 23 | self.headers = { 24 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 25 | "Accept-Encoding": "gzip, deflate, br", 26 | "Accept-Language": "en-US,en;q=0.5", 27 | "Host": "httpbin.org", 28 | "Upgrade-Insecure-Requests": "1", 29 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0" 30 | } 31 | 32 | self.origin_ip = '' 33 | 34 | self.init() 35 | 36 | def init(self): 37 | super(HttpBinSpider, self).init() 38 | 39 | r = requests.get(url=self.urls[0], timeout=20) 40 | data = json.loads(r.text) 41 | self.origin_ip = data.get('origin', '') 42 | self.log('origin ip:%s' % self.origin_ip) 43 | 44 | def start_requests(self): 45 | count = self.sql.get_proxy_count(self.name) 46 | count_free = self.sql.get_proxy_count(config.free_ipproxy_table) 47 | 48 | ids = self.sql.get_proxy_ids(self.name) 49 | ids_free = self.sql.get_proxy_ids(config.free_ipproxy_table) 50 | 51 | for i in range(0, count + count_free): 52 | table = self.name if (i < count) else config.free_ipproxy_table 53 | id = ids[i] if i < count else ids_free[i - len(ids)] 54 | 55 | proxy = self.sql.get_proxy_with_id(table, id) 56 | if proxy == None: 57 | continue 58 | 59 | for url in self.urls: 60 | https = 'yes' if 'https' in url else 'no' 61 | 62 | yield Request( 63 | url=url, 64 | headers=self.headers, 65 | dont_filter=True, 66 | priority=0 if https == 'yes' else 10, 67 | meta={ 68 | 'cur_time': time.time(), 69 | 'download_timeout': self.timeout, 70 | 'proxy_info': proxy, 71 | 'table': table, 72 | 'https': https, 73 | 'proxy': 'http://%s:%s' % 
(proxy.ip, proxy.port), 74 | 'vali_count': proxy.vali_count, 75 | }, 76 | callback=self.success_parse, 77 | errback=self.error_parse, 78 | ) 79 | 80 | def success_parse(self, response): 81 | proxy = response.meta.get('proxy_info') 82 | table = response.meta.get('table') 83 | proxy.https = response.meta.get('https') 84 | 85 | self.save_page(proxy.ip, response.body) 86 | 87 | if self.success_content_parse(response): 88 | proxy.speed = time.time() - response.meta.get('cur_time') 89 | proxy.vali_count += 1 90 | self.log('proxy_info:%s' % (str(proxy))) 91 | 92 | if proxy.https == 'no': 93 | data = json.loads(response.body) 94 | origin = data.get('origin') 95 | headers = data.get('headers') 96 | x_forwarded_for = headers.get('X-Forwarded-For', None) 97 | x_real_ip = headers.get('X-Real-Ip', None) 98 | via = headers.get('Via', None) 99 | 100 | if self.origin_ip in origin: 101 | proxy.anonymity = 3 102 | elif via is not None: 103 | proxy.anonymity = 2 104 | elif x_forwarded_for is not None and x_real_ip is not None: 105 | proxy.anonymity = 1 106 | 107 | if table == self.name: 108 | if proxy.speed > self.timeout: 109 | self.sql.del_proxy_with_id(table_name=table, id=proxy.id) 110 | else: 111 | self.sql.update_proxy(table_name=table, proxy=proxy) 112 | else: 113 | if proxy.speed < self.timeout: 114 | self.sql.insert_proxy(table_name=self.name, proxy=proxy) 115 | else: 116 | self.sql.update_proxy(table_name=table, proxy=proxy) 117 | 118 | self.sql.commit() 119 | 120 | def error_parse(self, failure): 121 | request = failure.request 122 | self.log('error_parse value:%s url:%s meta:%s' % (failure.value, request.url, request.meta)) 123 | https = request.meta.get('https') 124 | if https == 'no': 125 | table = request.meta.get('table') 126 | proxy = request.meta.get('proxy_info') 127 | 128 | if table == self.name: 129 | self.sql.del_proxy_with_id(table_name=table, id=proxy.id) 130 | else: 131 | # TODO... 
如果 ip 验证失败应该针对特定的错误类型,进行处理 132 | pass 133 | -------------------------------------------------------------------------------- /crawler/spiders/validator/jd.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import random 4 | import time 5 | import re 6 | import config 7 | 8 | from scrapy import Request 9 | from .validator import Validator 10 | 11 | 12 | class JDSpider(Validator): 13 | name = 'jd' 14 | 15 | def __init__(self, name = None, **kwargs): 16 | super(JDSpider, self).__init__(name, **kwargs) 17 | 18 | self.urls = [ 19 | 'https://item.jd.com/11478178241.html', 20 | 'https://item.jd.com/4142680.html', 21 | 'https://item.jd.com/3133859.html', 22 | 'https://item.jd.com/11349957411.html', 23 | 'https://item.jd.com/1231104.html', 24 | 'https://item.jd.com/11290644320.html', 25 | 'https://item.jd.com/3553539.html', 26 | 'https://item.jd.com/3553567.html', 27 | 'https://item.jd.com/4640524.html', 28 | 'https://item.jd.com/3652063.html', 29 | 'https://item.jd.com/2967929.html', 30 | 'https://item.jd.com/3367822.html', 31 | 'https://item.jd.com/1217500.html', 32 | ] 33 | 34 | self.headers = { 35 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 36 | 'Accept-Encoding': 'gzip, deflate, br', 37 | 'Accept-Language': 'en-US,en;q=0.5', 38 | 'Connection': 'keep-alive', 39 | 'Host': 'item.jd.com', 40 | 'Upgrade-Insecure-Requests': '1', 41 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0', 42 | } 43 | 44 | self.is_record_web_page = False 45 | self.init() 46 | 47 | def success_content_parse(self, response): 48 | if 'comments' in response.text: 49 | return True 50 | return False 51 | 52 | def start_requests(self): 53 | count = self.sql.get_proxy_count(self.name) 54 | count_httpbin = self.sql.get_proxy_count(config.httpbin_table) 55 | 56 | ids = self.sql.get_proxy_ids(self.name) 57 | ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table) 58 | 59 | for i in range(0, count + count_httpbin): 60 | table = self.name if (i < count) else config.httpbin_table 61 | id = ids[i] if i < count else ids_httpbin[i - len(ids)] 62 | 63 | proxy = self.sql.get_proxy_with_id(table, id) 64 | if proxy == None: 65 | continue 66 | 67 | url = random.choice(self.urls) 68 | pattern = re.compile('\d+', re.S) 69 | product_id = re.search(pattern, url).group() 70 | 71 | cur_time = time.time() 72 | yield Request( 73 | url = url, 74 | headers = self.headers, 75 | meta = { 76 | 'cur_time': cur_time, 77 | 'download_timeout': self.timeout, 78 | 'proxy_info': proxy, 79 | 'table': table, 80 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port), 81 | 'product_id': product_id, 82 | }, 83 | dont_filter = True, 84 | callback = self.get_comment_count, 85 | errback = self.error_parse, 86 | ) 87 | 88 | def get_comment_count(self, response): 89 | name = response.xpath('//img[@id="spec-img"]/@alt').extract_first() 90 | self.log('name:%s time:%s' % (name, time.time() - response.meta.get('cur_time'))) 91 | 92 | pattern = re.compile('commentVersion:\'(\d+)\'', re.S) 93 | comment_version = re.search(pattern, response.text).group(1) 94 | 95 | # sort type 5:推荐排序 6:时间排序 96 | url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv' \ 97 | '{comment_version}&productId={product_id}&score=0&sortType={sort_type}&page=0&pageSize=10' \ 98 | '&isShadowSku=0'. 
\ 99 | format(product_id = response.meta.get('product_id'), comment_version = comment_version, sort_type = '6') 100 | 101 | cur_time = time.time() 102 | yield Request( 103 | url = url, 104 | headers = { 105 | 'Accept': '*/*', 106 | 'Accept-Encoding': 'gzip, deflate, br', 107 | 'Accept-Language': 'en-US,en;q=0.5', 108 | 'Connection': 'keep-alive', 109 | 'Host': 'club.jd.com', 110 | 'Referer': 'https://item.jd.com/%s.html' % response.meta.get('product_id'), 111 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 ' 112 | 'Firefox/52.0', 113 | }, 114 | method = 'GET', 115 | meta = { 116 | 'proxy': response.meta.get('proxy'), 117 | 'cur_time': cur_time, 118 | 'download_timeout': self.timeout, 119 | 'proxy_info': response.meta.get('proxy_info'), 120 | 'table': response.meta.get('table'), 121 | }, 122 | dont_filter = True, 123 | callback = self.success_parse, 124 | errback = self.error_parse 125 | ) 126 | -------------------------------------------------------------------------------- /crawler/spiders/validator/lagou.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import time 4 | import config 5 | import utils 6 | 7 | from .validator import Validator 8 | from scrapy.http import FormRequest 9 | 10 | 11 | class LagouSpider(Validator): 12 | name = 'lagou' 13 | concurrent_requests = 8 14 | 15 | def __init__(self, name = None, **kwargs): 16 | super(LagouSpider, self).__init__(name, **kwargs) 17 | 18 | self.urls = [ 19 | 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false' 20 | ] 21 | 22 | self.headers = { 23 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 24 | # 'Accept-Encoding': 'gzip, deflate, br', 25 | # 'Accept-Language': 'en-US,en;q=0.5', 26 | # 'Cache-Control': 'max-age=0', 27 | # 'Connection': 'keep-alive', 28 | # 'Upgrade-Insecure-Requests': '1', 29 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 30 | 'Firefox/50.0', 31 | } 32 | 33 | self.is_record_web_page = True 34 | self.init() 35 | 36 | def success_content_parse(self, response): 37 | if 'success' in response.text: 38 | return True 39 | return False 40 | 41 | def start_requests(self): 42 | count = self.sql.get_proxy_count(self.name) 43 | count_httpbin = self.sql.get_proxy_count(config.httpbin_table) 44 | 45 | ids = self.sql.get_proxy_ids(self.name) 46 | ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table) 47 | 48 | for i in range(0, count + count_httpbin): 49 | table = self.name if (i < count) else config.httpbin_table 50 | id = ids[i] if i < count else ids_httpbin[i - len(ids)] 51 | 52 | proxy = self.sql.get_proxy_with_id(table, id) 53 | if proxy == None: 54 | continue 55 | 56 | for url in self.urls: 57 | cur_time = time.time() 58 | yield FormRequest( 59 | url = url, 60 | headers = self.headers, 61 | method = 'POST', 62 | meta = { 63 | 'cur_time': cur_time, 64 | 'download_timeout': self.timeout, 65 | 'proxy_info': proxy, 66 | 'table': table, 67 | 'id': proxy.id, 68 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port), 69 | 'vali_count': proxy.vali_count, 70 | }, 71 | cookies = { 72 | 'Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6': '1488937030', 73 | '_ga': 'GA1.2.40497390.1488937014', 74 | 'TG-TRACK-CODE': 'search_code', 75 | 'index_location_city': '%E5%8C%97%E4%BA%AC', 76 | 'LGRID': '20170308093710-bf6755eb-039f-11e7-8025-525400f775ce', 77 | 'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6': 
'1488881288,1488936799,1488936947,1488937014', 78 | 'JSESSIONID': 'BDCBB6167F960CE43AF54B75A651F586', 79 | 'LGSID': '20170308093653-b59316f0-039f-11e7-9229-5254005c3644', 80 | 'LGUID': '20170308093653-b593185f-039f-11e7-9229-5254005c3644', 81 | 'user_trace_token': '20170308093654-723efcfac8fb4c28a670d073d5113e02', 82 | 'SEARCH_ID': '4db4dc3dea1c46b49018ae5421b53ffa' 83 | }, 84 | formdata = { 85 | 'first': 'true', 86 | 'kd': 'ios', 87 | 'pn': '1', 88 | }, 89 | dont_filter = True, 90 | callback = self.success_parse, 91 | errback = self.error_parse, 92 | ) 93 | -------------------------------------------------------------------------------- /crawler/spiders/validator/liepin.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class LiepinSpider(Validator): 7 | name = 'liepin' 8 | concurrent_requests = 8 9 | 10 | def __init__(self, name = None, **kwargs): 11 | super(LiepinSpider, self).__init__(name, **kwargs) 12 | 13 | self.urls = [ 14 | 'https://www.liepin.com/zhaopin/?pubTime=&ckid=17c370b0a0111aa5&fromSearchBtn=2&compkind' \ 15 | '=&isAnalysis=&init=-1&searchType=1&dqs=%s&industryType=&jobKind=&sortFlag=15&industries=&salary' 16 | '=&compscale=&clean_condition=&key=%s&headckid=49963e122c30b827&curPage=%s' % ('010', 'ios', '1') 17 | ] 18 | 19 | self.headers = { 20 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 21 | # 'Accept-Encoding': 'gzip, deflate, br', 22 | # 'Accept-Language': 'en-US,en;q=0.5', 23 | # 'Cache-Control': 'max-age=0', 24 | # 'Connection': 'keep-alive', 25 | # 'Upgrade-Insecure-Requests': '1', 26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 ' 27 | 'Firefox/50.0', 28 | } 29 | 30 | self.is_record_web_page = False 31 | self.init() 32 | 33 | def success_content_parse(self, response): 34 | if 'sojob-list' in response.text: 35 | return True 36 | return False 37 | 38 | -------------------------------------------------------------------------------- /crawler/spiders/validator/steam.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class SteamSpider(Validator): 7 | name = 'steam' 8 | 9 | def __init__(self, name = None, **kwargs): 10 | super(SteamSpider, self).__init__(name, **kwargs) 11 | 12 | self.timeout = 10 13 | 14 | self.urls = [ 15 | 'http://store.steampowered.com/app/602580/' 16 | ] 17 | 18 | self.headers = { 19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 20 | 'Accept-Encoding': 'gzip, deflate', 21 | 'Accept-Language': 'en-US,en;q=0.5', 22 | 'Connection': 'keep-alive', 23 | 'Host': 'store.steampowered.com', 24 | 'Upgrade-Insecure-Requests': '1', 25 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0', 26 | } 27 | 28 | self.is_record_web_page = False 29 | 30 | self.init() 31 | -------------------------------------------------------------------------------- /crawler/spiders/validator/validator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | import time 4 | import datetime 5 | import utils 6 | import config 7 | 8 | from scrapy import Request 9 | from scrapy.spiders import Spider 10 | from sql import SqlManager 11 | 12 | 13 | class Validator(Spider): 14 | name = 'base' 15 | concurrent_requests = 16 16 | 
retry_enabled = False 17 | 18 | def __init__(self, name = None, **kwargs): 19 | super(Validator, self).__init__(name, **kwargs) 20 | 21 | self.urls = [] 22 | self.headers = None 23 | self.timeout = 10 24 | self.success_status = [200] 25 | self.is_record_web_page = False 26 | 27 | self.sql = SqlManager() 28 | 29 | def init(self): 30 | self.dir_log = 'log/validator/%s' % self.name 31 | utils.make_dir(self.dir_log) 32 | 33 | self.sql.init_proxy_table(self.name) 34 | 35 | @classmethod 36 | def update_settings(cls, settings): 37 | settings.setdict(cls.custom_settings or { 38 | 'CONCURRENT_REQUESTS': cls.concurrent_requests, 39 | 'RETRY_ENABLED': cls.retry_enabled, 40 | }, 41 | priority = 'spider') 42 | 43 | def start_requests(self): 44 | count = self.sql.get_proxy_count(self.name) 45 | count_free = self.sql.get_proxy_count(config.httpbin_table) 46 | 47 | ids = self.sql.get_proxy_ids(self.name) 48 | ids_httpbin = self.sql.get_proxy_ids(config.httpbin_table) 49 | 50 | for i in range(0, count + count_free): 51 | table = self.name if (i < count) else config.httpbin_table 52 | id = ids[i] if i < count else ids_httpbin[i - len(ids)] 53 | 54 | proxy = self.sql.get_proxy_with_id(table, id) 55 | if proxy == None: 56 | continue 57 | 58 | url = random.choice(self.urls) 59 | cur_time = time.time() 60 | yield Request( 61 | url = url, 62 | headers = self.headers, 63 | meta = { 64 | 'cur_time': cur_time, 65 | 'download_timeout': self.timeout, 66 | 'proxy_info': proxy, 67 | 'table': table, 68 | 'proxy': 'http://%s:%s' % (proxy.ip, proxy.port), 69 | }, 70 | dont_filter = True, 71 | callback = self.success_parse, 72 | errback = self.error_parse, 73 | ) 74 | 75 | def success_parse(self, response): 76 | proxy = response.meta.get('proxy_info') 77 | table = response.meta.get('table') 78 | 79 | self.save_page(proxy.ip, response.body) 80 | self.log('success_parse speed:%s meta:%s' % (time.time() - response.meta.get('cur_time'), response.meta)) 81 | 82 | proxy.vali_count += 1 83 | proxy.speed = time.time() - response.meta.get('cur_time') 84 | if self.success_content_parse(response): 85 | if table == self.name: 86 | if proxy.speed > self.timeout: 87 | self.sql.del_proxy_with_id(table, proxy.id) 88 | else: 89 | self.sql.update_proxy(table, proxy) 90 | else: 91 | if proxy.speed < self.timeout: 92 | self.sql.insert_proxy(table_name = self.name, proxy = proxy) 93 | else: 94 | if table == self.name: 95 | self.sql.del_proxy_with_id(table_name = table, id = proxy.id) 96 | 97 | self.sql.commit() 98 | 99 | def success_content_parse(self, response): 100 | if response.status not in self.success_status: 101 | return False 102 | return True 103 | 104 | def error_parse(self, failure): 105 | request = failure.request 106 | self.log('error_parse value:%s url:%s meta:%s' % (failure.value, request.url, request.meta)) 107 | 108 | proxy = failure.request.meta.get('proxy_info') 109 | table = failure.request.meta.get('table') 110 | 111 | if table == self.name: 112 | self.sql.del_proxy_with_id(table_name = table, id = proxy.id) 113 | else: 114 | # TODO... 
如果 ip 验证失败应该针对特定的错误类型,进行处理 115 | pass 116 | 117 | # 118 | # request = failure.request.meta 119 | # utils.log('request meta:%s' % str(request)) 120 | # 121 | # # log all errback failures, 122 | # # in case you want to do something special for some errors, 123 | # # you may need the failure's type 124 | # self.logger.error(repr(failure)) 125 | # 126 | # #if isinstance(failure.value, HttpError): 127 | # if failure.check(HttpError): 128 | # # you can get the response 129 | # response = failure.value.response 130 | # self.logger.error('HttpError on %s', response.url) 131 | # 132 | # #elif isinstance(failure.value, DNSLookupError): 133 | # elif failure.check(DNSLookupError): 134 | # # this is the original request 135 | # request = failure.request 136 | # self.logger.error('DNSLookupError on %s', request.url) 137 | # 138 | # #elif isinstance(failure.value, TimeoutError): 139 | # elif failure.check(TimeoutError): 140 | # request = failure.request 141 | # self.logger.error('TimeoutError on url:%s', request.url) 142 | 143 | def save_page(self, ip, data): 144 | filename = '{time} {ip}'.format(time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S:%f'), ip = ip) 145 | 146 | if self.is_record_web_page: 147 | with open('%s/%s.html' % (self.dir_log, filename), 'wb') as f: 148 | f.write(data) 149 | f.close() 150 | 151 | def close(spider, reason): 152 | spider.sql.commit() 153 | -------------------------------------------------------------------------------- /crawler/spiders/validator/zhilian.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | from .validator import Validator 4 | 5 | 6 | class ZhiLianSpider(Validator): 7 | name = 'zhilian' 8 | concurrent_requests = 8 9 | 10 | def __init__(self, name = None, **kwargs): 11 | super(ZhiLianSpider, self).__init__(name, **kwargs) 12 | 13 | self.urls = [ 14 | 'http://www.zhaopin.com/' 15 | ] 16 | 17 | self.headers = { 18 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 19 | # 'Accept-Encoding': 'gzip, deflate, br', 20 | # 'Accept-Language': 'en-US,en;q=0.5', 21 | # 'Cache-Control': 'max-age=0', 22 | # 'Connection': 'keep-alive', 23 | # 'Upgrade-Insecure-Requests': '1', 24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', 25 | } 26 | 27 | self.is_record_web_page = False 28 | self.init() 29 | 30 | def success_content_parse(self, response): 31 | if '' in response.text: 32 | return True 33 | return False 34 | -------------------------------------------------------------------------------- /db.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 5.5.58, for Linux (x86_64) 2 | -- 3 | -- Host: localhost Database: ipproxy 4 | -- ------------------------------------------------------ 5 | -- Server version 5.5.58 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET 
@OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Table structure for table `free_ipproxy` 20 | -- 21 | 22 | DROP TABLE IF EXISTS `free_ipproxy`; 23 | /*!40101 SET @saved_cs_client = @@character_set_client */; 24 | /*!40101 SET character_set_client = utf8 */; 25 | CREATE TABLE `free_ipproxy` ( 26 | `id` int(8) NOT NULL AUTO_INCREMENT, 27 | `ip` char(25) NOT NULL, 28 | `port` int(4) NOT NULL, 29 | `country` text, 30 | `anonymity` int(2) DEFAULT NULL, 31 | `https` char(4) DEFAULT NULL, 32 | `speed` float DEFAULT NULL, 33 | `source` char(20) DEFAULT NULL, 34 | `save_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 35 | `vali_count` int(5) DEFAULT '0', 36 | PRIMARY KEY (`id`), 37 | UNIQUE KEY `proxy_field` (`ip`,`port`) 38 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 39 | /*!40101 SET character_set_client = @saved_cs_client */; 40 | 41 | -- 42 | -- Dumping data for table `free_ipproxy` 43 | -- 44 | 45 | LOCK TABLES `free_ipproxy` WRITE; 46 | /*!40000 ALTER TABLE `free_ipproxy` DISABLE KEYS */; 47 | /*!40000 ALTER TABLE `free_ipproxy` ENABLE KEYS */; 48 | UNLOCK TABLES; 49 | 50 | -- 51 | -- Table structure for table `httpbin` 52 | -- 53 | 54 | DROP TABLE IF EXISTS `httpbin`; 55 | /*!40101 SET @saved_cs_client = @@character_set_client */; 56 | /*!40101 SET character_set_client = utf8 */; 57 | CREATE TABLE `httpbin` ( 58 | `id` int(8) NOT NULL AUTO_INCREMENT, 59 | `ip` char(25) NOT NULL, 60 | `port` int(4) NOT NULL, 61 | `country` text, 62 | `anonymity` int(2) DEFAULT NULL, 63 | `https` char(4) DEFAULT NULL, 64 | `speed` float DEFAULT NULL, 65 | `source` char(20) DEFAULT NULL, 66 | `save_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 67 | `vali_count` int(5) DEFAULT '0', 68 | PRIMARY KEY (`id`), 69 | UNIQUE KEY `proxy_field` (`ip`,`port`) 70 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 71 | /*!40101 SET character_set_client = @saved_cs_client */; 72 | 73 | -- 74 | -- Dumping data for table `httpbin` 75 | -- 76 | 77 | LOCK TABLES `httpbin` WRITE; 78 | /*!40000 ALTER TABLE `httpbin` DISABLE KEYS */; 79 | /*!40000 ALTER TABLE `httpbin` ENABLE KEYS */; 80 | UNLOCK TABLES; 81 | 82 | -- 83 | -- Dumping routines for database 'ipproxy' 84 | -- 85 | /*!50003 DROP PROCEDURE IF EXISTS `drop_iptables` */; 86 | /*!50003 SET @saved_cs_client = @@character_set_client */ ; 87 | /*!50003 SET @saved_cs_results = @@character_set_results */ ; 88 | /*!50003 SET @saved_col_connection = @@collation_connection */ ; 89 | /*!50003 SET character_set_client = utf8 */ ; 90 | /*!50003 SET character_set_results = utf8 */ ; 91 | /*!50003 SET collation_connection = utf8_general_ci */ ; 92 | /*!50003 SET @saved_sql_mode = @@sql_mode */ ; 93 | /*!50003 SET sql_mode = '' */ ; 94 | DELIMITER ;; 95 | CREATE DEFINER=`root`@`localhost` PROCEDURE `drop_iptables`() 96 | BEGIN 97 | DELETE FROM ipproxy.free_ipproxy; 98 | DELETE FROM ipproxy.httpbin; 99 | TRUNCATE TABLE ipproxy.free_ipproxy; 100 | TRUNCATE TABLE ipproxy.httpbin; 101 | END ;; 102 | DELIMITER ; 103 | /*!50003 SET sql_mode = @saved_sql_mode */ ; 104 | /*!50003 SET character_set_client = @saved_cs_client */ ; 105 | /*!50003 SET character_set_results = @saved_cs_results */ ; 106 | /*!50003 SET collation_connection = @saved_col_connection */ ; 107 | /*!50003 DROP PROCEDURE IF EXISTS `ip_transfer` */; 108 | /*!50003 SET @saved_cs_client = @@character_set_client */ ; 109 | /*!50003 SET @saved_cs_results = @@character_set_results */ ; 110 | /*!50003 SET @saved_col_connection = @@collation_connection */ ; 111 | 
/*!50003 SET character_set_client = utf8 */ ; 112 | /*!50003 SET character_set_results = utf8 */ ; 113 | /*!50003 SET collation_connection = utf8_general_ci */ ; 114 | /*!50003 SET @saved_sql_mode = @@sql_mode */ ; 115 | /*!50003 SET sql_mode = '' */ ; 116 | DELIMITER ;; 117 | CREATE DEFINER=`root`@`localhost` PROCEDURE `ip_transfer`(IN valid_id INT) 118 | BEGIN DECLARE cur_ip char(25); DECLARE cur_port int(4); SELECT ip,port INTO cur_ip,cur_port FROM free_ipproxy WHERE id = valid_id; DELETE FROM httpbin WHERE ip =cur_ip AND port = cur_port; INSERT INTO httpbin(ip,port,country,anonymity,https,speed,source) SELECT ip,port,country,anonymity,https,speed,source FROM free_ipproxy WHERE id = valid_id; DELETE FROM free_ipproxy where id = valid_id; END ;; 119 | DELIMITER ; 120 | /*!50003 SET sql_mode = @saved_sql_mode */ ; 121 | /*!50003 SET character_set_client = @saved_cs_client */ ; 122 | /*!50003 SET character_set_results = @saved_cs_results */ ; 123 | /*!50003 SET collation_connection = @saved_col_connection */ ; 124 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 125 | 126 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 127 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 128 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; 129 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 130 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 131 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 132 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 133 | 134 | -- Dump completed on 2018-01-25 4:01:20 135 | -------------------------------------------------------------------------------- /ipproxytool.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import logging 4 | import os 5 | import sys 6 | import subprocess 7 | import run_validator 8 | import run_validator_async 9 | 10 | if __name__ == '__main__': 11 | 12 | # 进入当前项目目录 13 | os.chdir(sys.path[0]) 14 | 15 | if not os.path.exists('log'): 16 | os.makedirs('log') 17 | 18 | logging.basicConfig( 19 | filename = 'log/ipproxy.log', 20 | format = '%(asctime)s: %(message)s', 21 | level = logging.DEBUG 22 | ) 23 | 24 | subprocess.Popen(['python', 'run_crawl_proxy.py']) 25 | subprocess.Popen(['python', 'run_server.py']) 26 | 27 | if 'async' in sys.argv: 28 | run_validator_async.async_validator() 29 | else: 30 | run_validator.validator() 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class Proxy(object): 5 | def __init__(self): 6 | self.id = 1 7 | self.ip = '' 8 | self.port = '' 9 | self.country = '' 10 | self.anonymity = '' 11 | self.https = '' 12 | self.speed = '' 13 | self.source = '' 14 | self.vali_count = 0 15 | 16 | def __str__(self): 17 | data = { 18 | 'ip': self.ip, 19 | 'port': self.port, 20 | 'country': self.country, 21 | 'anonymity': self.anonymity, 22 | 'https': self.https, 23 | 'speed': self.speed, 24 | 'source': self.source, 25 | 'vali_count': self.vali_count, 26 | } 27 | 28 | return str(data) 29 | 30 | def __dict__(self): 31 | data = { 32 | 'ip': self.ip, 33 | 'port': self.port, 34 | 'country': self.country, 35 | 'anonymity': self.anonymity, 36 | 'https': self.https, 37 | 'speed': self.speed, 38 | 'source': self.source, 39 | 'vali_count': self.vali_count, 40 | } 41 | 42 | return data 43 | 44 | def get_dict(self): 45 | data = { 46 | 'ip': self.ip, 47 | 
'port': self.port, 48 | 'country': self.country, 49 | 'anonymity': self.anonymity, 50 | 'https': self.https, 51 | 'speed': self.speed, 52 | 'source': self.source, 53 | 'vali_count': self.vali_count, 54 | } 55 | 56 | return data 57 | 58 | def set_value(self, ip, port, country, anonymity, source='unkonw', https='no', speed=-1, vali_count=0): 59 | self.ip = ip 60 | self.port = port 61 | self.country = country 62 | self.anonymity = self.get_anonymity_type(anonymity) 63 | self.https = https 64 | self.speed = speed 65 | self.source = source 66 | self.vali_count = vali_count 67 | 68 | def get_anonymity_type(self, anonymity): 69 | '''There are 3 levels of proxies according to their anonymity. 70 | 71 | Level 1 - Elite Proxy / Highly Anonymous Proxy: The web server can't detect whether you are using a proxy. 72 | Level 2 - Anonymous Proxy: The web server can know you are using a proxy, but it can't know your real IP. 73 | Level 3 - Transparent Proxy: The web server can know you are using a proxy and it can also know your real 74 | IP. 75 | ''' 76 | 77 | if anonymity == u'高匿代理' or anonymity == u'高匿名' or anonymity == 'elite proxy' or \ 78 | anonymity == u'超级匿名' or anonymity == u'High': 79 | return '1' 80 | elif anonymity == u'匿名' or anonymity == 'anonymous' or anonymity == u'普通匿名' or anonymity == u'Medium': 81 | return '2' 82 | elif anonymity == u'透明' or anonymity == 'transparent' or anonymity == u'No': 83 | return '3' 84 | else: 85 | return '3' 86 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.7.4 2 | async-timeout==3.0.1 3 | attrs==20.3.0 4 | Automat==20.2.0 5 | beautifulsoup4==4.9.3 6 | bs4==0.0.1 7 | certifi==2020.12.5 8 | cffi==1.14.4 9 | chardet==3.0.4 10 | click==7.1.2 11 | constantly==15.1.0 12 | crochet==1.12.0 13 | cryptography==3.3.1 14 | cssselect==1.1.0 15 | Flask==1.1.2 16 | hyperlink==20.0.1 17 | idna==2.10 18 | incremental==17.5.0 19 | itemadapter==0.2.0 20 | itemloaders==1.0.4 21 | itsdangerous==1.1.0 22 | Jinja2==2.11.2 23 | jmespath==0.10.0 24 | logzero==1.6.3 25 | lxml==4.6.2 26 | MarkupSafe==1.1.1 27 | multidict==5.1.0 28 | parsel==1.6.0 29 | Protego==0.1.16 30 | pyasn1==0.4.8 31 | pyasn1-modules==0.2.8 32 | pycparser==2.20 33 | PyDispatcher==2.0.5 34 | Pygments==2.7.3 35 | PyHamcrest==2.0.2 36 | pymongo==3.11.2 37 | PyMySQL==0.10.1 38 | pyOpenSSL==20.0.1 39 | queuelib==1.5.0 40 | requests==2.25.1 41 | Scrapy==2.4.1 42 | scrapydo==0.2.2 43 | service-identity==18.1.0 44 | six==1.15.0 45 | soupsieve==2.1 46 | Twisted==20.3.0 47 | typing-extensions==3.7.4.3 48 | urllib3==1.26.2 49 | w3lib==1.22.0 50 | Werkzeug==1.0.1 51 | wrapt==1.12.1 52 | yarl==1.6.3 53 | zope.interface==5.2.0 54 | -------------------------------------------------------------------------------- /run_crawl_proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import os 5 | import sys 6 | import scrapydo 7 | import time 8 | import utils 9 | import config 10 | 11 | from sql import SqlManager 12 | from crawler.spiders.proxy.xicidaili import XiCiDaiLiSpider 13 | from crawler.spiders.proxy.sixsixip import SixSixIpSpider 14 | from crawler.spiders.proxy.ip181 import IpOneEightOneSpider 15 | from crawler.spiders.proxy.kuaidaili import KuaiDaiLiSpider 16 | from crawler.spiders.proxy.gatherproxy import GatherproxySpider 17 | from crawler.spiders.proxy.hidemy import HidemySpider 18 | 
from crawler.spiders.proxy.proxylistplus import ProxylistplusSpider 19 | from crawler.spiders.proxy.freeproxylists import FreeProxyListsSpider 20 | from crawler.spiders.proxy.usproxy import UsProxySpider 21 | from crawler.spiders.proxy.proxydb import ProxyDBSpider 22 | from crawler.spiders.proxy.data5u import Data5uSpider 23 | 24 | 25 | scrapydo.setup() 26 | 27 | if __name__ == '__main__': 28 | os.chdir(sys.path[0]) 29 | 30 | if not os.path.exists('log'): 31 | os.makedirs('log') 32 | 33 | logging.basicConfig( 34 | filename = 'log/crawl_proxy.log', 35 | format = '%(levelname)s %(asctime)s: %(message)s', 36 | level = logging.DEBUG 37 | ) 38 | sql = SqlManager() 39 | 40 | spiders = [ 41 | # XiCiDaiLiSpider, # 已失效 42 | SixSixIpSpider, 43 | IpOneEightOneSpider, 44 | KuaiDaiLiSpider, # 在访问前加了一个 js ,反爬 45 | GatherproxySpider, 46 | # HidemySpider, 已失效 47 | ProxylistplusSpider, 48 | FreeProxyListsSpider, 49 | # PeulandSpider, # 目标站点失效 50 | UsProxySpider, 51 | ProxyDBSpider, 52 | Data5uSpider, 53 | ] 54 | while True: 55 | utils.log('*******************run spider start...*******************') 56 | #sql.delete_old(config.free_ipproxy_table, 0.5) 57 | try: 58 | for spider in spiders: 59 | scrapydo.run_spider(spider_cls = spider) 60 | except Exception as e: 61 | utils.log('[Error]# spider goes wroing.Return Message: {}'.format(str(e))) 62 | 63 | utils.log('*******************run spider waiting...*******************') 64 | time.sleep(1200) 65 | -------------------------------------------------------------------------------- /run_server.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import logging 5 | import config 6 | import utils 7 | 8 | from server import dataserver 9 | 10 | if __name__ == '__main__': 11 | if not os.path.exists('log'): 12 | os.makedirs('log') 13 | 14 | logging.basicConfig( 15 | filename='log/server.log', 16 | format='%(levelname)s %(asctime)s: %(message)s', 17 | level=logging.DEBUG 18 | ) 19 | 20 | utils.kill_ports([config.data_port]) 21 | 22 | dataserver.app.run( 23 | debug=False, 24 | host='127.0.0.1', 25 | port=config.data_port, 26 | ) 27 | -------------------------------------------------------------------------------- /run_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import logging 5 | import sys 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from scrapy.utils.log import configure_logging 9 | from scrapy.utils.project import get_project_settings 10 | 11 | 12 | def runspider(name): 13 | configure_logging(install_root_handler=False) 14 | logging.basicConfig( 15 | filename='log/%s.log' % name, 16 | format='%(levelname)s %(asctime)s: %(message)s', 17 | level=logging.DEBUG 18 | ) 19 | process = CrawlerProcess(get_project_settings()) 20 | try: 21 | logging.info('runspider start spider:%s' % name) 22 | process.crawl(name) 23 | process.start() 24 | except Exception as e: 25 | logging.exception('runspider spider:%s exception:%s' % (name, e)) 26 | 27 | logging.debug('finish this spider:%s\n\n' % name) 28 | 29 | 30 | if __name__ == '__main__': 31 | try: 32 | name = sys.argv[1] or 'base' 33 | runspider(name) 34 | except Exception as e: 35 | logging.exception('run_spider main exception msg:%s' % e) 36 | -------------------------------------------------------------------------------- /run_validator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 
-*- 2 | 3 | import logging 4 | import os 5 | import subprocess 6 | import sys 7 | import time 8 | import scrapydo 9 | import utils 10 | from importlib import import_module 11 | 12 | VALIDATORS = { 13 | 'HttpBinSpider': 'crawler.spiders.validator.httpbin', 14 | # 'DoubanSpider':'ipproxytool.spiders.validator.douban', 15 | # 'AssetStoreSpider':'ipproxytool.spiders.validator.assetstore', 16 | # 'GatherSpider' :'ipproxytool.spiders.validator.gather', 17 | # 'HttpBinSpider' :'ipproxytool.spiders.validator.httpbin', 18 | # 'SteamSpider' :'ipproxytool.spiders.validator.steam', 19 | # 'BossSpider' :'ipproxytool.spiders.validator.boss', 20 | # 'LagouSpider' :'ipproxytool.spiders.validator.lagou', 21 | # 'LiepinSpider' :'ipproxytool.spiders.validator.liepin', 22 | # 'JDSpider' :'ipproxytool.spiders.validator.jd', 23 | # 'BBSSpider' :'ipproxytool.spiders.validator.bbs', 24 | # 'ZhiLianSpider' :'ipproxytool.spiders.validator.zhilian', 25 | # 'AmazonCnSpider' :'ipproxytool.spiders.validator.amazoncn', 26 | } 27 | 28 | scrapydo.setup() 29 | 30 | 31 | def validator(): 32 | process_list = [] 33 | for item, path in VALIDATORS.items(): 34 | module = import_module(path) 35 | validator = getattr(module, item) 36 | popen = subprocess.Popen(['python', 'run_spider.py', validator.name], shell=False) 37 | data = { 38 | 'name': validator.name, 39 | 'popen': popen, 40 | } 41 | process_list.append(data) 42 | 43 | while True: 44 | time.sleep(60) 45 | for process in process_list: 46 | popen = process.get('popen', None) 47 | utils.log('name:%s poll:%s' % (process.get('name'), popen.poll())) 48 | 49 | # 检测结束进程,如果有结束进程,重新开启 50 | if popen != None and popen.poll() == 0: 51 | name = process.get('name') 52 | utils.log('%(name)s spider finish...\n' % {'name': name}) 53 | process_list.remove(process) 54 | p = subprocess.Popen(['python', 'run_spider.py', name], shell=False) 55 | data = { 56 | 'name': name, 57 | 'popen': p, 58 | } 59 | process_list.append(data) 60 | time.sleep(1) 61 | break 62 | 63 | 64 | if __name__ == '__main__': 65 | os.chdir(sys.path[0]) 66 | 67 | if not os.path.exists('log'): 68 | os.makedirs('log') 69 | 70 | logging.basicConfig( 71 | filename='log/validator.log', 72 | format='%(asctime)s: %(message)s', 73 | level=logging.DEBUG 74 | ) 75 | 76 | validator() 77 | -------------------------------------------------------------------------------- /run_validator_async.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import os 5 | import sys 6 | import time 7 | import utils 8 | import aiohttp 9 | from aiohttp import ClientSession 10 | from sql.sql_manager import SqlManager 11 | import config 12 | import asyncio 13 | 14 | TEST_URL='http://httpbin.org/ip' 15 | 16 | async def test_connect(proxy,operator,mode=None): 17 | conn = aiohttp.TCPConnector(verify_ssl=False) 18 | async with ClientSession(connector=conn) as s: 19 | try: 20 | async with s.get(url=TEST_URL,proxy=proxy[2], 21 | timeout=10,allow_redirects=False) as resp: 22 | page = await resp.text() 23 | if (resp.status != 200 or str(resp.url) != TEST_URL): 24 | utils.log(('[INFO]#proxy:{ip} has been dropped\n' 25 | ' #Reason:Abnormal url or return Code').format(ip=proxy[1])) 26 | operator.del_proxy_with_id(config.free_ipproxy_table,proxy[0]) 27 | operator.del_proxy_with_id(config.httpbin_table,proxy[0]) 28 | elif mode == 'add': 29 | operator.insert_valid_proxy(id=proxy[0]) 30 | else: 31 | operator.update_valid_proxy(id=proxy[0]) 32 | 33 | except Exception as e: 34 | 
utils.log(('[INFO]#proxy:{ip} has been dropped\n' 35 | ' #Reason:{msg}').format(ip=proxy[1],msg=str(e))) 36 | operator.del_proxy_with_id(config.free_ipproxy_table,proxy[0]) 37 | operator.del_proxy_with_id(config.httpbin_table,proxy[0]) 38 | finally: 39 | operator.commit() 40 | 41 | 42 | def async_validator(): 43 | utils.log('[INFO]#Loading ip proxies....60 sec left') 44 | time.sleep(60) 45 | proxy_factory = SqlManager() 46 | loop = asyncio.get_event_loop() 47 | def test_process(table_name,mode=None,limit=50): 48 | id_list = proxy_factory.get_proxy_ids(table_name) 49 | if len(id_list) > 0: 50 | task_len = len(id_list) 51 | cur_id = 0 52 | for sig in range(0,task_len,limit): 53 | proxies = proxy_factory.get_proxies_info(table_name=table_name, 54 | start_id=cur_id, 55 | limit=limit) 56 | if len(proxies) == 0: 57 | break 58 | cur_id = proxies[-1][0] 59 | proxies = [[proxy[0],proxy[1],'http://{}:{}'.format(proxy[1],proxy[2])] for proxy in proxies] 60 | tasks = [test_connect(proxy,proxy_factory,mode) for proxy in proxies] 61 | loop.run_until_complete(asyncio.wait(tasks)) 62 | while True: 63 | utils.log('[INFO]Validator process started') 64 | utils.log('[INFO]Validator process:Verify mode start') 65 | test_process(config.httpbin_table) 66 | utils.log('[INFO]Validator process:Add mode start') 67 | test_process(config.free_ipproxy_table,mode='add') 68 | utils.log('[INFO]Validator process completed') 69 | time.sleep(300) 70 | 71 | 72 | if __name__ == '__main__': 73 | if not os.path.exists('log'): 74 | os.makedirs('log') 75 | 76 | logging.basicConfig( 77 | filename = 'log/validator.log', 78 | format = '%(asctime)s: %(message)s', 79 | level = logging.INFO 80 | ) 81 | async_validator() 82 | 83 | 84 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawler 12 | -------------------------------------------------------------------------------- /server/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | -------------------------------------------------------------------------------- /server/dataserver.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | import logging 5 | import sys 6 | import config 7 | 8 | from proxy import Proxy 9 | from sql import SqlManager 10 | from flask import Flask 11 | from flask import request 12 | 13 | app = Flask(__name__) 14 | 15 | 16 | @app.route('/') 17 | def index(): 18 | return 'Hello, World!' 
19 | 20 | 21 | @app.route('/insert') 22 | def insert(): 23 | sql = SqlManager() 24 | name = request.args.get('name') 25 | proxy = Proxy() 26 | proxy.set_value( 27 | ip=request.args.get('ip'), 28 | port=request.args.get('port'), 29 | country=request.args.get('country', None), 30 | anonymity=request.args.get('anonymity', None), 31 | https=request.args.get('https', 'no'), 32 | speed=request.args.get('speed', -1), 33 | source=request.args.get('source', name), 34 | ) 35 | 36 | result = sql.insert_proxy(name, proxy) 37 | data = { 38 | 'result': result 39 | } 40 | 41 | return json.dumps(data, indent=4) 42 | 43 | 44 | @app.route('/select') 45 | def select(): 46 | sql = SqlManager() 47 | name = request.args.get('name') 48 | anonymity = request.args.get('anonymity', '') 49 | https = request.args.get('https', '') 50 | order = request.args.get('order', 'speed') 51 | sort = request.args.get('sort', 'asc') 52 | count = request.args.get('count', 100) 53 | 54 | kwargs = { 55 | 'anonymity': anonymity, 56 | 'https': https, 57 | 'order': order, 58 | 'sort': sort, 59 | 'count': count 60 | } 61 | result = sql.select_proxy(name, **kwargs) 62 | data = [{ 63 | 'ip': item.get('ip'), 'port': item.get('port'), 64 | 'anonymity': item.get('anonymity'), 'https': item.get('https'), 65 | 'speed': item.get('speed'), 'save_time': item.get('save_time', '') 66 | } for item in result] 67 | return json.dumps(data, indent=4) 68 | 69 | 70 | @app.route('/delete') 71 | def delete(): 72 | sql = SqlManager() 73 | name = request.args.get('name') 74 | ip = request.args.get('ip') 75 | result = sql.del_proxy_with_ip(name, ip) 76 | data = {'result': result} 77 | 78 | return json.dumps(data, indent=4) 79 | 80 | @app.route('/query') 81 | def query(): 82 | sql = SqlManager() 83 | start_id = request.args.get('sid') 84 | limit = int(request.args.get('limit','100')) 85 | proxies = sql.get_proxies_info(config.httpbin_table,start_id=start_id,limit=limit) 86 | data = [{'id':proxy[0],'ip':proxy[1],'port':proxy[2],'https':proxy[3]} 87 | for proxy in proxies] 88 | return json.dumps(data,indent=4) 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /sql/__init__.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import sql.sql_base 4 | from sql.mysql import MySql 5 | from sql.mongodb import Mongodb 6 | from sql.sql_manager import SqlManager -------------------------------------------------------------------------------- /sql/mongodb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import pymongo 5 | import config 6 | import json 7 | import datetime 8 | 9 | from proxy import Proxy 10 | from sql.sql_base import SqlBase 11 | 12 | 13 | class Mongodb(SqlBase): 14 | def __init__(self, **kwargs): 15 | super(Mongodb, self).__init__(**kwargs) 16 | self.client = pymongo.MongoClient(**kwargs) 17 | self.db = self.client[config.database] 18 | 19 | def init_database(self, database_name): 20 | pass 21 | 22 | def init_proxy_table(self, table_name): 23 | pass 24 | 25 | def insert_proxy(self, table_name, proxy): 26 | data = proxy.get_dict() 27 | data['save_time'] = str(datetime.datetime.now()) 28 | self.db[table_name].insert(data) 29 | 30 | def select_proxy(self, table_name, **kwargs): 31 | filter = {} 32 | if kwargs.get('anonymity') != '': 33 | filter['anonymity'] = kwargs.get('anonymity') 34 | if kwargs.get('https') != '': 35 | filter['https'] = 
kwargs.get('https') 36 | 37 | data = [item for item in self.db[table_name].find(filter).limit(int(kwargs.get('count')))] 38 | return data 39 | 40 | def update_proxy(self, table_name, proxy): 41 | self.db[table_name].update_one( 42 | {'_id': proxy.id}, 43 | {'$set': 44 | {'https': proxy.https, 'speed': proxy.speed, 'vali_count': proxy.vali_count, 45 | 'anonymity': proxy.anonymity, 'save_time': str(datetime.datetime.now())}}) 46 | 47 | def delete_proxy(self, table_name, proxy): 48 | return self.del_proxy_with_id(table_name, proxy.id) 49 | 50 | def delete_old(self, table_name, day): 51 | start = datetime.datetime.now() 52 | end = datetime.datetime.now() 53 | pass 54 | 55 | def get_proxy_count(self, table_name): 56 | count = self.db[table_name].find().count() 57 | logging.debug('count:%s' % count) 58 | return count 59 | 60 | def get_proxy_ids(self, table_name): 61 | ids = self.db[table_name].distinct('_id') 62 | logging.debug('ids:%s' % ids) 63 | return ids 64 | 65 | def get_proxy_with_id(self, table_name, id): 66 | data = self.db[table_name].find_one({'_id': id}) 67 | logging.debug(data) 68 | proxy = Proxy() 69 | proxy.set_value( 70 | ip=data.get('ip'), 71 | port=data.get('port'), 72 | country=data.get('country'), 73 | anonymity=data.get('country'), 74 | https=data.get('https'), 75 | speed=data.get('speed'), 76 | source=data.get('source'), 77 | vali_count=data.get('vali_count') 78 | ) 79 | proxy.id = data.get('_id') 80 | return proxy 81 | 82 | def del_proxy_with_id(self, table_name, id): 83 | self.db[table_name].delete_one({'_id': id}) 84 | return True 85 | 86 | def del_proxy_with_ip(self, table_name, ip): 87 | self.db[table_name].delete_one({'ip': ip}) 88 | return True 89 | 90 | def commit(self): 91 | pass 92 | -------------------------------------------------------------------------------- /sql/mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import utils 5 | import config 6 | import pymysql 7 | 8 | from proxy import Proxy 9 | from sql.sql_base import SqlBase 10 | 11 | 12 | class MySql(SqlBase): 13 | def __init__(self, **kwargs): 14 | super(MySql, self).__init__(**kwargs) 15 | 16 | self.conn = pymysql.connect(**kwargs) 17 | self.cursor = self.conn.cursor() 18 | 19 | try: 20 | self.conn.select_db(config.database) 21 | except: 22 | self.create_database(config.database) 23 | self.conn.select_db(config.database) 24 | 25 | def create_database(self, database_name): 26 | try: 27 | command = 'CREATE DATABASE IF NOT EXISTS %s DEFAULT CHARACTER SET \'utf8\' ' % database_name 28 | logging.debug('mysql create_database command:%s' % command) 29 | self.cursor.execute(command) 30 | self.conn.commit() 31 | except Exception as e: 32 | logging.exception('mysql create_database exception:%s' % e) 33 | 34 | def init_database(self, database_name): 35 | try: 36 | command = 'CREATE DATABASE IF NOT EXISTS %s DEFAULT CHARACTER SET \'utf8\' ' % database_name 37 | logging.debug('mysql create_database command:%s' % command) 38 | self.cursor.execute(command) 39 | self.conn.commit() 40 | except Exception as e: 41 | logging.exception('mysql create_database exception:%s' % e) 42 | 43 | def init_proxy_table(self, table_name): 44 | command = ( 45 | "CREATE TABLE IF NOT EXISTS {} (" 46 | "`id` INT(8) NOT NULL AUTO_INCREMENT," 47 | "`ip` CHAR(25) NOT NULL UNIQUE," 48 | "`port` INT(4) NOT NULL," 49 | "`country` TEXT DEFAULT NULL," 50 | "`anonymity` INT(2) DEFAULT NULL," 51 | "`https` CHAR(4) DEFAULT NULL ," 52 | "`speed` FLOAT 
DEFAULT NULL," 53 | "`source` CHAR(20) DEFAULT NULL," 54 | "`vali_count` INT(5) DEFAULT 0," 55 | "`created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP," 56 | "`updated_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP," 57 | "PRIMARY KEY(id)," 58 | "UNIQUE KEY `uniq_ip` (`ip`)" 59 | ") ENGINE=InnoDB".format(table_name)) 60 | 61 | self.cursor.execute(command) 62 | self.conn.commit() 63 | 64 | def insert_proxy(self, table_name, proxy): 65 | try: 66 | command = ("INSERT IGNORE INTO {} " 67 | "(id, ip, port, country, anonymity, https, speed, source, vali_count)" 68 | "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)".format(table_name)) 69 | 70 | data = (None, proxy.ip, proxy.port, proxy.country, proxy.anonymity, 71 | proxy.https, proxy.speed, proxy.source, proxy.vali_count) 72 | 73 | self.cursor.execute(command, data) 74 | return True 75 | except Exception as e: 76 | logging.exception('mysql insert_proxy exception msg:%s' % e) 77 | return False 78 | 79 | def insert_valid_proxy(self, id): 80 | try: 81 | command = "CALL ip_transfer({id})".format(id=id) 82 | self.cursor.execute(command) 83 | except Exception as e: 84 | logging.exception('[Error]mysql#insert_valid_proxy Exception msg:{}'.format(str(e))) 85 | raise e 86 | 87 | def select_proxy(self, table_name, **kwargs): 88 | filter = {} 89 | for k, v in kwargs.items(): 90 | if v != '': 91 | filter[k] = v 92 | 93 | table_name = table_name if table_name else 'free_ipproxy' 94 | 95 | try: 96 | command = "SELECT * FROM {name} WHERE anonymity LIKE '{anonymity}' AND https LIKE '{https}' ORDER BY " \ 97 | "{order} {sort} limit {count}". \ 98 | format(name=table_name, anonymity=filter.get('anonymity', '%'), 99 | https=filter.get('https', '%'), order=filter.get('order', 'save_time'), 100 | sort=filter.get('sort', 'desc'), count=filter.get('count', 100)) 101 | result = self.query(command) 102 | data = [{ 103 | 'ip': item[1], 'port': item[2], 'anonymity': item[4], 'https': item[5], 104 | 'speed': item[6], 'save_time': str(item[8]) 105 | } for item in result] 106 | return data 107 | except Exception as e: 108 | logging.exception('mysql select_proxy exception msg:%s' % e) 109 | return [] 110 | 111 | def update_proxy(self, table_name, proxy): 112 | try: 113 | command = "UPDATE {table_name} set https='{https}', speed={speed}, " \ 114 | "vali_count={vali_count}, anonymity = {anonymity},save_time={save_time} " \ 115 | "where id={id};".format( 116 | table_name=table_name, https=proxy.https, 117 | speed=proxy.speed, id=proxy.id, vali_count=proxy.vali_count, anonymity=proxy.anonymity, 118 | save_time='NOW()') 119 | logging.debug('mysql update_proxy command:%s' % command) 120 | self.cursor.execute(command) 121 | except Exception as e: 122 | logging.exception('mysql update_proxy exception msg:%s' % e) 123 | 124 | def update_valid_proxy(self, id=0): 125 | try: 126 | command = "UPDATE httpbin SET vali_count=vali_count+1 WHERE id={id}".format(id=id) 127 | affected_row = self.cursor.execute(command) 128 | self.commit() 129 | return affected_row 130 | except Exception as e: 131 | logging.exception('[mysql] update_valid_proxy exception:{msg}'.format(str(e))) 132 | 133 | def delete_proxy(self, table_name, proxy): 134 | self.del_proxy_with_id(table_name=table_name, id=proxy.id) 135 | 136 | def delete_old(self, table_name, day): 137 | try: 138 | command = "DELETE FROM {table} where save_time < SUBDATE(NOW(), INTERVAL {day} DAY)".format( 139 | table=config.free_ipproxy_table, day=day) 140 | 141 | self.cursor.execute(command) 142 | self.commit() 143 | except 
Exception as e: 144 | logging.exception('mysql delete_old exception msg:%s' % e) 145 | 146 | def get_proxy_count(self, table_name): 147 | try: 148 | command = "SELECT COUNT(*) from {}".format(table_name) 149 | count, = self.query_one(command) 150 | logging.debug('mysql get_proxy_count count:%s' % count) 151 | return count 152 | except Exception as e: 153 | logging.exception('mysql get_proxy_count exception msg:%s' % e) 154 | 155 | return 0 156 | 157 | def get_proxy_ids(self, table_name): 158 | ids = [] 159 | try: 160 | command = "SELECT id from {}".format(table_name) 161 | result = self.query(command) 162 | ids = [item[0] for item in result] 163 | except Exception as e: 164 | logging.exception('mysql get_proxy_ids exception msg:%s' % e) 165 | 166 | return ids 167 | 168 | def get_proxy_with_id(self, table_name, id): 169 | proxy = Proxy() 170 | try: 171 | command = "SELECT * FROM {0} WHERE id=\'{1}\'".format(table_name, id) 172 | result = self.query_one(command) 173 | if result != None: 174 | # data = { 175 | # 'id': result[0], 176 | # 'ip': result[1], 177 | # 'port': result[2], 178 | # 'country': result[3], 179 | # 'anonymity': result[4], 180 | # 'https': result[5], 181 | # 'speed': result[6], 182 | # 'source': result[7], 183 | # 'save_time': result[8], 184 | # 'vali_count': result[9], 185 | # } 186 | proxy = Proxy() 187 | proxy.set_value( 188 | ip=result[1], 189 | port=result[2], 190 | country=result[3], 191 | anonymity=result[4], 192 | https=result[5], 193 | speed=result[6], 194 | source=result[7], 195 | vali_count=result[9]) 196 | proxy.id = result[0] 197 | proxy.save_time = result[8] 198 | except Exception as e: 199 | logging.exception('mysql get_proxy_ids exception msg:%s' % e) 200 | 201 | return proxy 202 | 203 | def get_proxies_info(self, table_name, start_id=0, limit=100): 204 | '''批量获取代理表中的id,ip和port信息 205 | Args: 206 | @table_name 表名 207 | @start_id 起始id 208 | @limit 单批次最大记录数 209 | 210 | Return 211 | 包含id,ip,port信息的元组列表 212 | 213 | ''' 214 | command = ('SELECT id,ip,port,https from {table} where id >={start_id}' 215 | ' order by id asc limit {limit}') 216 | command = command.format(table=table_name, start_id=start_id, limit=limit) 217 | proxies_info = [] 218 | try: 219 | result = self.query(command) 220 | proxies_info = [proxy for proxy in result] 221 | except Exception as e: 222 | logging.exception('[ERROR]#mysql get_proxies_info: {msg}'.format(msg=e)) 223 | 224 | return proxies_info 225 | 226 | def del_proxy_with_id(self, table_name, id): 227 | res = False 228 | try: 229 | command = "DELETE FROM {0} WHERE id={1}".format(table_name, id) 230 | self.cursor.execute(command) 231 | res = True 232 | except Exception as e: 233 | logging.exception('mysql get_proxy_ids exception msg:%s' % e) 234 | 235 | return res 236 | 237 | def del_proxy_with_ip(self, table_name, ip): 238 | res = False 239 | try: 240 | command = "DELETE FROM {0} WHERE ip='{1}'".format(table_name, ip) 241 | self.cursor.execute(command) 242 | self.commit() 243 | res = True 244 | except Exception as e: 245 | logging.exception('mysql del_proxy_with_ip exception msg:%s' % e) 246 | 247 | return res 248 | 249 | def create_table(self, command): 250 | try: 251 | logging.debug('mysql create_table command:%s' % command) 252 | x = self.cursor.execute(command) 253 | self.conn.commit() 254 | return x 255 | except Exception as e: 256 | logging.exception('mysql create_table exception:%s' % e) 257 | 258 | def insert_data(self, command, data, commit=False): 259 | try: 260 | logging.debug('mysql insert_data command:%s, data:%s' % 
(command, data)) 261 | x = self.cursor.execute(command, data) 262 | if commit: 263 | self.conn.commit() 264 | return x 265 | except Exception as e: 266 | logging.debug('mysql insert_data exception msg:%s' % e) 267 | 268 | def commit(self): 269 | self.conn.commit() 270 | 271 | def execute(self, command, commit=True): 272 | try: 273 | logging.debug('mysql execute command:%s' % command) 274 | data = self.cursor.execute(command) 275 | if commit: 276 | self.conn.commit() 277 | return data 278 | except Exception as e: 279 | logging.exception('mysql execute exception msg:%s' % e) 280 | return None 281 | 282 | def query(self, command, commit=False): 283 | try: 284 | logging.debug('mysql execute command:%s' % command) 285 | 286 | self.cursor.execute(command) 287 | data = self.cursor.fetchall() 288 | if commit: 289 | self.conn.commit() 290 | return data 291 | except Exception as e: 292 | logging.exception('mysql execute exception msg:%s' % e) 293 | return None 294 | 295 | def query_one(self, command, commit=False): 296 | try: 297 | logging.debug('mysql execute command:%s' % command) 298 | 299 | self.cursor.execute(command) 300 | data = self.cursor.fetchone() 301 | if commit: 302 | self.conn.commit() 303 | 304 | return data 305 | except Exception as e: 306 | logging.debug('mysql execute exception msg:%s' % str(e)) 307 | return None 308 | -------------------------------------------------------------------------------- /sql/sql_base.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | class SqlBase(object): 4 | def __init__(self, **kwargs): 5 | pass 6 | 7 | def init_database(self, database_name): 8 | pass 9 | 10 | def init_proxy_table(self, table_name): 11 | pass 12 | 13 | def insert_proxy(self, table_name, proxy): 14 | pass 15 | 16 | def select_proxy(self, table_name, **kwargs): 17 | pass 18 | 19 | def update_proxy(self, table_name, proxy): 20 | pass 21 | 22 | def delete_proxy(self, table_name, proxy): 23 | pass 24 | 25 | def delete_old(self, table_name, day): 26 | pass 27 | 28 | def get_proxy_count(self, table_name): 29 | pass 30 | 31 | def get_proxy_ids(self, table_name): 32 | pass 33 | 34 | def get_proxy_with_id(self, table_name, id): 35 | pass 36 | 37 | def del_proxy_with_id(self, table_name, id): 38 | pass 39 | 40 | def del_proxy_with_ip(self, table_name, ip): 41 | pass 42 | 43 | def commit(self): 44 | pass 45 | -------------------------------------------------------------------------------- /sql/sql_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import config 4 | 5 | from sql.sql_base import SqlBase 6 | 7 | 8 | class SqlManager(object): 9 | def __init__(self): 10 | db_type = config.DB_config.get('db_type', 'mysql') 11 | db_config = config.DB_config.get(db_type) 12 | 13 | if db_type == 'mysql': 14 | from sql.mysql import MySql 15 | self.sql = MySql(**db_config) 16 | elif db_type == 'redis': 17 | pass 18 | elif db_type == 'sqlite': 19 | pass 20 | elif db_type == 'mongodb': 21 | from sql.mongodb import Mongodb 22 | self.sql = Mongodb(**db_config) 23 | else: # default mysql 24 | from sql.mysql import MySql 25 | self.sql = MySql(**config.DB_config.get('db_type')) 26 | 27 | def init_database(self, database_name): 28 | pass 29 | 30 | def init_proxy_table(self, table_name): 31 | return self.sql.init_proxy_table(table_name) 32 | 33 | def insert_proxy(self, table_name, proxy): 34 | return self.sql.insert_proxy(table_name, proxy) 35 | 36 | def 
insert_valid_proxy(self,id=id): 37 | return self.sql.insert_valid_proxy(id) 38 | 39 | def select_proxy(self, table_name, **kwargs): 40 | return self.sql.select_proxy(table_name, **kwargs) 41 | 42 | def update_proxy(self, table_name, proxy): 43 | return self.sql.update_proxy(table_name, proxy) 44 | 45 | def update_valid_proxy(self,id=0): 46 | return self.sql.update_valid_proxy(id=id) 47 | 48 | def delete_proxy(self, table_name, proxy): 49 | return self.sql.delete_proxy(table_name, proxy) 50 | 51 | def delete_old(self, table_name, day): 52 | return self.sql.delete_old(table_name, day) 53 | 54 | def get_proxy_count(self, table_name): 55 | return self.sql.get_proxy_count(table_name = table_name) 56 | 57 | def get_proxy_ids(self, table_name): 58 | return self.sql.get_proxy_ids(table_name = table_name) 59 | 60 | def get_proxy_with_id(self, table_name, id): 61 | return self.sql.get_proxy_with_id(table_name = table_name, id = id) 62 | 63 | def del_proxy_with_id(self, table_name, id): 64 | return self.sql.del_proxy_with_id(table_name = table_name, id = id) 65 | 66 | def del_proxy_with_ip(self, table_name, ip): 67 | return self.sql.del_proxy_with_ip(table_name = table_name, ip = ip) 68 | 69 | def get_proxies_info(self,table_name,start_id=0,limit=10): 70 | return self.sql.get_proxies_info(table_name=table_name, start_id=start_id, limit=limit) 71 | 72 | def commit(self): 73 | return self.sql.commit() 74 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | import os 5 | import re 6 | import subprocess 7 | import traceback 8 | import time 9 | import datetime 10 | 11 | 12 | # 自定义的日志输出 13 | def log(msg, level=logging.DEBUG): 14 | logging.log(level, msg) 15 | print('%s [%s], msg:%s' % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), level, msg)) 16 | 17 | if level == logging.WARNING or level == logging.ERROR: 18 | for line in traceback.format_stack(): 19 | print(line.strip()) 20 | 21 | for line in traceback.format_stack(): 22 | logging.log(level, line.strip()) 23 | 24 | 25 | # 服务器使用,清理端口占用 26 | def kill_ports(ports): 27 | for port in ports: 28 | log('kill %s start' % port) 29 | popen = subprocess.Popen('lsof -i:%s' % port, shell=True, stdout=subprocess.PIPE) 30 | (data, err) = popen.communicate() 31 | log('data:\n%s \nerr:\n%s' % (data, err)) 32 | 33 | pattern = re.compile(r'\b\d+\b', re.S) 34 | pids = re.findall(pattern, data.decode()) 35 | 36 | log('pids:%s' % str(pids)) 37 | 38 | for pid in pids: 39 | if pid != '' and pid != None: 40 | try: 41 | log('pid:%s' % pid) 42 | popen = subprocess.Popen('kill -9 %s' % pid, shell=True, stdout=subprocess.PIPE) 43 | (data, err) = popen.communicate() 44 | log('data:\n%s \nerr:\n%s' % (data, err)) 45 | except Exception as e: 46 | log('kill_ports exception:%s' % e) 47 | 48 | log('kill %s finish' % port) 49 | 50 | time.sleep(1) 51 | 52 | 53 | # 创建文件夹 54 | def make_dir(dir): 55 | log('make dir:%s' % dir) 56 | if not os.path.exists(dir): 57 | os.makedirs(dir) 58 | -------------------------------------------------------------------------------- /weixin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awolfly9/IPProxyTool/4e4e3aadd30a75f74393b54e8077568b6a58a813/weixin.png --------------------------------------------------------------------------------
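
The HTTP interface in `server/dataserver.py` is the simplest way to consume the proxies once `run_server.py` and the validators are running. Below is a minimal client sketch, not part of the repository: it assumes the Flask server is reachable on `127.0.0.1` and that `config.data_port` is `8000` (an illustrative value only — the real port lives in `config.py`, which is not shown here). The endpoint name and query parameters (`name`, `anonymity`, `https`, `order`, `sort`, `count`) come from the `/select` route in `server/dataserver.py`, and `requests` is already pinned in `requirements.txt`.

```python
# Minimal client sketch for the /select endpoint of server/dataserver.py.
# Assumption: run_server.py is listening on 127.0.0.1, and config.data_port
# is taken to be 8000 purely for illustration -- check config.py for the
# real value before using this.
import requests

BASE_URL = 'http://127.0.0.1:8000'  # assumed host:port, see config.data_port


def select_proxies(table='httpbin', anonymity='', https='yes', count=10):
    """Fetch validated proxies from the `httpbin` table, fastest first."""
    params = {
        'name': table,           # 'httpbin' holds proxies that passed validation
        'anonymity': anonymity,  # '' means "any"; 1/2/3 as assigned by proxy.py
        'https': https,
        'order': 'speed',
        'sort': 'asc',
        'count': count,
    }
    resp = requests.get(BASE_URL + '/select', params=params, timeout=10)
    resp.raise_for_status()
    return resp.json()


if __name__ == '__main__':
    for proxy in select_proxies():
        print('%(ip)s:%(port)s speed=%(speed)s https=%(https)s' % proxy)
```

The same pattern applies to the other routes shown above: `/delete?name=httpbin&ip=...` removes a dead proxy, and `/query?sid=0&limit=100` pages through the validated table by id.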