├── 1024 ├── requestments.txt └── new1024spider.py ├── 91user ├── config │ ├── __init__.py │ └── uids.py ├── run.py └── user.py ├── kuaishou ├── config │ ├── __init__.py │ ├── user.py │ └── ua_mobile.txt ├── lib │ ├── __init__.py │ └── crawler.py ├── run.py ├── ks_down.py └── ks_video.py ├── proxy_pool ├── _config.yml ├── cli │ ├── start.sh │ └── proxyPool.py ├── .travis.yml ├── requirements.txt ├── doc │ ├── block_ips.md │ ├── release_notes.md │ └── introduce.md ├── docker-compose.yml ├── Config │ ├── __init__.py │ ├── ConfigGetter.py │ └── setting.py ├── DB │ ├── __init__.py │ ├── MongodbClient.py │ ├── DbClient.py │ ├── RedisClient.py │ └── SsdbClient.py ├── Dockerfile ├── ProxyGetter │ ├── __init__.py │ ├── CheckProxy.py │ └── getFreeProxy.py ├── Test │ ├── __init__.py │ ├── testProxyClass.py │ ├── testWebRequest.py │ ├── testConfig.py │ ├── testLogHandler.py │ └── testGetFreeProxy.py ├── __init__.py ├── Api │ ├── __init__.py │ └── ProxyApi.py ├── Manager │ ├── __init__.py │ └── ProxyManager.py ├── test.py ├── ProxyHelper │ ├── __init__.py │ ├── ProxyUtil.py │ └── Proxy.py ├── Util │ ├── __init__.py │ ├── utilClass.py │ ├── utilFunction.py │ ├── LogHandler.py │ └── WebRequest.py ├── Schedule │ ├── __init__.py │ ├── ProxyScheduler.py │ ├── UsefulProxyCheck.py │ └── RawProxyCheck.py ├── LICENSE └── README.md ├── requirement.txt ├── README.md ├── LICENSE ├── qicai ├── qicai_top50.py └── QicaiCategoriesSpider.py ├── umei └── app.py ├── cmanuf └── download.py ├── cableav.py ├── baiduMap └── baiduMap.py ├── yasee1 └── run.py └── tuao8 └── crawler.py /91user/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kuaishou/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /kuaishou/lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /proxy_pool/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-time-machine -------------------------------------------------------------------------------- /proxy_pool/cli/start.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | python proxyPool.py webserver & 3 | python proxyPool.py schedule -------------------------------------------------------------------------------- /1024/requestments.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.3 2 | certifi==2022.12.7 3 | chardet==3.0.4 4 | idna==2.7 5 | lxml==4.9.1 6 | requests==2.20.1 7 | urllib3==1.26.5 8 | -------------------------------------------------------------------------------- /91user/config/uids.py: -------------------------------------------------------------------------------- 1 | USERS_UID = [ 2 | '3637DMj5U2Y7YRyzO9oivHdmcoRn6Cz38oR7yh9jrTonY4AM', 3 | '787cUGTgFxeUcKp9wAODVVRi35IDVLjNjygNSkyXcSZfdfmZ' 4 | ] -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.6.3 2 | certifi==2022.12.7 3 | chardet==3.0.4 4 | idna==2.7 5 | lxml==4.9.1 6 | pymongo==3.7.2 7 | requests==2.20.0 8 | 
urllib3==1.26.5 9 | -------------------------------------------------------------------------------- /proxy_pool/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | # - nightly 5 | os: 6 | - linux 7 | install: 8 | - pip install -r requirements.txt 9 | 10 | script: python test.py -------------------------------------------------------------------------------- /proxy_pool/requirements.txt: -------------------------------------------------------------------------------- 1 | APScheduler==3.2.0 2 | werkzeug==2.2.3 3 | Flask==1.0 4 | requests==2.20.0 5 | lxml==4.9.1 6 | PyExecJS==1.5.1 7 | click==7.0 8 | gunicorn==19.9.0 9 | pymongo 10 | redis 11 | -------------------------------------------------------------------------------- /proxy_pool/doc/block_ips.md: -------------------------------------------------------------------------------- 1 | | block IP | block 日期 | msg | 2 | | ----- | ---- | -------- | 3 | | 144.52.45.149 | 20190815 | 恶意访问 | 4 | | 39.100.153.226 | 20190816 | 恶意访问 | 5 | | 47.102.47.42 | 20190819 | 恶意访问 | 6 | | 125.71.211.125 | 20190820 | 恶意访问 | 7 | 8 | 如需正常访问请提issues说明 9 | -------------------------------------------------------------------------------- /proxy_pool/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | proxy_pool: 4 | build: . 5 | ports: 6 | - "5010:5010" 7 | links: 8 | - proxy_redis 9 | environment: 10 | db_type: SSDB 11 | ssdb_host: proxy_redis 12 | ssdb_port: 6379 13 | proxy_redis: 14 | image: "redis" 15 | -------------------------------------------------------------------------------- /proxy_pool/Config/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__ 5 | Description : 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | -------------------------------------------------------------------------------- /proxy_pool/DB/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/2: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /proxy_pool/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | MAINTAINER jhao104 4 | 5 | ENV TZ Asia/Shanghai 6 | 7 | WORKDIR /usr/src/app 8 | 9 | COPY ./requirements.txt . 10 | 11 | RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple/ 12 | 13 | COPY . . 
14 | 15 | EXPOSE 5010 16 | 17 | WORKDIR /usr/src/app/cli 18 | 19 | ENTRYPOINT [ "sh", "start.sh" ] 20 | -------------------------------------------------------------------------------- /proxy_pool/ProxyGetter/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /proxy_pool/Test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__ 5 | Description : 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' -------------------------------------------------------------------------------- /proxy_pool/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' -------------------------------------------------------------------------------- /proxy_pool/Api/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | -------------------------------------------------------------------------------- /proxy_pool/Manager/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from Manager.ProxyManager import ProxyManager 16 | -------------------------------------------------------------------------------- /proxy_pool/test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: test.py 5 | Description : 6 | Author : JHao 7 | date: 2017/3/7 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/3/7: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from Test import testConfig 16 | 17 | if __name__ == '__main__': 18 | testConfig.testConfig() 19 | 
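# Editor's note: an illustrative sketch only, not part of the original script. The
# Test package shown elsewhere in this repo also provides testWebRequest,
# testLogHandler and testProxyClass, so a fuller smoke test could reuse them:
def _run_all_tests():  # hypothetical helper; the script above only calls testConfig
    from Test import testWebRequest, testLogHandler
    testConfig.testConfig()
    testWebRequest.testWebRequest()
    testLogHandler.testLogHandler()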
-------------------------------------------------------------------------------- /proxy_pool/ProxyHelper/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2019/7/11 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/7/11: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from ProxyHelper.Proxy import Proxy 16 | from ProxyHelper.ProxyUtil import checkProxyUseful 17 | -------------------------------------------------------------------------------- /proxy_pool/Util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: 11 | ------------------------------------------------- 12 | """ 13 | 14 | from Util.utilFunction import validUsefulProxy 15 | from Util.LogHandler import LogHandler 16 | from Util.utilClass import Singleton 17 | -------------------------------------------------------------------------------- /proxy_pool/Schedule/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from Schedule.RawProxyCheck import doRawProxyCheck 16 | from Schedule.UsefulProxyCheck import doUsefulProxyCheck 17 | -------------------------------------------------------------------------------- /kuaishou/run.py: -------------------------------------------------------------------------------- 1 | from lib.crawler import Kuaishou 2 | from time import sleep 3 | from config.user import users 4 | 5 | file_list = [] #创建一个空列表 6 | def out_file(input_file,out_file): 7 | with open(input_file, "r") as f: 8 | file_2 = f.readlines() 9 | for file in file_2: 10 | file_list.append(file) 11 | out_file1 = set(file_list) #set()函数可以自动过滤掉重复元素 12 | last_out_file = list(out_file1) 13 | for out in last_out_file: 14 | with open(out_file,"a+",encoding="utf-8") as f: #去重后文件写入文件里 15 | f.write(out) 16 | print(out) 17 | 18 | def run(): 19 | app = Kuaishou() 20 | for i in users: 21 | app.setUid(i) 22 | sleep(10) 23 | 24 | out_file('data/data.txt', 'data.txt') 25 | 26 | if __name__ == '__main__': 27 | run() 28 | -------------------------------------------------------------------------------- /proxy_pool/Test/testProxyClass.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testProxyClass 5 | Description : 6 | Author : JHao 7 | date: 2019/8/8 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/8: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import json 16 | from ProxyHelper import Proxy 17 | 18 | 19 | def testProxyClass(): 20 | proxy = 
Proxy("127.0.0.1:8080") 21 | 22 | print(proxy.info_dict) 23 | 24 | proxy.source = "test" 25 | 26 | proxy_str = json.dumps(proxy.info_dict, ensure_ascii=False) 27 | 28 | print(proxy_str) 29 | 30 | print(Proxy.newProxyFromJson(proxy_str).info_dict) 31 | 32 | 33 | testProxyClass() 34 | -------------------------------------------------------------------------------- /kuaishou/config/user.py: -------------------------------------------------------------------------------- 1 | temp = [ 2 | 'qiuqiuya0708', 3 | 'y0485201314', 4 | 'aiwo33442528', 5 | 'hellowuzi', 6 | 'lg25802468', 7 | 'flxiaohuxian520', 8 | 'MB667890', 9 | '3xbyb7qjchwgeza', 10 | 'xiaoyun2121', 11 | 'mdxiangbei', 12 | 'dagouxingzuo', 13 | 'dear521_', 14 | 'sunyongfei', 15 | 'jin970608', 16 | 'Zr520976', 17 | '3xmknin32j59p9w', 18 | '3xynx4v3d3yjnxc', 19 | 'xy15705818104', 20 | 'xue888881', 21 | '3x39f99nqet3m9e', 22 | 'HTMB20201212', 23 | 'kuailexiaoni', 24 | 'TTai569-', 25 | 'Sd543318617', 26 | '3xtt8swc7idnnb9', 27 | '3xiepavgtpfasxa', 28 | 'jzwnh666', 29 | '3xi7ts3hndvw83g', 30 | 'xy5201314', 31 | 'C130748359', 32 | ] 33 | 34 | users = [ 35 | 'flxiaohuxian520', 36 | 'xiaoyun2121', 37 | 'dagouxingzuo', 38 | 'hellowuzi', 39 | ] -------------------------------------------------------------------------------- /proxy_pool/Test/testWebRequest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testWebRequest 5 | Description : test class WebRequest 6 | Author : J_hao 7 | date: 2017/7/31 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/7/31: function testWebRequest 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from Util.WebRequest import WebRequest 16 | 17 | 18 | # noinspection PyPep8Naming 19 | def testWebRequest(): 20 | """ 21 | test class WebRequest in Util/WebRequest.py 22 | :return: 23 | """ 24 | wr = WebRequest() 25 | request_object = wr.get('https://www.baidu.com/') 26 | assert request_object.status_code == 200 27 | 28 | 29 | if __name__ == '__main__': 30 | testWebRequest() 31 | -------------------------------------------------------------------------------- /proxy_pool/Test/testConfig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testGetConfig 5 | Description : testGetConfig 6 | Author : J_hao 7 | date: 2017/7/31 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/7/31: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from Config.ConfigGetter import config 16 | 17 | 18 | # noinspection PyPep8Naming 19 | def testConfig(): 20 | """ 21 | :return: 22 | """ 23 | print(config.db_type) 24 | print(config.db_name) 25 | print(config.db_host) 26 | print(config.db_port) 27 | print(config.db_password) 28 | assert isinstance(config.proxy_getter_functions, list) 29 | print(config.proxy_getter_functions) 30 | 31 | 32 | if __name__ == '__main__': 33 | testConfig() 34 | -------------------------------------------------------------------------------- /proxy_pool/Test/testLogHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testLogHandler 5 | Description : 
6 | Author : J_hao 7 | date: 2017/8/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/8/2: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from Util.LogHandler import LogHandler 16 | 17 | 18 | # noinspection PyPep8Naming 19 | def testLogHandler(): 20 | """ 21 | test function LogHandler in Util/LogHandler 22 | :return: 23 | """ 24 | log = LogHandler('test') 25 | log.info('this is a log from test') 26 | 27 | log.resetName(name='test1') 28 | log.info('this is a log from test1') 29 | 30 | log.resetName(name='test2') 31 | log.info('this is a log from test2') 32 | 33 | 34 | if __name__ == '__main__': 35 | testLogHandler() 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 废话连篇 2 | 3 | 4 | 爬虫自给自足 5 | 6 | 使用Python3完成 7 | 8 | **注:爬虫大多具有时效性,所以早期上传的不一定能用** 9 | 10 | 11 | 12 | 这个readme我也是写了又删,删了又写。曾经一度不想更新(害,主要是懒) 13 | 14 | 现在更新这篇也是单纯的因为太闲了。 15 | 16 | 17 | 18 | 19 | 20 | 21 | ## 依赖包 22 | 23 | 有的可能需要以下包,加粗表示必须滴(技术太菜,只能依赖这些包来搞一搞) 24 | 25 | 26 | - **requests** 27 | 28 | - **Beautifulsoup4** 29 | 30 | - pymongo 31 | 32 | - fake_UserAgent 33 | 34 | - pymysql 35 | 36 | 37 | 38 | ## 目录 39 | 40 | - **1024**: 数字社区的图片 41 | 42 | - **baiduMap**: 简单调用百度地图的api完成区域类的信息检索,需要用到开发账号 43 | 44 | - **cmanuf**:机械工业出版社的pdf下载?**烂尾,bug太多,不修了** 45 | 46 | - ~~**novel**:盗版小说的爬虫...存储到数据库中~~ 47 | 48 | - **qicai**:七彩英语(英文原著)的PDF下载 49 | 50 | - **umei**: 批量下载图片 51 | 52 | - **kuaishou**: 关键词:快手、无水印、解析、下载 53 | 54 | - ~~**yasee1**:网站倒闭了~~ 55 | 56 | - **proxy_pool**:代理池源自[jhao104/proxy_pool](https://github.com/jhao104/proxy_pool/) 57 | 58 | - **tuao8**: 一个小姐姐的图片下载爬虫 59 | 60 | - **91user:** 传入UID解析视频m3u8播放链接 61 | 62 | - **cableav.py** 一个神奇的网站。传入列表页,解析M3U8地址并存储在本地 63 | 64 | -------------------------------------------------------------------------------- /proxy_pool/doc/release_notes.md: -------------------------------------------------------------------------------- 1 | ## Release Notes 2 | 3 | * master 4 | 5 | 1. 新增免费代理源 `西拉代理` (2020-03-30) 6 | 7 | * 2.0.1 8 | 9 | 1. 新增免费代理源 `89免费代理`; 10 | 2. 新增免费代理源 `齐云代理` 11 | 12 | * 2.0.0 (201908) 13 | 14 | 1. WebApi集成Gunicorn方式启动, Windows平台暂不支持; 15 | 2. 优化Proxy调度程序; 16 | 3. 扩展Proxy属性; 17 | 4. 提供cli工具, 更加方便启动proxyPool 18 | 19 | * 1.14 (2019.07) 20 | 21 | 1. 修复`ProxyValidSchedule`假死bug,原因是Queue阻塞; 22 | 2. 修改代理源 `云代理` 抓取; 23 | 3. 修改代理源 `码农代理` 抓取; 24 | 4. 修改代理源 `代理66` 抓取, 引入 `PyExecJS` 模块破解加速乐动态Cookies加密; 25 | 26 | * 1.13 (2019.02) 27 | 28 | 1.使用.py文件替换.ini作为配置文件; 29 | 30 | 2.更新代理采集部分; 31 | 32 | * 1.12 (2018.4) 33 | 34 | 1.优化代理格式检查; 35 | 36 | 2.增加代理源; 37 | 38 | 3.fix bug [#122](https://github.com/jhao104/proxy_pool/issues/122) [#126](https://github.com/jhao104/proxy_pool/issues/126) 39 | 40 | * 1.11 (2017.8) 41 | 42 |   1.使用多线程验证useful_pool; 43 | 44 | * 1.10 (2016.11) 45 | 46 |   1. 第一版; 47 | 48 |   2. 支持PY2/PY3; 49 | 50 |   3. 
代理池基本功能; -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Thompson.Lin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /proxy_pool/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 J_hao104 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /proxy_pool/Util/utilClass.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: utilClass.py 6 | Description : tool class 7 | Author : JHao 8 | date: 2016/12/3 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/3: Class LazyProperty 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | 17 | class LazyProperty(object): 18 | """ 19 | LazyProperty 20 | explain: http://www.spiderpy.cn/blog/5/ 21 | """ 22 | 23 | def __init__(self, func): 24 | self.func = func 25 | 26 | def __get__(self, instance, owner): 27 | if instance is None: 28 | return self 29 | else: 30 | value = self.func(instance) 31 | setattr(instance, self.func.__name__, value) 32 | return value 33 | 34 | 35 | class Singleton(type): 36 | """ 37 | Singleton Metaclass 38 | """ 39 | 40 | _inst = {} 41 | 42 | def __call__(cls, *args, **kwargs): 43 | if cls not in cls._inst: 44 | cls._inst[cls] = super(Singleton, cls).__call__(*args) 45 | return cls._inst[cls] 46 | -------------------------------------------------------------------------------- /proxy_pool/ProxyHelper/ProxyUtil.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: ProxyHelper 5 | Description : 6 | Author : JHao 7 | date: 2019/8/8 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/8: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from Util import validUsefulProxy 16 | 17 | from datetime import datetime 18 | 19 | 20 | def checkProxyUseful(proxy_obj): 21 | """ 22 | 检测代理是否可用 23 | :param proxy_obj: Proxy object 24 | :return: Proxy object, status 25 | """ 26 | 27 | if validUsefulProxy(proxy_obj.proxy): 28 | # 检测通过 更新proxy属性 29 | proxy_obj.check_count += 1 30 | proxy_obj.last_status = 1 31 | proxy_obj.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 32 | if proxy_obj.fail_count > 0: 33 | proxy_obj.fail_count -= 1 34 | return proxy_obj, True 35 | else: 36 | proxy_obj.check_count += 1 37 | proxy_obj.last_status = 0 38 | proxy_obj.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 39 | proxy_obj.fail_count += 1 40 | return proxy_obj, False 41 | -------------------------------------------------------------------------------- /proxy_pool/cli/proxyPool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxy_pool 5 | Description : 6 | Author : JHao 7 | date: 2019/8/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/2: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import sys 16 | import click 17 | import platform 18 | 19 | sys.path.append('../') 20 | 21 | from Config.setting import HEADER 22 | from Schedule.ProxyScheduler import runScheduler 23 | from Api.ProxyApi import runFlask,runFlaskWithGunicorn 24 | 25 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) 26 | 27 | 28 | @click.group(context_settings=CONTEXT_SETTINGS) 29 | @click.version_option(version='2.0.0') 30 | def cli(): 31 | 
"""ProxyPool cli工具""" 32 | 33 | 34 | @cli.command(name="schedule") 35 | def schedule(): 36 | """ 启动调度程序 """ 37 | click.echo(HEADER) 38 | runScheduler() 39 | 40 | 41 | @cli.command(name="webserver") 42 | def schedule(): 43 | """ 启动web服务 """ 44 | click.echo(HEADER) 45 | if platform.system() == "Windows": 46 | runFlask() 47 | else: 48 | runFlaskWithGunicorn() 49 | 50 | 51 | if __name__ == '__main__': 52 | cli() 53 | -------------------------------------------------------------------------------- /kuaishou/ks_down.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from multiprocessing import Pool 3 | from fake_useragent import UserAgent 4 | import time 5 | import os 6 | 7 | 8 | video_path ='./video/' 9 | 10 | UA = UserAgent() 11 | 12 | headers = { 13 | 'Connection': 'close', 14 | 'User-Agent':UA.random 15 | } 16 | 17 | def download(url): 18 | video_name = url[-24:] 19 | if os.path.exists(video_path+video_name) == True: 20 | print(video_name + ' 视频已存在,跳过') 21 | pass 22 | else: 23 | try: 24 | req = requests.get(url,headers=headers) 25 | req.raise_for_status() 26 | req.close() 27 | with open(video_path + video_name,'wb') as f: 28 | f.write(req.content) 29 | f.close() 30 | print(str(video_name) + ' ~下载完成!') 31 | except Exception as code: 32 | print(code) 33 | return None 34 | 35 | if __name__ == '__main__': 36 | start_time = time.time() 37 | pool = Pool(8) 38 | with open('./20200320.txt', 'r') as f: 39 | for line in f: 40 | line = line.split('?')[0] 41 | line = line.strip('/\n') 42 | pool.apply_async(download(line)) 43 | pool.close() 44 | pool.join() 45 | 46 | end_time = time.time() 47 | print('下载完成,总耗时:%s' % (end_time - start_time)) -------------------------------------------------------------------------------- /proxy_pool/Test/testGetFreeProxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testGetFreeProxy 5 | Description : test model ProxyGetter/getFreeProxy 6 | Author : J_hao 7 | date: 2017/7/31 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/7/31:function testGetFreeProxy 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | 16 | from ProxyGetter.getFreeProxy import GetFreeProxy 17 | from Config.ConfigGetter import config 18 | 19 | 20 | def testGetFreeProxy(): 21 | """ 22 | test class GetFreeProxy in ProxyGetter/GetFreeProxy 23 | :return: 24 | """ 25 | proxy_getter_functions = config.proxy_getter_functions 26 | for proxyGetter in proxy_getter_functions: 27 | proxy_count = 0 28 | for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): 29 | if proxy: 30 | print('{func}: fetch proxy {proxy},proxy_count:{proxy_count}'.format(func=proxyGetter, proxy=proxy, 31 | proxy_count=proxy_count)) 32 | proxy_count += 1 33 | # assert proxy_count >= 20, '{} fetch proxy fail'.format(proxyGetter) 34 | 35 | 36 | if __name__ == '__main__': 37 | testGetFreeProxy() 38 | -------------------------------------------------------------------------------- /proxy_pool/Schedule/ProxyScheduler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: ProxyScheduler 5 | Description : 6 | Author : JHao 7 | date: 2019/8/5 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/5: 
ProxyScheduler 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import sys 16 | from apscheduler.schedulers.blocking import BlockingScheduler 17 | 18 | sys.path.append('../') 19 | 20 | from Schedule import doRawProxyCheck, doUsefulProxyCheck 21 | from Manager import ProxyManager 22 | from Util import LogHandler 23 | 24 | 25 | class DoFetchProxy(ProxyManager): 26 | """ fetch proxy""" 27 | 28 | def __init__(self): 29 | ProxyManager.__init__(self) 30 | self.log = LogHandler('fetch_proxy') 31 | 32 | def main(self): 33 | self.log.info("start fetch proxy") 34 | self.fetch() 35 | self.log.info("finish fetch proxy") 36 | 37 | 38 | def rawProxyScheduler(): 39 | DoFetchProxy().main() 40 | doRawProxyCheck() 41 | 42 | 43 | def usefulProxyScheduler(): 44 | doUsefulProxyCheck() 45 | 46 | 47 | def runScheduler(): 48 | rawProxyScheduler() 49 | usefulProxyScheduler() 50 | 51 | scheduler_log = LogHandler("scheduler_log") 52 | scheduler = BlockingScheduler(logger=scheduler_log) 53 | 54 | scheduler.add_job(rawProxyScheduler, 'interval', minutes=5, id="raw_proxy_check", name="raw_proxy定时采集") 55 | scheduler.add_job(usefulProxyScheduler, 'interval', minutes=1, id="useful_proxy_check", name="useful_proxy定时检查") 56 | 57 | scheduler.start() 58 | 59 | 60 | if __name__ == '__main__': 61 | runScheduler() 62 | -------------------------------------------------------------------------------- /91user/run.py: -------------------------------------------------------------------------------- 1 | from user import User, ClientSqlite 2 | from config.uids import USERS_UID 3 | import json 4 | 5 | db = ClientSqlite() 6 | 7 | 8 | def fetchall_table(uid): 9 | sql = '''SELECT data FROM users WHERE uid = ('{}')'''.format(uid) 10 | result = db.fetchall_table(sql) 11 | if result != None: 12 | result = result[0] 13 | data = json.loads(result[0]) 14 | data_num = len(data) 15 | return {'data': data, 'data_num': int(data_num)} 16 | else: 17 | return None 18 | 19 | def insert_table(data): 20 | uid = data['uid'] 21 | name = data['name'] 22 | videos = json.dumps(data['data']) 23 | sql = '''INSERT INTO users(uid, name,data) VALUES('{0}','{1}','{2}')'''.format(uid, name, videos) 24 | db.insert_update_table(sql) 25 | 26 | def update_table(data): 27 | uid = data['uid'] 28 | name = data['name'] 29 | videos = json.dumps(data['data']) 30 | sql = '''UPDATE users SET data = ('{0}') WHERE uid = "{1}"'''.format(videos, uid) 31 | db.insert_update_table(sql) 32 | 33 | 34 | 35 | if __name__ == "__main__": 36 | for i in USERS_UID: 37 | user = User(i) 38 | public_data = user.public_data() 39 | public_num = public_data['public_video'] 40 | local_data = fetchall_table(i) 41 | if local_data != None and public_num > local_data['data_num']: 42 | # if public_num > local_data['data_num'] 43 | user_data = user.parse_video() 44 | update_table(user_data) 45 | print('数据更新.....\n') 46 | elif local_data != None and public_num == local_data['data_num']: 47 | print('公开视频与本地数据相符') 48 | pass 49 | else: 50 | user_data = user.parse_video() 51 | insert_table(user_data) 52 | print('数据新增.....\n') 53 | db.close_conn() -------------------------------------------------------------------------------- /proxy_pool/Config/ConfigGetter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: ConfigGetter 5 | Description : 读取配置 6 | Author : JHao 7 | date: 2019/2/15 8 | 
------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | from Util.utilClass import LazyProperty 17 | from Config.setting import * 18 | 19 | 20 | class ConfigGetter(object): 21 | """ 22 | get config 23 | """ 24 | 25 | def __init__(self): 26 | pass 27 | 28 | @LazyProperty 29 | def db_type(self): 30 | return DATABASES.get("default", {}).get("TYPE", "SSDB") 31 | 32 | @LazyProperty 33 | def db_name(self): 34 | return DATABASES.get("default", {}).get("NAME", "proxy") 35 | 36 | @LazyProperty 37 | def db_host(self): 38 | return DATABASES.get("default", {}).get("HOST", "127.0.0.1") 39 | 40 | @LazyProperty 41 | def db_port(self): 42 | return DATABASES.get("default", {}).get("PORT", 8888) 43 | 44 | @LazyProperty 45 | def db_password(self): 46 | return DATABASES.get("default", {}).get("PASSWORD", "") 47 | 48 | @LazyProperty 49 | def proxy_getter_functions(self): 50 | return PROXY_GETTER 51 | 52 | @LazyProperty 53 | def host_ip(self): 54 | return SERVER_API.get("HOST", "127.0.0.1") 55 | 56 | @LazyProperty 57 | def host_port(self): 58 | return SERVER_API.get("PORT", 5010) 59 | 60 | 61 | config = ConfigGetter() 62 | 63 | if __name__ == '__main__': 64 | print(config.db_type) 65 | print(config.db_name) 66 | print(config.db_host) 67 | print(config.db_port) 68 | print(config.proxy_getter_functions) 69 | print(config.host_ip) 70 | print(config.host_port) 71 | print(config.db_password) 72 | -------------------------------------------------------------------------------- /qicai/qicai_top50.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # date: 2018年10月15日 3 | 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import urllib.request 7 | import re 8 | from multiprocessing import Pool 9 | import random, time 10 | 11 | headers = { 12 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 13 | 'Host': 'www.qcenglish.com', 14 | 'Referer': 'http://www.qcenglish.com/' 15 | } 16 | 17 | url = 'http://www.qcenglish.com' 18 | host = 'http://www.qcenglish.com' 19 | 20 | download_path = './' 21 | 22 | def get_article(url): 23 | req = requests.get(url,headers=headers) 24 | req.encoding = req.apparent_encoding 25 | soup = BeautifulSoup(req.text,'lxml') 26 | try: 27 | pdf_title = soup.select('#details > dl > dd')[0].get_text() 28 | download_link = soup.select('#download > li > a')[-1].get('href') 29 | print('书名:' + pdf_title) 30 | print('下载链接:' + host + download_link) 31 | download_url = host + download_link 32 | download(download_url,pdf_title) 33 | except IndexError as e: 34 | print(e) 35 | pass 36 | 37 | def download(url, title): 38 | file_path = download_path + title + '.zip' 39 | urllib.request.urlretrieve(url, file_path) 40 | print('下载完成.......\n') 41 | print('延迟等待....Hold on!') 42 | time.sleep(random(3,10)) 43 | 44 | 45 | def get_list(url): 46 | top_list = [] 47 | req = requests.get(url,headers=headers) 48 | soup = BeautifulSoup(req.text,'lxml') 49 | pdf_list = soup.select('#rectop2 > ul > li > a') 50 | for p_list in pdf_list: 51 | p_list = p_list.get('href') 52 | top_list.append(p_list) 53 | # print(top_list) 54 | p = re.compile('_') 55 | clear_list = [x for x in top_list if not p.findall(x)] 56 | return clear_list 57 | 58 | 59 | if __name__ == '__main__': 60 | p = Pool() 61 | top_list = get_list(url) 62 | for article_url in 
top_list: 63 | start_url = host + article_url 64 | # get_article(start_url) 65 | start = p.apply_async(get_article(start_url)) 66 | p.close() 67 | p.join() 68 | if start.successful(): 69 | print('Top50 下载完成!\n') -------------------------------------------------------------------------------- /umei/app.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import sys, os 3 | from bs4 import BeautifulSoup 4 | from time import sleep 5 | 6 | urls = ['https://www.umei.fun/categories/16?page={}'.format(str(i)) for i in range(1,63)] 7 | cookie = 'your cookies' 8 | 9 | headers = { 10 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 11 | 'accept-encoding': 'gzip, deflate, br', 12 | 'accept-language': 'zh-CN,zh;q=0.9', 13 | 'cache-control': 'max-age=0', 14 | 'cookie': cookie, 15 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36' 16 | } 17 | 18 | def respon(url): 19 | response = requests.get(url,headers=headers) 20 | status = response.status_code 21 | if status == 200: 22 | return response.text 23 | else: 24 | return None 25 | 26 | def gerUrls(page): 27 | if page == None: 28 | print('None!') 29 | else: 30 | html = BeautifulSoup(page,'lxml') 31 | urls = html.select('div.section-white > div > div > div > div > div > div > div > a') 32 | for url in urls: 33 | url = 'https://www.umei.fun' + url.get('href') 34 | imgpage = respon(url) 35 | getImg(imgpage) 36 | 37 | def getImg(page): 38 | html = BeautifulSoup(page,'lxml') 39 | imgs = html.select('div.container > div > div > img') 40 | title = html.select('h2')[0].get_text() 41 | if imgs == []: 42 | print('No img!') 43 | pass 44 | else: 45 | for img in imgs: 46 | img = img.get('src') 47 | download(img,title) 48 | print(str(title) + ' download succesful!') 49 | 50 | 51 | def download(url,title): 52 | picPath = os.getcwd() + '\pic' + '\\' + str(title) 53 | if not os.path.exists(picPath): 54 | os.mkdir(picPath) 55 | con = requests.get(url) 56 | name = url[-8:] 57 | with open(picPath + '\\' + str(name) + '.jpg','wb') as f: 58 | f.write(con.content) 59 | f.flush() 60 | 61 | if __name__ == '__main__': 62 | for url in urls: 63 | print(url) 64 | try: 65 | res = respon(url) 66 | imgUrls = gerUrls(res) 67 | sleep(1) 68 | except: 69 | print('Error \n') 70 | continue 71 | -------------------------------------------------------------------------------- /proxy_pool/DB/MongodbClient.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | ------------------------------------------------- 4 | File Name: MongodbClient.py 5 | Description : 封装mongodb操作 6 | Author : JHao netAir 7 | date: 2017/3/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/3/3: 11 | 2017/9/26:完成对mongodb的支持 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'Maps netAir' 15 | 16 | from pymongo import MongoClient 17 | 18 | 19 | class MongodbClient(object): 20 | def __init__(self, name, host, port, **kwargs): 21 | self.name = name 22 | self.client = MongoClient(host, port, **kwargs) 23 | self.db = self.client.proxy 24 | 25 | def changeTable(self, name): 26 | self.name = name 27 | 28 | def get(self, proxy): 29 | data = self.db[self.name].find_one({'proxy': proxy}) 30 | return data['num'] if data != None else None 31 | 32 | def put(self, proxy, num=1): 33 | if 
self.db[self.name].find_one({'proxy': proxy}): 34 | return None 35 | else: 36 | self.db[self.name].insert({'proxy': proxy, 'num': num}) 37 | 38 | def pop(self): 39 | data = list(self.db[self.name].aggregate([{'$sample': {'size': 1}}])) 40 | if data: 41 | data = data[0] 42 | value = data['proxy'] 43 | self.delete(value) 44 | return {'proxy': value, 'value': data['num']} 45 | return None 46 | 47 | def delete(self, value): 48 | self.db[self.name].remove({'proxy': value}) 49 | 50 | def getAll(self): 51 | return {p['proxy']: p['num'] for p in self.db[self.name].find()} 52 | 53 | def clean(self): 54 | self.client.drop_database('proxy') 55 | 56 | def delete_all(self): 57 | self.db[self.name].remove() 58 | 59 | def update(self, key, value): 60 | self.db[self.name].update({'proxy': key}, {'$inc': {'num': value}}) 61 | 62 | def exists(self, key): 63 | return True if self.db[self.name].find_one({'proxy': key}) != None else False 64 | 65 | def getNumber(self): 66 | return self.db[self.name].count() 67 | 68 | 69 | if __name__ == "__main__": 70 | db = MongodbClient('first', 'localhost', 27017) 71 | # db.put('127.0.0.1:1') 72 | # db2 = MongodbClient('second', 'localhost', 27017) 73 | # db2.put('127.0.0.1:2') 74 | print(db.pop()) 75 | -------------------------------------------------------------------------------- /proxy_pool/ProxyGetter/CheckProxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: CheckProxy 5 | Description : used for check getFreeProxy.py 6 | Author : JHao 7 | date: 2018/7/10 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2018/7/10: CheckProxy 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from getFreeProxy import GetFreeProxy 16 | from Util.utilFunction import verifyProxyFormat 17 | 18 | 19 | from Util.LogHandler import LogHandler 20 | 21 | log = LogHandler('check_proxy', file=False) 22 | 23 | 24 | class CheckProxy(object): 25 | 26 | @staticmethod 27 | def checkAllGetProxyFunc(): 28 | """ 29 | 检查getFreeProxy所有代理获取函数运行情况 30 | Returns: 31 | None 32 | """ 33 | import inspect 34 | member_list = inspect.getmembers(GetFreeProxy, predicate=inspect.isfunction) 35 | proxy_count_dict = dict() 36 | for func_name, func in member_list: 37 | log.info(u"开始运行 {}".format(func_name)) 38 | try: 39 | proxy_list = [_ for _ in func() if verifyProxyFormat(_)] 40 | proxy_count_dict[func_name] = len(proxy_list) 41 | except Exception as e: 42 | log.info(u"代理获取函数 {} 运行出错!".format(func_name)) 43 | log.error(str(e)) 44 | log.info(u"所有函数运行完毕 " + "***" * 5) 45 | for func_name, func in member_list: 46 | log.info(u"函数 {n}, 获取到代理数: {c}".format(n=func_name, c=proxy_count_dict.get(func_name, 0))) 47 | 48 | @staticmethod 49 | def checkGetProxyFunc(func): 50 | """ 51 | 检查指定的getFreeProxy某个function运行情况 52 | Args: 53 | func: getFreeProxy中某个可调用方法 54 | 55 | Returns: 56 | None 57 | """ 58 | func_name = getattr(func, '__name__', "None") 59 | log.info("start running func: {}".format(func_name)) 60 | count = 0 61 | for proxy in func(): 62 | if verifyProxyFormat(proxy): 63 | log.info("{} fetch proxy: {}".format(func_name, proxy)) 64 | count += 1 65 | log.info("{n} completed, fetch proxy number: {c}".format(n=func_name, c=count)) 66 | 67 | 68 | if __name__ == '__main__': 69 | CheckProxy.checkAllGetProxyFunc() 70 | CheckProxy.checkGetProxyFunc(GetFreeProxy.freeProxy01) 
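# Editor's note: an illustrative sketch only, not part of the original module. A
# getter can also be resolved from its configured name string (the same strings
# listed in Config/setting.py's PROXY_GETTER), mirroring Test/testGetFreeProxy.py:
def _check_getter_by_name(name="freeProxy01"):  # hypothetical helper, not called here
    CheckProxy.checkGetProxyFunc(getattr(GetFreeProxy, name))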
-------------------------------------------------------------------------------- /kuaishou/ks_video.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from random import randint 3 | from bs4 import BeautifulSoup 4 | from fake_useragent import UserAgent 5 | from time import sleep 6 | import os 7 | UA = UserAgent() 8 | 9 | headers = { 10 | 'Connection': 'close', 11 | 'User-Agent':UA.random 12 | } 13 | 14 | video_path = './video/' 15 | 16 | 17 | def get_page(url): 18 | ''' 19 | :return: response 20 | ''' 21 | try: 22 | req = requests.get(url,headers=headers) 23 | req.raise_for_status() 24 | req.close() 25 | req.encoding = 'utf-8' 26 | return req 27 | except Exception as code: 28 | print(code) 29 | sleep(3) 30 | 31 | def download(url): 32 | video_name = url[-24:] 33 | if os.path.exists(video_path+video_name) == True: 34 | print(video_name + ' 视频已存在,跳过') 35 | pass 36 | else: 37 | try: 38 | req = requests.get(url,headers=headers) 39 | req.raise_for_status() 40 | req.close() 41 | with open(video_path + video_name,'wb') as f: 42 | f.write(req.content) 43 | f.close() 44 | print(str(video_name) + ' ~下载完成!') 45 | except Exception as code: 46 | print(code) 47 | return None 48 | 49 | 50 | def parse_xiacoo(html): 51 | # http://v.xiacoo.com 52 | soup = BeautifulSoup(html.text, 'lxml') 53 | video_src = soup.select('source')[0] 54 | video_url = video_src.get('src').split('?', 1) 55 | url = video_url[0] 56 | return url 57 | 58 | def parse_xjj(html): 59 | # https://xjj.show/ks.php 60 | soup = BeautifulSoup(html.text, 'lxml') 61 | video_src = soup.find_all('video')[0] 62 | video_url = video_src.get('src').split('?', 1) 63 | url = video_url[0] 64 | return url 65 | 66 | 67 | if __name__ == '__main__': 68 | print('1: v.xiacoo.com; 2: xjj.show;') 69 | select = int(input('Please input your select:')) 70 | if select == 1: 71 | start_url = 'http://v.xiacoo.com' 72 | print('start url: ' + str(start_url)) 73 | elif select == 2: 74 | start_url = 'https://xjj.show/ks.php' 75 | print('start url: ' + str(start_url)) 76 | else: 77 | print('ERROR: check your input!') 78 | exit() 79 | while True: 80 | try: 81 | video_page = get_page(start_url) 82 | if video_page == None: 83 | print('url is None!') 84 | video_page = get_page(start_url) 85 | else: 86 | video_url = parse_xjj(video_page) 87 | download(video_url) 88 | sleep(randint(1,3)) 89 | except TimeoutError as code: 90 | print(code) -------------------------------------------------------------------------------- /cmanuf/download.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pdfkit 3 | from time import sleep 4 | from bs4 import BeautifulSoup 5 | 6 | ''' 7 | No.1 手痒撸的,单次只能下载一本,只能下载H5中内容自行合并PDF,不下载PDF。 8 | No.2 token每小时都需要更新,获取方法自行网站中debug。 9 | No.3 book ID not detail ID 10 | No.4 感谢机械工业出版社...... 
11 | 12 | 13 | ''' 14 | 15 | headers = { 16 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 17 | 'Accept-Encoding': 'gzip, deflate', 18 | 'Accept-Language': 'zh-CN,zh;q=0.9', 19 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 20 | 'Cookie': 'JSESSIONID=A6DF07780010F3F5D221497A3A345A8D', 21 | 'DNT': '1', 22 | 'Host': 'www.hzcourse.com', 23 | 'Origin': 'http://www.hzcourse.com', 24 | 'Proxy-Connection': 'keep-alive', 25 | 'Referer': 'http://www.hzcourse.com', 26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36', 27 | 'X-Requested-With': 'XMLHttpRequest' 28 | } 29 | 30 | url = 'http://www.hzcourse.com/web/refbook/queryAllChapterList' 31 | 32 | path_wk = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe' 33 | config = pdfkit.configuration(wkhtmltopdf=path_wk) 34 | options = { 35 | 'page-size': 'Letter', 36 | 'margin-top': '0.75in', 37 | 'margin-right': '0.75in', 38 | 'margin-bottom': '0.75in', 39 | 'margin-left': '0.75in', 40 | 'encoding': "UTF-8", 41 | 'no-outline': None 42 | } 43 | 44 | def getUrls(url,data): 45 | res = requests.post(url,data=data) 46 | jsdata = res.json() 47 | urls = [] 48 | data = jsdata['data'] 49 | for i in data['data']: 50 | link = i['ref'] 51 | urls.append(link) 52 | return urls 53 | 54 | def download(links): 55 | num = 1 56 | for i in links: 57 | xtm = requests.get(url = 'http://www.hzcourse.com/resource/readBook?path=' + str(i),headers=headers) 58 | soup = BeautifulSoup(xtm.text,'lxml') 59 | for img in soup.find_all('img'): 60 | img['src'] = 'http://www.hzcourse.com/resource/readBook?path=/openresources/teach_ebook/uncompressed/18563/OEBPS/Text/' + img['src'] 61 | article = str(soup).encode('utf-8') 62 | with open(str(num) + '.html','wb') as f: 63 | f.write(article) 64 | f.close() 65 | try: 66 | pdfkit.from_file(str(num) + '.html',str(num) + '.pdf',configuration=config,options=options) 67 | except Exception as e: 68 | print('Error for ' + str(e) + ',Page :' + str(num)) 69 | num += 1 70 | sleep(1) 71 | 72 | 73 | if __name__ == '__main__': 74 | bookid = input("Please input bookid:") 75 | postData = { 76 | 'ebookId': bookid, 77 | 'token': '5a1536002e3441d0af4c3d640d0b37e9' 78 | } 79 | links = getUrls(url,postData) 80 | download(links) 81 | -------------------------------------------------------------------------------- /proxy_pool/Schedule/UsefulProxyCheck.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: UsefulProxyCheck 5 | Description : check useful proxy 6 | Author : JHao 7 | date: 2019/8/7 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/7: check useful proxy 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from threading import Thread 16 | 17 | try: 18 | from Queue import Queue, Empty # py2 19 | except: 20 | from queue import Queue, Empty # py3 21 | 22 | from Util import LogHandler 23 | from Manager import ProxyManager 24 | from ProxyHelper import checkProxyUseful, Proxy 25 | 26 | FAIL_COUNT = 0 27 | 28 | 29 | class UsefulProxyCheck(ProxyManager, Thread): 30 | def __init__(self, queue, thread_name): 31 | ProxyManager.__init__(self) 32 | Thread.__init__(self, name=thread_name) 33 | 34 | self.queue = queue 35 | self.log = LogHandler('useful_proxy_check') 36 | 37 | def run(self): 38 | self.log.info("UsefulProxyCheck - {} : 
start".format(self.name)) 39 | self.db.changeTable(self.useful_proxy_queue) 40 | while True: 41 | try: 42 | proxy_str = self.queue.get(block=False) 43 | except Empty: 44 | self.log.info("UsefulProxyCheck - {} : exit".format(self.name)) 45 | break 46 | 47 | proxy_obj = Proxy.newProxyFromJson(proxy_str) 48 | proxy_obj, status = checkProxyUseful(proxy_obj) 49 | if status or proxy_obj.fail_count < FAIL_COUNT: 50 | self.db.put(proxy_obj) 51 | self.log.info('UsefulProxyCheck - {} : {} validation pass'.format(self.name, 52 | proxy_obj.proxy.ljust(20))) 53 | else: 54 | self.log.info('UsefulProxyCheck - {} : {} validation fail'.format(self.name, 55 | proxy_obj.proxy.ljust(20))) 56 | self.db.delete(proxy_obj.proxy) 57 | self.queue.task_done() 58 | 59 | 60 | def doUsefulProxyCheck(): 61 | proxy_queue = Queue() 62 | 63 | pm = ProxyManager() 64 | pm.db.changeTable(pm.useful_proxy_queue) 65 | for _proxy in pm.db.getAll(): 66 | proxy_queue.put(_proxy) 67 | 68 | thread_list = list() 69 | for index in range(10): 70 | thread_list.append(UsefulProxyCheck(proxy_queue, "thread_%s" % index)) 71 | 72 | for thread in thread_list: 73 | thread.start() 74 | 75 | for thread in thread_list: 76 | thread.join() 77 | 78 | 79 | if __name__ == '__main__': 80 | doUsefulProxyCheck() 81 | -------------------------------------------------------------------------------- /proxy_pool/Schedule/RawProxyCheck.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: RawProxyCheck 5 | Description : check raw_proxy to useful 6 | Author : JHao 7 | date: 2019/8/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/6: check raw_proxy to useful 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from threading import Thread 16 | 17 | try: 18 | from Queue import Empty, Queue # py2 19 | except: 20 | from queue import Empty, Queue # py3 21 | 22 | from Util import LogHandler 23 | from Manager import ProxyManager 24 | from ProxyHelper import Proxy, checkProxyUseful 25 | 26 | 27 | class RawProxyCheck(ProxyManager, Thread): 28 | def __init__(self, queue, thread_name): 29 | ProxyManager.__init__(self) 30 | Thread.__init__(self, name=thread_name) 31 | self.log = LogHandler('raw_proxy_check') 32 | self.queue = queue 33 | 34 | def run(self): 35 | self.log.info("RawProxyCheck - {} : start".format(self.name)) 36 | self.db.changeTable(self.useful_proxy_queue) 37 | while True: 38 | try: 39 | proxy_json = self.queue.get(block=False) 40 | except Empty: 41 | self.log.info("RawProxyCheck - {} : exit".format(self.name)) 42 | break 43 | 44 | proxy_obj = Proxy.newProxyFromJson(proxy_json) 45 | 46 | proxy_obj, status = checkProxyUseful(proxy_obj) 47 | if status: 48 | if self.db.exists(proxy_obj.proxy): 49 | self.log.info('RawProxyCheck - {} : {} validation exists'.format(self.name, 50 | proxy_obj.proxy.ljust(20))) 51 | else: 52 | self.db.put(proxy_obj) 53 | self.log.info( 54 | 'RawProxyCheck - {} : {} validation pass'.format(self.name, proxy_obj.proxy.ljust(20))) 55 | else: 56 | self.log.info('RawProxyCheck - {} : {} validation fail'.format(self.name, proxy_obj.proxy.ljust(20))) 57 | self.queue.task_done() 58 | 59 | 60 | def doRawProxyCheck(): 61 | proxy_queue = Queue() 62 | 63 | pm = ProxyManager() 64 | pm.db.changeTable(pm.raw_proxy_queue) 65 | for _proxy in pm.db.getAll(): 66 | proxy_queue.put(_proxy) 67 | pm.db.clear() 68 | 69 | thread_list = 
list() 70 | for index in range(20): 71 | thread_list.append(RawProxyCheck(proxy_queue, "thread_%s" % index)) 72 | 73 | for thread in thread_list: 74 | thread.start() 75 | 76 | for thread in thread_list: 77 | thread.join() 78 | 79 | 80 | if __name__ == '__main__': 81 | doRawProxyCheck() 82 | -------------------------------------------------------------------------------- /proxy_pool/Util/utilFunction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: utilFunction.py 6 | Description : tool function 7 | Author : JHao 8 | date: 2016/11/25 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/11/25: 添加robustCrawl、verifyProxy、getHtmlTree 12 | ------------------------------------------------- 13 | """ 14 | import requests 15 | from lxml import etree 16 | 17 | from Util.WebRequest import WebRequest 18 | 19 | 20 | def robustCrawl(func): 21 | def decorate(*args, **kwargs): 22 | try: 23 | return func(*args, **kwargs) 24 | except Exception as e: 25 | pass 26 | # logger.info(u"sorry, 抓取出错。错误原因:") 27 | # logger.info(e) 28 | 29 | return decorate 30 | 31 | 32 | def verifyProxyFormat(proxy): 33 | """ 34 | 检查代理格式 35 | :param proxy: 36 | :return: 37 | """ 38 | import re 39 | verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" 40 | _proxy = re.findall(verify_regex, proxy) 41 | return True if len(_proxy) == 1 and _proxy[0] == proxy else False 42 | 43 | 44 | def getHtmlTree(url, **kwargs): 45 | """ 46 | 获取html树 47 | :param url: 48 | :param kwargs: 49 | :return: 50 | """ 51 | 52 | header = {'Connection': 'keep-alive', 53 | 'Cache-Control': 'max-age=0', 54 | 'Upgrade-Insecure-Requests': '1', 55 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)', 56 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 57 | 'Accept-Encoding': 'gzip, deflate, sdch', 58 | 'Accept-Language': 'zh-CN,zh;q=0.8', 59 | } 60 | # TODO 取代理服务器用代理服务器访问 61 | wr = WebRequest() 62 | html = wr.get(url=url, header=header).content 63 | return etree.HTML(html) 64 | 65 | 66 | def tcpConnect(proxy): 67 | """ 68 | TCP 三次握手 69 | :param proxy: 70 | :return: 71 | """ 72 | from socket import socket, AF_INET, SOCK_STREAM 73 | s = socket(AF_INET, SOCK_STREAM) 74 | ip, port = proxy.split(':') 75 | result = s.connect_ex((ip, int(port))) 76 | return True if result == 0 else False 77 | 78 | 79 | def validUsefulProxy(proxy): 80 | """ 81 | 检验代理是否可用 82 | :param proxy: 83 | :return: 84 | """ 85 | if isinstance(proxy, bytes): 86 | proxy = proxy.decode("utf8") 87 | proxies = {"http": "http://{proxy}".format(proxy=proxy)} 88 | try: 89 | r = requests.get('http://www.baidu.com', proxies=proxies, timeout=10, verify=False) 90 | if r.status_code == 200: 91 | return True 92 | except Exception as e: 93 | pass 94 | return False 95 | 96 | -------------------------------------------------------------------------------- /proxy_pool/Config/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: setting.py 5 | Description : 配置文件 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | 14 | import sys 15 | from os 
import getenv 16 | from logging import getLogger 17 | 18 | log = getLogger(__name__) 19 | 20 | HEADER = """ 21 | **************************************************************** 22 | *** ______ ********************* ______ *********** _ ******** 23 | *** | ___ \_ ******************** | ___ \ ********* | | ******** 24 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** 25 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** 26 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** 27 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** 28 | **** __ / / ***** 29 | ************************* /___ / ******************************* 30 | ************************* ******************************** 31 | **************************************************************** 32 | """ 33 | 34 | PY3 = sys.version_info >= (3,) 35 | 36 | DB_TYPE = getenv('db_type', 'SSDB').upper() 37 | DB_HOST = getenv('db_host', '127.0.0.1') 38 | DB_PORT = getenv('db_port', 8888) 39 | DB_PASSWORD = getenv('db_password', '') 40 | 41 | 42 | """ 数据库配置 """ 43 | DATABASES = { 44 | "default": { 45 | "TYPE": DB_TYPE, 46 | "HOST": DB_HOST, 47 | "PORT": DB_PORT, 48 | "NAME": "proxy", 49 | "PASSWORD": DB_PASSWORD 50 | } 51 | } 52 | 53 | # register the proxy getter function 54 | 55 | PROXY_GETTER = [ 56 | "freeProxy01", 57 | # "freeProxy02", 58 | "freeProxy03", 59 | "freeProxy04", 60 | "freeProxy05", 61 | # "freeProxy06", 62 | "freeProxy07", 63 | # "freeProxy08", 64 | "freeProxy09", 65 | "freeProxy13", 66 | "freeProxy14", 67 | "freeProxy14", 68 | ] 69 | 70 | """ API config http://127.0.0.1:5010 """ 71 | SERVER_API = { 72 | "HOST": "0.0.0.0", # The ip specified which starting the web API 73 | "PORT": 5010 # port number to which the server listens to 74 | } 75 | 76 | 77 | class ConfigError(BaseException): 78 | pass 79 | 80 | 81 | def checkConfig(): 82 | if DB_TYPE not in ["SSDB", "REDIS"]: 83 | raise ConfigError('db_type Do not support: %s, must SSDB/REDIS .' 
% DB_TYPE) 84 | 85 | if type(DB_PORT) == str and not DB_PORT.isdigit(): 86 | raise ConfigError('if db_port is string, it must be digit, not %s' % DB_PORT) 87 | 88 | from ProxyGetter import getFreeProxy 89 | illegal_getter = list(filter(lambda key: not hasattr(getFreeProxy.GetFreeProxy, key), PROXY_GETTER)) 90 | if len(illegal_getter) > 0: 91 | raise ConfigError("ProxyGetter: %s does not exists" % "/".join(illegal_getter)) 92 | 93 | 94 | checkConfig() 95 | -------------------------------------------------------------------------------- /proxy_pool/Util/LogHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: LogHandler.py 5 | Description : 日志操作模块 6 | Author : JHao 7 | date: 2017/3/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/3/6: log handler 11 | 2017/9/21: 屏幕输出/文件输出 可选(默认屏幕和文件均输出) 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | import os 17 | 18 | import logging 19 | 20 | from logging.handlers import TimedRotatingFileHandler 21 | 22 | # 日志级别 23 | CRITICAL = 50 24 | FATAL = CRITICAL 25 | ERROR = 40 26 | WARNING = 30 27 | WARN = WARNING 28 | INFO = 20 29 | DEBUG = 10 30 | NOTSET = 0 31 | 32 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) 33 | ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir) 34 | LOG_PATH = os.path.join(ROOT_PATH, 'log') 35 | 36 | if not os.path.exists(LOG_PATH): 37 | os.mkdir(LOG_PATH) 38 | 39 | 40 | class LogHandler(logging.Logger): 41 | """ 42 | LogHandler 43 | """ 44 | 45 | def __init__(self, name, level=DEBUG, stream=True, file=True): 46 | self.name = name 47 | self.level = level 48 | logging.Logger.__init__(self, self.name, level=level) 49 | if stream: 50 | self.__setStreamHandler__() 51 | if file: 52 | self.__setFileHandler__() 53 | 54 | def __setFileHandler__(self, level=None): 55 | """ 56 | set file handler 57 | :param level: 58 | :return: 59 | """ 60 | file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name)) 61 | # 设置日志回滚, 保存在log目录, 一天保存一个文件, 保留15天 62 | file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15) 63 | file_handler.suffix = '%Y%m%d.log' 64 | if not level: 65 | file_handler.setLevel(self.level) 66 | else: 67 | file_handler.setLevel(level) 68 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 69 | 70 | file_handler.setFormatter(formatter) 71 | self.file_handler = file_handler 72 | self.addHandler(file_handler) 73 | 74 | def __setStreamHandler__(self, level=None): 75 | """ 76 | set stream handler 77 | :param level: 78 | :return: 79 | """ 80 | stream_handler = logging.StreamHandler() 81 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 82 | stream_handler.setFormatter(formatter) 83 | if not level: 84 | stream_handler.setLevel(self.level) 85 | else: 86 | stream_handler.setLevel(level) 87 | self.addHandler(stream_handler) 88 | 89 | def resetName(self, name): 90 | """ 91 | reset name 92 | :param name: 93 | :return: 94 | """ 95 | self.name = name 96 | self.removeHandler(self.file_handler) 97 | self.__setFileHandler__() 98 | 99 | 100 | if __name__ == '__main__': 101 | log = LogHandler('test') 102 | log.info('this is a test msg') 103 | -------------------------------------------------------------------------------- /proxy_pool/Util/WebRequest.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: WebRequest 5 | Description : Network Requests Class 6 | Author : J_hao 7 | date: 2017/7/31 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/7/31: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from requests.models import Response 16 | import requests 17 | import random 18 | import time 19 | 20 | 21 | class WebRequest(object): 22 | def __init__(self, *args, **kwargs): 23 | pass 24 | 25 | @property 26 | def user_agent(self): 27 | """ 28 | return an User-Agent at random 29 | :return: 30 | """ 31 | ua_list = [ 32 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101', 33 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122', 34 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71', 35 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95', 36 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71', 37 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', 38 | 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 39 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 40 | ] 41 | return random.choice(ua_list) 42 | 43 | @property 44 | def header(self): 45 | """ 46 | basic header 47 | :return: 48 | """ 49 | return {'User-Agent': self.user_agent, 50 | 'Accept': '*/*', 51 | 'Connection': 'keep-alive', 52 | 'Accept-Language': 'zh-CN,zh;q=0.8'} 53 | 54 | def get(self, url, header=None, retry_time=5, timeout=30, 55 | retry_flag=list(), retry_interval=5, *args, **kwargs): 56 | """ 57 | get method 58 | :param url: target url 59 | :param header: headers 60 | :param retry_time: retry time when network error 61 | :param timeout: network timeout 62 | :param retry_flag: if retry_flag in content. 
do retry 63 | :param retry_interval: retry interval(second) 64 | :param args: 65 | :param kwargs: 66 | :return: 67 | """ 68 | headers = self.header 69 | if header and isinstance(header, dict): 70 | headers.update(header) 71 | while True: 72 | try: 73 | html = requests.get(url, headers=headers, timeout=timeout, **kwargs) 74 | if any(f in html.content for f in retry_flag): 75 | raise Exception 76 | return html 77 | except Exception as e: 78 | print(e) 79 | retry_time -= 1 80 | if retry_time <= 0: 81 | # 多次请求失败 82 | resp = Response() 83 | resp.status_code = 200 84 | return resp 85 | time.sleep(retry_interval) 86 | -------------------------------------------------------------------------------- /cableav.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from fake_useragent import UserAgent 4 | import re 5 | from datetime import datetime 6 | from time import sleep 7 | from random import randint 8 | 9 | FILE_PATH = './' 10 | 11 | host = 'https://www.cableav.tv/' 12 | 13 | proxies = { 14 | 'http': 'http://127.0.0.1:7890', 15 | 'https': 'http://127.0.0.1:7890' 16 | } 17 | ua = UserAgent() 18 | headers = { 19 | "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", 20 | "accept-encoding": "gzip, deflate, br", 21 | "accept-language": "zh-CN,zh;q=0.9", 22 | "cache-control": "max-age=0", 23 | "dnt":"1", 24 | "referer":"https://cableav.tv/playlist/", 25 | "user-agent": ua.random 26 | } 27 | 28 | def open_page(url): 29 | 30 | sleep(randint(1,3)) 31 | print('\n{} - [INFO]: requests at {}'.format( 32 | datetime.now().strftime("%Y-%m-%d %H:%M:%S"),url)) 33 | 34 | req = requests.get(url,headers=headers,proxies=proxies) 35 | try: 36 | if req.status_code == 200 or req.status_code == 304: 37 | req.encoding = 'utf-8' 38 | return req 39 | except TimeoutError: 40 | print("Timeout:") 41 | cnt = 0 42 | while cnt < 3: 43 | open_page(url) 44 | cnt += 1 45 | 46 | def parse_playlist(html): 47 | 48 | if html != None: 49 | page = BeautifulSoup(html.text,'lxml') 50 | video_urls = page.select('div.listing-content > h3 > a') 51 | for i in video_urls: 52 | data = i.get('href') 53 | yield data 54 | else: 55 | print("Result is None! 
\n") 56 | pass 57 | 58 | def parse_video(html): 59 | PATTERN_URL = r'.*\"single_media_sources\":(\[\{.*\}\])' 60 | if html != None: 61 | page = BeautifulSoup(html.text,'lxml') 62 | m3u8 = page.find("meta", {"property": "og:video:url"})["content"] 63 | video_tags = page.find_all("meta", {"property": "video:tag"}) 64 | best_quality = max([int(tag["content"][: -1]) for tag in video_tags]) 65 | title = page.find("title").text.replace(' - CableAV','') 66 | 67 | for line in html.text.split('\n'): 68 | match = re.match(PATTERN_URL, line) 69 | if match: 70 | quality_lists = eval(match.group(1)) 71 | for quality in quality_lists: 72 | if str(best_quality) in quality['source_label']: 73 | m3u8 = quality['source_file'].replace('\/', '/') 74 | break 75 | # return [title,m3u8] 76 | save_file(title,m3u8) 77 | 78 | 79 | def save_file(title,m3u8): 80 | try: 81 | with open(FILE_PATH + 'test.txt','ab+') as f: 82 | result = '{},{}\r\n'.format(title,m3u8) 83 | f.write(result.encode('utf-8')) 84 | f.close() 85 | except IOError as e: 86 | print(e) 87 | pass 88 | 89 | def run(url): 90 | page = open_page(url) 91 | play_list = parse_playlist(page) 92 | for i in play_list: 93 | video_page = open_page(i) 94 | parse_video(video_page) 95 | 96 | if __name__ == '__main__': 97 | while True: 98 | start_url = input("Input page URL: \n") 99 | page_num = int(input('Input page list num:\n')) 100 | if page_num <= 1: 101 | run(start_url) 102 | else: 103 | urls = [start_url + "page/" + "{}/".format(x) for x in range(2,page_num+1)] 104 | run(start_url) 105 | for url in urls: 106 | run(url) 107 | -------------------------------------------------------------------------------- /baiduMap/baiduMap.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import requests 3 | import pandas as pd 4 | from time import sleep 5 | 6 | ak='' 7 | # KeyWord=u'早教' 8 | # City=u'北京市' 9 | # Tag=u'教育培训' 10 | # Page=0 11 | 12 | def getJson(url): 13 | response = requests.get(url) 14 | status = response.status_code 15 | data = response.json() 16 | if status == 200: 17 | return data 18 | else: 19 | num = 1 20 | while num < 4: 21 | print('连接错误!尝试重新获取! 
当前获取次数:' + str(num)) 22 | num += 1 23 | sleep(3) 24 | print(url) 25 | getJson(url) 26 | print('Error!') 27 | return None 28 | 29 | def getNum(data): 30 | # num = 0 31 | if data == None: 32 | return None 33 | else: 34 | total = data['total'] 35 | # results = data['results'] 36 | if total > 20: 37 | # pageNum = total // 20 38 | pageNum = int((total + 20 - 1) / 20) 39 | print('共检索到' + str(total) + '数据,共计:' + str(pageNum) + '页!') 40 | return pageNum 41 | else: 42 | pageNum = 1 43 | return pageNum 44 | 45 | def parseData(data): 46 | if data == None: 47 | print('data is None!') 48 | else: 49 | datalist = [] 50 | results = data['results'] 51 | for i in results: 52 | name = i['name'] 53 | add = i['address'] 54 | detail = i['detail_info'] 55 | mapUrl = detail['detail_url'] 56 | if i.__contains__('telephone') == True: 57 | tel = i['telephone'] 58 | else: 59 | tel = None 60 | 61 | tempData = { 62 | 'name': name, 63 | 'address': add, 64 | 'tel': str(tel), 65 | 'mapUrl': str(mapUrl) 66 | } 67 | datalist.append(tempData) 68 | return datalist 69 | 70 | if __name__ == '__main__': 71 | headers = ['name','address','tel','map'] 72 | KeyWord = input('输入检索关键词: \n') 73 | Tag = input('输入分类标签: \n') 74 | City = input('检索城市(市): \n') 75 | startUrl = 'http://api.map.baidu.com/place/v2/search?query=' + KeyWord + \ 76 | '&tag=' + Tag + \ 77 | '&region=' + City + \ 78 | '&output=json' + \ 79 | '&ak=' + ak + \ 80 | '&scope=2&page_size=20' + \ 81 | '&page_num=0' 82 | json = getJson(startUrl) 83 | pageNum = getNum(json) 84 | if pageNum == None: 85 | print('No page number!') 86 | else: 87 | for num in range(0,int(pageNum)): 88 | url = 'http://api.map.baidu.com/place/v2/search?query=' + KeyWord + \ 89 | '&tag=' + Tag + \ 90 | '&region=' + City + \ 91 | '&output=json' + \ 92 | '&ak=' + ak + \ 93 | '&scope=2&page_size=20' + \ 94 | '&page_num=' + str(num) 95 | print('page is :' + str(num)) 96 | data = getJson(url) 97 | sleep(5) 98 | datalist = parseData(data) 99 | save = pd.DataFrame(datalist) 100 | try: 101 | save.to_csv('./result.csv',header=headers,index=False,mode='a+',encoding='utf_8_sig') 102 | except UnicodeEncodeError: 103 | print('编码错误!') 104 | -------------------------------------------------------------------------------- /qicai/QicaiCategoriesSpider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import urllib.request 4 | import pymongo 5 | from multiprocessing import Pool 6 | 7 | mongo_client = pymongo.MongoClient('localhost',27017) 8 | db = mongo_client['spider_db'] 9 | qcdb = db.client['qcdb'] 10 | 11 | 12 | host = 'http://www.qcenglish.com' 13 | 14 | headers = { 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 16 | 'Host': 'www.qcenglish.com', 17 | 'Referer': host 18 | } 19 | 20 | download_path = './tmp//' 21 | 22 | def get_article(url): 23 | req = requests.get(url,headers=headers) 24 | req.encoding = req.apparent_encoding 25 | soup = BeautifulSoup(req.text,'lxml') 26 | try: 27 | pdf_title = soup.select('#details > dl > dd')[0].get_text() 28 | download_link = soup.select('#download > li > a')[0].get('href') 29 | print('书名:' + pdf_title) 30 | print('下载链接:' + host + download_link) 31 | download_url = host + download_link 32 | download(download_url,pdf_title) 33 | except IndexError as e: 34 | print(e) 35 | pass 36 | 37 | def download(url, title): 38 | file_path = download_path + title + '.zip' 39 | conunter = 1 40 | try: 41 |
urllib.request.urlretrieve(url, file_path) 42 | except urllib.error.URLError as e: 43 | while conunter <= 3: 44 | print("尝试重连,当前次数:" + str(conunter)) 45 | download(url,title) 46 | conunter += 1 47 | pass 48 | print('下载完成.......') 49 | 50 | def get_item_url(url): 51 | print('当前URL: ' + url) 52 | wb_date = requests.get(url,headers=headers) 53 | wb_date.encoding = wb_date.apparent_encoding 54 | soup = BeautifulSoup(wb_date.text,'lxml') 55 | items = soup.select('#container > div.content > dl.listitem > a') 56 | for item in items: 57 | # item = item.get('href') 58 | data = { 59 | 'item_url': host + item.get('href'), 60 | 'status': 0 61 | } 62 | print(data) 63 | qcdb.url.insert(data) 64 | print('当前列表页爬取完成!\n') 65 | 66 | def url_generator(page_id,page_sum): 67 | page_sum = page_sum + 1 68 | for y in range(1,page_sum): 69 | url = 'http://www.qcenglish.com/ebook/list_' + str(page_id) + '_{}.html'.format(str(y)) 70 | get_item_url(url) 71 | print('文章页获取ing....') 72 | 73 | 74 | # url_generator(54,12) 75 | 76 | 77 | if __name__ == '__main__': 78 | p = Pool() 79 | for item in qcdb.url.find(): 80 | item_status = item.get('status') 81 | item_url = item.get('item_url') 82 | if item_status == 0: 83 | print('当前内容页:' + item_url) 84 | try: 85 | p.apply_async(get_article(item_url)) 86 | qcdb.url.update({'item_url':item_url},{"$set":{"item_url":item_url,"status":1}},multi=False) 87 | except: 88 | print('发现一个玄学问题!') 89 | bad_url = { 90 | 'badURL': item_url, 91 | 'status': 0 92 | } 93 | qcdb.badurl.insert(bad_url) 94 | print('已加入BadURL中,请注意查看!') 95 | pass 96 | else: 97 | print('已经爬取过了····') 98 | print("等待新的线程加入!") 99 | p.close() 100 | p.join() 101 | print('完成!\n') -------------------------------------------------------------------------------- /proxy_pool/DB/DbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: DbClient.py 6 | Description : DB工厂类 7 | Author : JHao 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/2: 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | import os 17 | import sys 18 | 19 | from Config.ConfigGetter import config 20 | from Util import Singleton 21 | 22 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 23 | 24 | 25 | class DbClient(object): 26 | """ 27 | DbClient DB工厂类 提供get/put/update/pop/delete/exists/getAll/clean/getNumber/changeTable方法 28 | 29 | 目前存放代理的有两种, 使用changeTable方法切换操作对象: 30 | raw_proxy: 存放原始的代理; 31 | useful_proxy: 存放检验后的代理; 32 | 33 | 34 | 抽象方法定义: 35 | get(proxy): 返回指定proxy的信息; 36 | put(proxy): 存入一个proxy信息; 37 | pop(): 返回并删除一个proxy信息; 38 | update(proxy): 更新指定proxy信息; 39 | delete(proxy): 删除指定proxy; 40 | exists(proxy): 判断指定proxy是否存在; 41 | getAll(): 列表形式返回所有代理; 42 | clean(): 清除所有proxy信息; 43 | getNumber(): 返回proxy数据量; 44 | changeTable(name): 切换操作对象 raw_proxy/useful_proxy 45 | 46 | 47 | 所有方法需要相应类去具体实现: 48 | ssdb: SsdbClient.py 49 | redis: RedisClient.py 50 | mongodb: MongodbClient.py 51 | 52 | """ 53 | 54 | __metaclass__ = Singleton 55 | 56 | def __init__(self): 57 | """ 58 | init 59 | :return: 60 | """ 61 | self.__initDbClient() 62 | 63 | def __initDbClient(self): 64 | """ 65 | init DB Client 66 | :return: 67 | """ 68 | __type = None 69 | if "SSDB" == config.db_type: 70 | __type = "SsdbClient" 71 | elif "REDIS" == config.db_type: 72 | __type = "RedisClient" 73 | elif "MONGODB" == config.db_type: 
74 | __type = "MongodbClient" 75 | else: 76 | pass 77 | assert __type, 'type error, Not support DB type: {}'.format(config.db_type) 78 | self.client = getattr(__import__(__type), __type)(name=config.db_name, 79 | host=config.db_host, 80 | port=config.db_port, 81 | password=config.db_password) 82 | 83 | def get(self, key, **kwargs): 84 | return self.client.get(key, **kwargs) 85 | 86 | def put(self, key, **kwargs): 87 | return self.client.put(key, **kwargs) 88 | 89 | def update(self, key, value, **kwargs): 90 | return self.client.update(key, value, **kwargs) 91 | 92 | def delete(self, key, **kwargs): 93 | return self.client.delete(key, **kwargs) 94 | 95 | def exists(self, key, **kwargs): 96 | return self.client.exists(key, **kwargs) 97 | 98 | def pop(self, **kwargs): 99 | return self.client.pop(**kwargs) 100 | 101 | def getAll(self): 102 | return self.client.getAll() 103 | 104 | def clear(self): 105 | return self.client.clear() 106 | 107 | def changeTable(self, name): 108 | self.client.changeTable(name) 109 | 110 | def getNumber(self): 111 | return self.client.getNumber() 112 | -------------------------------------------------------------------------------- /proxy_pool/Api/ProxyApi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ProxyApi.py 6 | Description : WebApi 7 | Author : JHao 8 | date: 2016/12/4 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/04: WebApi 12 | 2019/08/14: 集成Gunicorn启动方式 13 | ------------------------------------------------- 14 | """ 15 | __author__ = 'JHao' 16 | 17 | import sys 18 | import platform 19 | from werkzeug.wrappers import Response 20 | from flask import Flask, jsonify, request 21 | 22 | sys.path.append('../') 23 | 24 | from Config.ConfigGetter import config 25 | from Manager.ProxyManager import ProxyManager 26 | 27 | app = Flask(__name__) 28 | 29 | 30 | class JsonResponse(Response): 31 | @classmethod 32 | def force_type(cls, response, environ=None): 33 | if isinstance(response, (dict, list)): 34 | response = jsonify(response) 35 | 36 | return super(JsonResponse, cls).force_type(response, environ) 37 | 38 | 39 | app.response_class = JsonResponse 40 | 41 | api_list = { 42 | 'get': u'get an useful proxy', 43 | # 'refresh': u'refresh proxy pool', 44 | 'get_all': u'get all proxy from proxy pool', 45 | 'delete?proxy=127.0.0.1:8080': u'delete an unable proxy', 46 | 'get_status': u'proxy number' 47 | } 48 | 49 | 50 | @app.route('/') 51 | def index(): 52 | return api_list 53 | 54 | 55 | @app.route('/get/') 56 | def get(): 57 | proxy = ProxyManager().get() 58 | return proxy.info_json if proxy else {"code": 0, "src": "no proxy"} 59 | 60 | 61 | @app.route('/refresh/') 62 | def refresh(): 63 | # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用 64 | # ProxyManager().refresh() 65 | pass 66 | return 'success' 67 | 68 | 69 | @app.route('/get_all/') 70 | def getAll(): 71 | proxies = ProxyManager().getAll() 72 | return jsonify([_.info_dict for _ in proxies]) 73 | 74 | 75 | @app.route('/delete/', methods=['GET']) 76 | def delete(): 77 | proxy = request.args.get('proxy') 78 | ProxyManager().delete(proxy) 79 | return {"code": 0, "src": "success"} 80 | 81 | 82 | @app.route('/get_status/') 83 | def getStatus(): 84 | status = ProxyManager().getNumber() 85 | return status 86 | 87 | 88 | if platform.system() != "Windows": 89 | import gunicorn.app.base 90 | from six import iteritems 91 | 
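# NOTE: StandaloneApplication below follows Gunicorn's "custom application" pattern
# (subclassing gunicorn.app.base.BaseApplication): load_config() copies the recognised
# options passed in by runFlaskWithGunicorn() (bind, workers, accesslog, ...) into
# Gunicorn's own config, and load() returns the Flask WSGI app for the worker
# processes to serve, so the API starts without a separate gunicorn command line.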
92 | 93 | class StandaloneApplication(gunicorn.app.base.BaseApplication): 94 | 95 | def __init__(self, app, options=None): 96 | self.options = options or {} 97 | self.application = app 98 | super(StandaloneApplication, self).__init__() 99 | 100 | def load_config(self): 101 | _config = dict([(key, value) for key, value in iteritems(self.options) 102 | if key in self.cfg.settings and value is not None]) 103 | for key, value in iteritems(_config): 104 | self.cfg.set(key.lower(), value) 105 | 106 | def load(self): 107 | return self.application 108 | 109 | 110 | def runFlask(): 111 | app.run(host=config.host_ip, port=config.host_port) 112 | 113 | 114 | def runFlaskWithGunicorn(): 115 | _options = { 116 | 'bind': '%s:%s' % (config.host_ip, config.host_port), 117 | 'workers': 4, 118 | 'accesslog': '-', # log to stdout 119 | 'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"' 120 | } 121 | StandaloneApplication(app, _options).run() 122 | 123 | 124 | if __name__ == '__main__': 125 | if platform.system() == "Windows": 126 | runFlask() 127 | else: 128 | runFlaskWithGunicorn() 129 | -------------------------------------------------------------------------------- /yasee1/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import requests 4 | 5 | host = 'https://1.yasee1.com/' 6 | 7 | def getVideoId(): 8 | videoId = int(input("Input Video ID: ")) 9 | # videoUrl = host + str("video-") + str(videoId) 10 | return str(videoId) 11 | 12 | def getXHR(): 13 | videoId = getVideoId() 14 | headers = { 15 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36", 16 | "X-Requested-With": "XMLHttpRequest", 17 | "Referer": str(host) + "video-" + str(videoId) 18 | } 19 | videoUrl = str(host) + "index/req/getPlayerDomain?id=" + videoId 20 | response = requests.get(videoUrl,headers=headers) 21 | res_status = response.status_code 22 | if res_status == 200: 23 | response = response.json() 24 | # print(response) 25 | return response 26 | else: 27 | return None 28 | 29 | def parseXHR(): 30 | XHR = getXHR() 31 | code = XHR.get("code") 32 | if code == -2: 33 | data = XHR.get("info") 34 | down_url = data.get("down_url") 35 | video_hls = data.get("video_hls") 36 | data = { 37 | "down_url" : down_url, 38 | "video_hls" : video_hls 39 | } 40 | return data 41 | else: 42 | print('Error! 
\n') 43 | return None 44 | 45 | 46 | def m3u8(data): 47 | down_url = data.get("down_url") 48 | video_hls = data.get("video_hls") 49 | hlsUrl = video_hls.split('/',3) 50 | 51 | if hlsUrl[2] == '[domain_dan]': 52 | video_hls = video_hls.replace("[domain_dan]","hone.yyhdyl.com") 53 | elif hlsUrl[2] == '[domain_fourth]': 54 | video_hls = video_hls.replace("[domain_fourth]","head2.yyhdyl.com") 55 | elif hlsUrl[2] == '[domain_shuang]': 56 | video_hls = video_hls.replace("[domain_shuang]","htwo.yyhdyl.com") 57 | elif hlsUrl[2] == '[domain_three]': 58 | video_hls = video_hls.replace("[domain_three]","head.yyhdyl.com") 59 | else: 60 | video_hls = None 61 | 62 | if down_url == None: 63 | return video_hls 64 | else: 65 | quality = down_url[-9:-5] 66 | if quality == str("720p"): 67 | video_hls = video_hls.replace("hls.m3u8","hls-720p.m3u8") 68 | elif quality == str("480p"): 69 | video_hls = video_hls.replace("hls.m3u8","hls-480p.m3u8") 70 | elif quality == str("360p"): 71 | video_hls = video_hls.replace("hls.m3u8","hls-360p.m3u8") 72 | elif quality == str("240p"): 73 | video_hls = video_hls.replace("hls.m3u8","hls-240p.m3u8") 74 | else: 75 | video_hls = None 76 | return video_hls 77 | 78 | def download(url,filename): 79 | header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"} 80 | downloadPath = os.getcwd() + '\Temp' 81 | if not os.path.exists(downloadPath): 82 | os.mkdir(downloadPath) 83 | content = requests.get(url,headers=header).text 84 | num = 0 85 | tempVideo = os.path.join(downloadPath,f'{filename}.ts') 86 | fileLine = content.split('\n') 87 | for line in fileLine: 88 | if line[-4:] == ".jpg": 89 | tsUrl = url.rsplit('/',1)[0] + "/" + line 90 | # res = requests.get(tsUrl) 91 | # with open(downloadPath + "\\" + str(num) + ".ts",'wb') as f: 92 | # f.write(res.content) 93 | # f.flush() 94 | print(tsUrl) 95 | num += 1 96 | print('Download Successful!') 97 | 98 | 99 | 100 | if __name__ == '__main__': 101 | while True: 102 | XHR = parseXHR() 103 | if XHR == None: 104 | print('Error!\n') 105 | else: 106 | m3u8_url = m3u8(XHR) 107 | print(m3u8_url) 108 | -------------------------------------------------------------------------------- /tuao8/crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests 3 | import os 4 | import time 5 | import threading 6 | from bs4 import BeautifulSoup 7 | 8 | 9 | class myThred(threading.Thread): 10 | def __init__(self,url,dir,filename): 11 | threading.Thread.__init__(self) 12 | self.ThreadID = filename 13 | self.url = url 14 | self.dir = dir 15 | self.filename = filename 16 | 17 | def run(self): 18 | downloadPic(self.url,self.dir,self.filename) 19 | def getList(url): 20 | try: 21 | html = requests.get(url) 22 | soup = BeautifulSoup(html.text,'lxml') 23 | articlelist = soup.select('#container > main > article > div > a') 24 | articleurls = [articleurl.get('href') for articleurl in articlelist] 25 | return articleurls 26 | except Exception as e: 27 | print(e) 28 | return None 29 | 30 | def getTitle(url): 31 | try: 32 | html = requests.get(url) 33 | soup = BeautifulSoup(html.text,'lxml') 34 | title = soup.select('h1.title')[0].get_text() 35 | return title 36 | except Exception as e: 37 | print(e) 38 | return None 39 | 40 | def getImgurl(url): 41 | try: 42 | html = requests.get(url) 43 | soup = BeautifulSoup(html.text,'lxml') 44 | imgurl = soup.select('div.entry')[0].p.img['src'] 45 | return imgurl 46 | except 
Exception as e: 47 | print(e) 48 | return None 49 | 50 | def downloadPic(url,dir,filename): 51 | req = requests.get(url) 52 | if req.status_code == 200: 53 | with open(str(dir) + '/' + str(filename) + '.jpg', 'wb+') as f: 54 | f.write(req.content) 55 | else: 56 | print('链接错误: ' + str(req.status_code)) 57 | 58 | def getLastpage(url): 59 | html = requests.get(url) 60 | soup = BeautifulSoup(html.text,'lxml') 61 | lastnum = soup.select('#dm-fy > li > a')[-2].get_text() 62 | return int(lastnum) 63 | 64 | def getArticles(url): 65 | imgurls = [] 66 | lastpage = getLastpage(url) 67 | pageurls = [str(url) + '?page={}'.format(number) for number in range(1,lastpage)] 68 | for imgurl in pageurls: 69 | imgurls.append(imgurl) 70 | return imgurls 71 | 72 | def startUrl(url): 73 | category = int(input('请输入分类ID: ')) 74 | categoryLast = int(input('请输入分类对应的最后页码: ')) 75 | categoryUrl = [str(url) + 'category-' + str(category) + '_{}.html'.format(num) for num in range(1,int(categoryLast) + 1)] 76 | return categoryUrl 77 | 78 | def main(url): 79 | imglinks = [] 80 | title = getTitle(url) 81 | articles = getArticles(url) 82 | filename = 1 83 | for imgurl in articles: 84 | imglink = getImgurl(imgurl) 85 | imglinks.append(imglink) 86 | print('获取下载链接ing.......' + str(imglink)) 87 | print('共计取得: ' +str(len(imglinks)) + '张图片链接') 88 | if os.path.exists(title) == False: 89 | os.mkdir(title) 90 | threads = [] 91 | for img in imglinks: 92 | thread = myThred(img, title, filename) 93 | thread.start() 94 | threads.append(thread) 95 | # downloadPic(imglink, title, filename) 96 | print('下载完成....' + str(filename)) 97 | filename += 1 98 | for t in threads: 99 | t.join() 100 | else: 101 | print('文件已存在,跳过下载.....' + str(filename)) 102 | 103 | url = 'https://www.tuao8.com/' 104 | 105 | if __name__ == '__main__': 106 | try: 107 | starturls = startUrl(url) 108 | for starturl in starturls: 109 | articleurls = getList(starturl) 110 | for articleurl in articleurls: 111 | print(articleurl) 112 | main(articleurl) 113 | time.sleep(3) 114 | print('图集下载完成,休眠 3S......') 115 | print('当前分类爬取完成.....') 116 | except Exception as e: 117 | print(e) -------------------------------------------------------------------------------- /proxy_pool/Manager/ProxyManager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ProxyManager.py 6 | Description : 7 | Author : JHao 8 | date: 2016/12/3 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/3: 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | import random 17 | 18 | from ProxyHelper import Proxy 19 | from DB.DbClient import DbClient 20 | from Config.ConfigGetter import config 21 | from Util.LogHandler import LogHandler 22 | from Util.utilFunction import verifyProxyFormat 23 | from ProxyGetter.getFreeProxy import GetFreeProxy 24 | 25 | 26 | class ProxyManager(object): 27 | """ 28 | ProxyManager 29 | """ 30 | 31 | def __init__(self): 32 | self.db = DbClient() 33 | self.raw_proxy_queue = 'raw_proxy' 34 | self.log = LogHandler('proxy_manager') 35 | self.useful_proxy_queue = 'useful_proxy' 36 | 37 | def fetch(self): 38 | """ 39 | fetch proxy into db by ProxyGetter 40 | :return: 41 | """ 42 | self.db.changeTable(self.raw_proxy_queue) 43 | proxy_set = set() 44 | self.log.info("ProxyFetch : start") 45 | for proxyGetter in config.proxy_getter_functions: 46 | 
self.log.info("ProxyFetch - {func}: start".format(func=proxyGetter)) 47 | try: 48 | for proxy in getattr(GetFreeProxy, proxyGetter.strip())(): 49 | proxy = proxy.strip() 50 | 51 | if not proxy or not verifyProxyFormat(proxy): 52 | self.log.error('ProxyFetch - {func}: ' 53 | '{proxy} illegal'.format(func=proxyGetter, proxy=proxy.ljust(20))) 54 | continue 55 | elif proxy in proxy_set: 56 | self.log.info('ProxyFetch - {func}: ' 57 | '{proxy} exist'.format(func=proxyGetter, proxy=proxy.ljust(20))) 58 | continue 59 | else: 60 | self.log.info('ProxyFetch - {func}: ' 61 | '{proxy} success'.format(func=proxyGetter, proxy=proxy.ljust(20))) 62 | self.db.put(Proxy(proxy, source=proxyGetter)) 63 | proxy_set.add(proxy) 64 | except Exception as e: 65 | self.log.error("ProxyFetch - {func}: error".format(func=proxyGetter)) 66 | self.log.error(str(e)) 67 | 68 | def get(self): 69 | """ 70 | return a useful proxy 71 | :return: 72 | """ 73 | self.db.changeTable(self.useful_proxy_queue) 74 | item_list = self.db.getAll() 75 | if item_list: 76 | random_choice = random.choice(item_list) 77 | return Proxy.newProxyFromJson(random_choice) 78 | return None 79 | 80 | def delete(self, proxy_str): 81 | """ 82 | delete proxy from pool 83 | :param proxy_str: 84 | :return: 85 | """ 86 | self.db.changeTable(self.useful_proxy_queue) 87 | self.db.delete(proxy_str) 88 | 89 | def getAll(self): 90 | """ 91 | get all proxy from pool as list 92 | :return: 93 | """ 94 | self.db.changeTable(self.useful_proxy_queue) 95 | item_list = self.db.getAll() 96 | return [Proxy.newProxyFromJson(_) for _ in item_list] 97 | 98 | def getNumber(self): 99 | self.db.changeTable(self.raw_proxy_queue) 100 | total_raw_proxy = self.db.getNumber() 101 | self.db.changeTable(self.useful_proxy_queue) 102 | total_useful_queue = self.db.getNumber() 103 | return {'raw_proxy': total_raw_proxy, 'useful_proxy': total_useful_queue} 104 | 105 | 106 | if __name__ == '__main__': 107 | pp = ProxyManager() 108 | pp.fetch() 109 | -------------------------------------------------------------------------------- /proxy_pool/ProxyHelper/Proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: Proxy 5 | Description : 代理对象类型封装 6 | Author : JHao 7 | date: 2019/7/11 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/7/11: 代理对象类型封装 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import json 16 | 17 | 18 | class Proxy(object): 19 | 20 | def __init__(self, proxy, fail_count=0, region="", proxy_type="", 21 | source="", check_count=0, last_status="", last_time=""): 22 | self._proxy = proxy 23 | self._fail_count = fail_count 24 | self._region = region 25 | self._type = proxy_type 26 | self._source = source 27 | self._check_count = check_count 28 | self._last_status = last_status 29 | self._last_time = last_time 30 | 31 | @classmethod 32 | def newProxyFromJson(cls, proxy_json): 33 | """ 34 | 根据proxy属性json创建Proxy实例 35 | :param proxy_json: 36 | :return: 37 | """ 38 | proxy_dict = json.loads(proxy_json) 39 | return cls(proxy=proxy_dict.get("proxy", ""), 40 | fail_count=proxy_dict.get("fail_count", 0), 41 | region=proxy_dict.get("region", ""), 42 | proxy_type=proxy_dict.get("type", ""), 43 | source=proxy_dict.get("source", ""), 44 | check_count=proxy_dict.get("check_count", 0), 45 | last_status=proxy_dict.get("last_status", ""), 46 | 
last_time=proxy_dict.get("last_time", "") 47 | ) 48 | 49 | @property 50 | def proxy(self): 51 | """ 代理 ip:port """ 52 | return self._proxy 53 | 54 | @property 55 | def fail_count(self): 56 | """ 检测失败次数 """ 57 | return self._fail_count 58 | 59 | @property 60 | def region(self): 61 | """ 地理位置(国家/城市) """ 62 | return self._region 63 | 64 | @property 65 | def type(self): 66 | """ 透明/匿名/高匿 """ 67 | return self._type 68 | 69 | @property 70 | def source(self): 71 | """ 代理来源 """ 72 | return self._source 73 | 74 | @property 75 | def check_count(self): 76 | """ 代理检测次数 """ 77 | return self._check_count 78 | 79 | @property 80 | def last_status(self): 81 | """ 最后一次检测结果 1 -> 可用; 0 -> 不可用""" 82 | return self._last_status 83 | 84 | @property 85 | def last_time(self): 86 | """ 最后一次检测时间 """ 87 | return self._last_time 88 | 89 | @property 90 | def info_dict(self): 91 | """ 属性字典 """ 92 | return {"proxy": self._proxy, 93 | "fail_count": self._fail_count, 94 | "region": self._region, 95 | "type": self._type, 96 | "source": self._source, 97 | "check_count": self.check_count, 98 | "last_status": self.last_status, 99 | "last_time": self.last_time} 100 | 101 | @property 102 | def info_json(self): 103 | """ 属性json格式 """ 104 | return json.dumps(self.info_dict, ensure_ascii=False) 105 | 106 | # --- proxy method --- 107 | @fail_count.setter 108 | def fail_count(self, value): 109 | self._fail_count = value 110 | 111 | @region.setter 112 | def region(self, value): 113 | self._region = value 114 | 115 | @type.setter 116 | def type(self, value): 117 | self._type = value 118 | 119 | @source.setter 120 | def source(self, value): 121 | self._source = value 122 | 123 | @check_count.setter 124 | def check_count(self, value): 125 | self._check_count = value 126 | 127 | @last_status.setter 128 | def last_status(self, value): 129 | self._last_status = value 130 | 131 | @last_time.setter 132 | def last_time(self, value): 133 | self._last_time = value 134 | -------------------------------------------------------------------------------- /proxy_pool/DB/RedisClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: RedisClient 5 | Description : 封装Redis相关操作 6 | Author : JHao 7 | date: 2019/8/9 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/9: 封装Redis相关操作 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from Config.setting import PY3 16 | 17 | from redis.connection import BlockingConnectionPool 18 | from redis import Redis 19 | 20 | 21 | class RedisClient(object): 22 | """ 23 | Redis client 和SSDB协议一致 数据结构一致, 但部分方法不通用 24 | 25 | Redis中代理存放的结构为hash: 26 | 原始代理存放在name为raw_proxy的hash中, key为代理的ip:por, value为代理属性的字典; 27 | 验证后的代理存放在name为useful_proxy的hash中, key为代理的ip:port, value为代理属性的字典; 28 | 29 | """ 30 | 31 | def __init__(self, name, **kwargs): 32 | """ 33 | init 34 | :param name: hash name 35 | :param host: host 36 | :param port: port 37 | :param password: password 38 | :return: 39 | """ 40 | self.name = name 41 | self.__conn = Redis(connection_pool=BlockingConnectionPool(**kwargs)) 42 | 43 | def get(self, proxy_str): 44 | """ 45 | 从hash中获取对应的proxy, 使用前需要调用changeTable() 46 | :param proxy_str: proxy str 47 | :return: 48 | """ 49 | data = self.__conn.hget(name=self.name, key=proxy_str) 50 | if data: 51 | return data.decode('utf-8') if PY3 else data 52 | else: 53 | return None 54 | 55 | def put(self, proxy_obj): 56 | """ 
57 | 将代理放入hash, 使用changeTable指定hash name 58 | :param proxy_obj: Proxy obj 59 | :return: 60 | """ 61 | data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.info_json) 62 | return data 63 | 64 | def delete(self, proxy_str): 65 | """ 66 | 移除指定代理, 使用changeTable指定hash name 67 | :param proxy_str: proxy str 68 | :return: 69 | """ 70 | self.__conn.hdel(self.name, proxy_str) 71 | 72 | def exists(self, proxy_str): 73 | """ 74 | 判断指定代理是否存在, 使用changeTable指定hash name 75 | :param proxy_str: proxy str 76 | :return: 77 | """ 78 | return self.__conn.hexists(self.name, proxy_str) 79 | 80 | def update(self, proxy_obj): 81 | """ 82 | 更新 proxy 属性 83 | :param proxy_obj: 84 | :return: 85 | """ 86 | self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.info_json) 87 | 88 | def pop(self): 89 | """ 90 | 弹出一个代理 91 | :return: dict {proxy: value} 92 | """ 93 | # proxies = self.__conn.hkeys(self.name) 94 | # if proxies: 95 | # proxy = random.choice(proxies) 96 | # value = self.__conn.hget(self.name, proxy) 97 | # self.delete(proxy) 98 | # return {'proxy': proxy.decode('utf-8') if PY3 else proxy, 99 | # 'value': value.decode('utf-8') if PY3 and value else value} 100 | return None 101 | 102 | def getAll(self): 103 | """ 104 | 列表形式返回所有代理, 使用changeTable指定hash name 105 | :return: 106 | """ 107 | item_dict = self.__conn.hgetall(self.name) 108 | if PY3: 109 | return [value.decode('utf8') for key, value in item_dict.items()] 110 | else: 111 | return item_dict.values() 112 | 113 | def clear(self): 114 | """ 115 | 清空所有代理, 使用changeTable指定hash name 116 | :return: 117 | """ 118 | return self.__conn.delete(self.name) 119 | 120 | def getNumber(self): 121 | """ 122 | 返回代理数量 123 | :return: 124 | """ 125 | return self.__conn.hlen(self.name) 126 | 127 | def changeTable(self, name): 128 | """ 129 | 切换操作对象 130 | :param name: raw_proxy/useful_proxy 131 | :return: 132 | """ 133 | self.name = name 134 | -------------------------------------------------------------------------------- /proxy_pool/DB/SsdbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: SsdbClient.py 6 | Description : 封装SSDB操作 7 | Author : JHao 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/2: 12 | 2017/09/22: PY3中 redis-py返回的数据是bytes型 13 | 2017/09/27: 修改pop()方法 返回{proxy:value}字典 14 | ------------------------------------------------- 15 | """ 16 | __author__ = 'JHao' 17 | 18 | from Config.setting import PY3 19 | 20 | from redis.connection import BlockingConnectionPool 21 | from redis import Redis 22 | 23 | 24 | class SsdbClient(object): 25 | """ 26 | SSDB client 27 | 28 | SSDB中代理存放的结构为hash: 29 | 原始代理存放在name为raw_proxy的hash中, key为代理的ip:por, value为代理属性的字典; 30 | 验证后的代理存放在name为useful_proxy的hash中, key为代理的ip:port, value为代理属性的字典; 31 | 32 | """ 33 | def __init__(self, name, **kwargs): 34 | """ 35 | init 36 | :param name: hash name 37 | :param host: host 38 | :param port: port 39 | :param password: password 40 | :return: 41 | """ 42 | self.name = name 43 | self.__conn = Redis(connection_pool=BlockingConnectionPool(**kwargs)) 44 | 45 | def get(self, proxy_str): 46 | """ 47 | 从hash中获取对应的proxy, 使用前需要调用changeTable() 48 | :param proxy_str: proxy str 49 | :return: 50 | """ 51 | data = self.__conn.hget(name=self.name, key=proxy_str) 52 | if data: 53 | return data.decode('utf-8') if PY3 else data 54 | else: 55 | return None 56 | 57 | def put(self, 
proxy_obj): 58 | """ 59 | 将代理放入hash, 使用changeTable指定hash name 60 | :param proxy_obj: Proxy obj 61 | :return: 62 | """ 63 | data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.info_json) 64 | return data 65 | 66 | def delete(self, proxy_str): 67 | """ 68 | 移除指定代理, 使用changeTable指定hash name 69 | :param proxy_str: proxy str 70 | :return: 71 | """ 72 | self.__conn.hdel(self.name, proxy_str) 73 | 74 | def exists(self, proxy_str): 75 | """ 76 | 判断指定代理是否存在, 使用changeTable指定hash name 77 | :param proxy_str: proxy str 78 | :return: 79 | """ 80 | return self.__conn.hexists(self.name, proxy_str) 81 | 82 | def update(self, proxy_obj): 83 | """ 84 | 更新 proxy 属性 85 | :param proxy_obj: 86 | :return: 87 | """ 88 | self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.info_json) 89 | 90 | def pop(self): 91 | """ 92 | 弹出一个代理 93 | :return: dict {proxy: value} 94 | """ 95 | # proxies = self.__conn.hkeys(self.name) 96 | # if proxies: 97 | # proxy = random.choice(proxies) 98 | # value = self.__conn.hget(self.name, proxy) 99 | # self.delete(proxy) 100 | # return {'proxy': proxy.decode('utf-8') if PY3 else proxy, 101 | # 'value': value.decode('utf-8') if PY3 and value else value} 102 | return None 103 | 104 | def getAll(self): 105 | """ 106 | 列表形式返回所有代理, 使用changeTable指定hash name 107 | :return: 108 | """ 109 | item_dict = self.__conn.hgetall(self.name) 110 | if PY3: 111 | return [value.decode('utf8') for key, value in item_dict.items()] 112 | else: 113 | return item_dict.values() 114 | 115 | def clear(self): 116 | """ 117 | 清空所有代理, 使用changeTable指定hash name 118 | :return: 119 | """ 120 | return self.__conn.execute_command("hclear", self.name) 121 | 122 | def getNumber(self): 123 | """ 124 | 返回代理数量 125 | :return: 126 | """ 127 | return self.__conn.hlen(self.name) 128 | 129 | def changeTable(self, name): 130 | """ 131 | 切换操作对象 132 | :param name: raw_proxy/useful_proxy 133 | :return: 134 | """ 135 | self.name = name 136 | -------------------------------------------------------------------------------- /1024/new1024spider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import threading 4 | import random, time 5 | from bs4 import BeautifulSoup 6 | 7 | host = 'https://hh.flexui.win/' 8 | 9 | headers={ 10 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36', 11 | 'Referer':host 12 | } 13 | 14 | class myThred(threading.Thread): 15 | def __init__(self,url,dir,filename): 16 | threading.Thread.__init__(self) 17 | self.ThreadID = filename 18 | self.url = url 19 | self.dir = dir 20 | self.filename = filename 21 | 22 | def run(self): 23 | download_pic(self.url,self.dir,self.filename) 24 | 25 | def download_pic(url,dir,filename): 26 | try: 27 | req = requests.get(url, headers=headers) 28 | if req.status_code == 200: 29 | with open('pic' + '/' + str(dir) + '/' + str(filename), 'wb+') as f: 30 | f.write(req.content) 31 | # print('下载完成.......' + str(filename)) 32 | else: 33 | print("发生错误,跳过下载....." 
+ str(req.status_code)) 34 | except TimeoutError as e: 35 | print("链接超时: " + str(e)) 36 | 37 | def open_url(url): 38 | try: 39 | req = requests.get(url,headers=headers) 40 | req.encoding = req.apparent_encoding 41 | return req 42 | except (TimeoutError,ConnectionError,requests.exceptions.ConnectionError) as e: 43 | print('链接超时' + str(e)) 44 | 45 | def get_page(url): 46 | url_list = [] 47 | html = open_url(url) 48 | soup = BeautifulSoup(html.text,'lxml') 49 | article_url = soup.select('tbody > tr > td.tal > h3 > a') 50 | for url in article_url: 51 | url = str(host) + url.get('href') 52 | url_list.append(url) 53 | return url_list 54 | 55 | def get_article(url): 56 | img_all =[] 57 | html = open_url(url) 58 | soup = BeautifulSoup(html.text,'lxml') 59 | title = soup.select('td > h4')[0] 60 | title = title.get_text() 61 | img_urls = soup.select("input[type='image']") 62 | for img_url in img_urls: 63 | img_url = img_url.get('data-src') 64 | img_all.append(img_url) 65 | img_sum = len(img_all) 66 | print('当前帖子:\n' + str(title) + '\n共计取到 ' + str(img_sum) + ' 张图片连接......') 67 | if os.path.exists(title) == False: 68 | os.makedirs('pic' + '/' + str(title)) 69 | threads = [] 70 | for imgurl in img_all: 71 | imgname = imgurl.split('/')[-1] 72 | thread = myThred(imgurl,title,imgname) 73 | thread.start() 74 | threads.append(thread) 75 | for t in threads: 76 | t.join() 77 | timer = random.randint(2,5) 78 | print('下载完成............\n' + '休眠 ' + str(timer) + ' 秒......') 79 | time.sleep(timer) 80 | else: 81 | print("文件夹已存在,跳过下载。") 82 | 83 | if __name__ == '__main__': 84 | offset = 1 85 | while offset <= 2: 86 | page_url = 'https://hh.flexui.win/thread0806.php?fid=16&search=&page=' + str(offset) 87 | try: 88 | pagelist = get_page(page_url) 89 | for url in pagelist: 90 | if url == 'https://hh.flexui.win/read.php?tid=5877': 91 | print("pass") 92 | elif url == 'https://hh.flexui.win/htm_data/16/1106/524942.html': 93 | print('pass') 94 | elif url == 'https://hh.flexui.win/htm_data/16/1808/344501.html': 95 | print('pass') 96 | elif url == 'https://hh.flexui.win/htm_data/16/1110/622028.html': 97 | print('pass') 98 | elif url == 'https://hh.flexui.win/htm_data/16/1706/2424348.html': 99 | print('pass') 100 | elif url == 'https://hh.flexui.win/htm_data/16/1707/2519480.html': 101 | print('pass') 102 | elif url == 'https://hh.flexui.win/htm_data/16/0805/136474.html': 103 | print('pass') 104 | elif url == 'https://hh.flexui.win/htm_data/16/1109/594741.html': 105 | print('pass') 106 | elif url == 'https://hh.flexui.win/htm_data/16/1812/3351645.html': 107 | print('pass') 108 | else: 109 | get_article(url) 110 | except Exception as e: 111 | print('发生错误....跳过下载......' + str(e)) 112 | offset += 1 113 | -------------------------------------------------------------------------------- /proxy_pool/doc/introduce.md: -------------------------------------------------------------------------------- 1 | 2 | ## 代理池介绍 3 | 4 | 本项目通过爬虫方式持续抓取代理网站公布的免费代理IP,实时校验,维护部分可以使用的代理,并通过api的形式提供外部使用。 5 | 6 | ### 1、问题 7 | 8 | 构建一个代理IP池,可能有下面这些问题: 9 | 10 | * 代理IP从何而来? 11 | 12 |   许多刚接触爬虫的,都试过去西刺、快代理之类有免费代理的网站去抓些免费代理,还是有一些代理能用。 13 | 当然,如果你有更好的代理接口也可以自己接入。 14 | 15 |   免费代理的采集也很简单,无非就是:`访问页面`` —> `正则/xpath提取` —> `保存` 16 | 17 | * 如何保证代理质量? 18 | 19 |   可以肯定免费的代理IP大部分都是不能用的,不然别人还提供付费接口干嘛(不过事实上很多代理商的付费IP也不稳定,也有很多是不能用)。 20 | 所以采集回来的代理IP不能直接使用,检测的办法也很简单:可以写个程序不断的用代理访问一个稳定的网站,看是否可以正常访问即可。 21 | 这个过程可以使用多线/进程或异步的方式,因为检测代理是个很慢的过程。 22 | 23 | * 采集回来的代理如何存储? 
24 | 25 |   这里不得不推荐一个国人开发的高性能支持多种数据结构的NoSQL数据库[SSDB](http://ssdb.io/docs/zh_cn/),用于替代Redis。支持队列、hash、set、k-v对,支持T级别数据。是做分布式爬虫很好中间存储工具。 26 | 27 | * 如何让爬虫更方便的用到这些代理? 28 | 29 |   答案肯定是做成服务咯,Python有这么多的web框架,随便拿一个来写个api供爬虫调用。这样代理和爬虫架构分离有很多好处, 30 | 比如:当爬虫完全不用考虑如何校验代理,如何保证拿到的代理可用,这些都由代理池来完成。这样只需要安静的码爬虫代码就行啦。 31 | 32 | ### 2、代理池设计 33 | 34 |   代理池由四部分组成: 35 | 36 | * ProxyGetter: 37 | 38 |   代理获取接口,目前有5个免费代理源,每调用一次就会抓取这个5个网站的最新代理放入DB,支持自定义扩展额外的代理获取接口; 39 | 40 | * DB: 41 | 42 |   用于存放代理IP,目前支持SSDB和Redis(推荐SSDB)。至于为什么选择SSDB,大家可以参考这篇[文章](https://www.sdk.cn/news/2684),个人觉得SSDB是个不错的Redis替代方案,如果你没有用过SSDB,安装起来也很简单,可以参考[这里](https://github.com/jhao104/memory-notes/blob/master/SSDB/SSDB%E5%AE%89%E8%A3%85%E9%85%8D%E7%BD%AE%E8%AE%B0%E5%BD%95.md); 43 | 44 | * Schedule: 45 | 46 |   计划任务,定时去检测DB中的代理可用性,删除不可用的代理。同时也会主动通过ProxyGetter去获取最新代理放入DB; 47 | 48 | * ProxyApi: 49 | 50 |   代理池的外部接口,由[Flask](http://flask.pocoo.org/)实现,功能是给爬虫提供与代理池交互的接口。 51 | 52 | 53 | ![设计](https://pic2.zhimg.com/v2-f2756da2986aa8a8cab1f9562a115b55_b.png) 54 | 55 | ### 3、代码模块 56 | 57 |   Python中高层次的数据结构,动态类型和动态绑定,使得它非常适合于快速应用开发,也适合于作为胶水语言连接已有的软件部件。用Python来搞这个代理IP池也很简单,代码分为6个模块: 58 | 59 | * Api: 60 | 61 |   api接口相关代码,目前api是由Flask实现,代码也非常简单。客户端请求传给Flask,Flask调用`ProxyManager`中的实现,包括`get/delete/refresh/get_all`; 62 | 63 | * DB: 64 | 65 |   数据库相关代码,目前数据库是支持SSDB/Redis。代码用工厂模式实现,方便日后扩展其他类型数据库; 66 | 67 | * Manager: 68 | 69 |   `get/delete/refresh/get_all`等接口的具体实现类,目前代理池只负责管理proxy,日后可能会有更多功能,比如代理和爬虫的绑定,代理和账号的绑定等等; 70 | 71 | * ProxyGetter: 72 | 73 |   代理获取的相关代码,目前抓取了[快代理](http://www.kuaidaili.com)、[代理66](http://www.66ip.cn/)、[有代理](http://www.youdaili.net/Daili/http/)、[西刺代理](http://api.xicidaili.com/free2016.txt)、[guobanjia](http://www.goubanjia.com/free/gngn/index.shtml)这个五个网站的免费代理,经测试这个5个网站每天更新的可用代理只有六七十个,当然也支持自己扩展代理接口; 74 | 75 | * Schedule: 76 | 77 |   定时任务相关代码,现在只是实现定时去刷新代理,并验证可用代理,采用多进程方式; 78 | 79 | * Util: 80 | 81 |   存放一些公共的模块方法或函数,包含`GetConfig`:读取配置文件config.ini的类,`ConfigParse`: 扩展ConfigParser的类,使其对大小写敏感, `Singleton`:实现单例,`LazyProperty`:实现类属性惰性计算。等等; 82 | 83 | * 其他文件: 84 | 85 |   配置文件:`Config.ini``,数据库配置和代理获取接口配置,可以在GetFreeProxy中添加新的代理获取方法,并在Config.ini中注册即可使用; 86 | 87 | ### 4、安装 88 | 89 | 下载代码: 90 | ``` 91 | git clone git@github.com:jhao104/proxy_pool.git 92 | 93 | 或者直接到https://github.com/jhao104/proxy_pool 下载zip文件 94 | ``` 95 | 96 | 安装依赖: 97 | ``` 98 | pip install -r requirements.txt 99 | ``` 100 | 101 | 启动: 102 | 103 | ``` 104 | 如果你的依赖已经安全完成并且具备运行条件,可以直接在Run下运行main.py 105 | 到Run目录下: 106 | >>>python main.py 107 | 108 | 如果运行成功你应该可以看到有4个main.py进程在 109 | 110 | 111 | 你也可以分别运行他们,依次到Api下启动ProxyApi.py,Schedule下启动ProxyRefreshSchedule.py和ProxyValidSchedule.py即可 112 | ``` 113 | 114 | docker: 115 | ``` 116 | git clone git@github.com:jhao104/proxy_pool.git 117 | 118 | cd proxy_pool 119 | 120 | docker build -t proxy:latest -f Dockerfile . 
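# 说明: the API inside the container listens on port 5010 by default; DB settings can be
# overridden at run time via the db_type / db_host / db_port / db_password environment
# variables read in Config/setting.py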
121 | 122 | docker run -p 5010:5010 -d proxy:latest 123 | 124 | # Wait a few minutes 125 | curl localhost:5010/get/ 126 | # result: xxx.xxx.xxx.xxx:xxxx 127 | 128 | curl localhost:5010/get_all/ 129 | ``` 130 | 131 | ### 5、使用 132 |   定时任务启动后,会通过GetFreeProxy中的方法抓取代理存入数据库并验证。此后默认每10分钟会重复执行一次。定时任务启动大概一两分钟后,便可在[SSDB](https://github.com/jhao104/SSDBAdmin)中看到刷新出来的可用的代理: 133 | 134 | ![useful_proxy](https://pic2.zhimg.com/v2-12f9b7eb72f60663212f317535a113d1_b.png) 135 | 136 |   启动ProxyApi.py后即可在浏览器中使用接口获取代理,一下是浏览器中的截图: 137 | 138 |   index页面: 139 | 140 | ![index](https://pic3.zhimg.com/v2-a867aa3db1d413fea8aeeb4c693f004a_b.png) 141 | 142 |   get: 143 | 144 | ![get](https://pic1.zhimg.com/v2-f54b876b428893235533de20f2edbfe0_b.png) 145 | 146 |   get_all: 147 | 148 | ![get_all](https://pic3.zhimg.com/v2-5c79f8c07e04f9ef655b9bea406d0306_b.png) 149 | 150 | 151 |   爬虫中使用,如果要在爬虫代码中使用的话, 可以将此api封装成函数直接使用,例如: 152 | ``` 153 | import requests 154 | 155 | def get_proxy(): 156 | return requests.get("http://127.0.0.1:5010/get/").content 157 | 158 | def delete_proxy(proxy): 159 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 160 | 161 | # your spider code 162 | 163 | def spider(): 164 | # .... 165 | requests.get('https://www.example.com', proxies={"http": "http://{}".format(get_proxy())}) 166 | # .... 167 | 168 | ``` 169 | 170 |   测试地址:http://123.207.35.36:5010 单机勿压测。谢谢 171 | 172 | ### 6、最后 173 |   时间仓促,功能和代码都比较简陋,以后有时间再改进。喜欢的在github上给个star。感谢! 174 | -------------------------------------------------------------------------------- /kuaishou/lib/crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import json 3 | import os 4 | import re 5 | import requests 6 | from random import randint 7 | from time import sleep 8 | from bs4 import BeautifulSoup 9 | 10 | requests.packages.urllib3.disable_warnings() 11 | 12 | 13 | 14 | def get_proxy(): 15 | return requests.get("http://127.0.0.1:9910/get/").json() 16 | 17 | def delete_proxy(proxy): 18 | requests.get("http://127.0.0.1:9910/delete/?proxy={}".format(proxy)) 19 | 20 | 21 | class Kuaishou(): 22 | 23 | 24 | __headersWeb = { 25 | 'accept': '*/*', 26 | 'Accept-Encoding': 'gzip, deflate, br', 27 | 'Accept-Language': 'zh-CN,zh;q=0.9', 28 | 'Connection': 'keep-alive', 29 | 'Content-Type': 'application/json', 30 | 'Host': 'live.kuaishou.com', 31 | 'Origin': 'https://live.kuaishou.com', 32 | 'Sec-Fetch-Mode': 'cors', 33 | 'Sec-Fetch-Site': 'same-origin', 34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36', 35 | #填上你的cookie 36 | 'Cookie': '' 37 | } 38 | 39 | __PROFILE_URL = "https://live.kuaishou.com/profile/" 40 | __DATA_URL = "https://live.kuaishou.com/m_graphql" 41 | __WORK_URL = "https://v.kuaishou.com/fw/photo/" 42 | 43 | __DATA_PATH = './data/' 44 | 45 | def __headersMobile(self): 46 | num = randint(1, 300) 47 | with open('./config/ua_mobile.txt', 'r') as f: 48 | ua = f.readlines()[num].replace('\n', '') 49 | headers_mobile = { 50 | 'Host': 'v.kuaishou.com', 51 | 'User-Agent':ua, 52 | # 'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1', 53 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 54 | 'Accept-Language': 'zh-CN,zh;q=0.9', 55 | 'Accept-Encoding': 'gzip, deflate, br', 56 | 'Connection': 'keep-alive', 57 | #填上你的cookie 58 | 'Cookie': '', 59 | 
'Upgrade-Insecure-Requests': '1', 60 | } 61 | return headers_mobile 62 | 63 | def __parseVideo(self,videoID): 64 | 65 | proxy = get_proxy().get('proxy') 66 | url = self.__WORK_URL + videoID 67 | print('Current Task: %s' %url) 68 | try: 69 | req = requests.get(url, headers=self.__headersMobile(),proxies={"http": "http://{}".format(proxy)},timeout=(3,7)) 70 | req.raise_for_status() 71 | req.close() 72 | 73 | soup = BeautifulSoup(req.text,'lxml') 74 | noWaterMarkVideo = soup.find(attrs={'id': 'hide-pagedata'}).attrs['data-pagedata'] 75 | pattern = re.compile('\"srcNoMark\":"(.*?)"},', re.S) 76 | real_url = re.findall(pattern, noWaterMarkVideo)[0] 77 | print(real_url) 78 | 79 | if not os.path.exists(self.__DATA_PATH): 80 | os.makedirs(self.__DATA_PATH) 81 | 82 | 83 | with open(self.__DATA_PATH + 'data.txt','a+',encoding='utf-8') as f: 84 | f.write(real_url + '\n') 85 | f.close() 86 | # sleep(5) 87 | except Exception as e: 88 | num = 5 89 | while num < 1: 90 | delete_proxy(proxy) 91 | print('error: %s' %e) 92 | self.__parseVideo(videoID) 93 | sleep(3) 94 | num -= 1 95 | 96 | def setUid(self,uid): 97 | self.uid = uid 98 | self.user() 99 | 100 | def user(self): 101 | 102 | payload1 = {'operationName': "privateFeedsQuery", 103 | 'query': "query privateFeedsQuery($principalId: String, $pcursor: String, $count: Int) {\n privateFeeds(principalId: $principalId, pcursor: $pcursor, count: $count) {\n pcursor\n list {\n id\n thumbnailUrl\n poster\n workType\n type\n useVideoPlayer\n imgUrls\n imgSizes\n magicFace\n musicName\n caption\n location\n liked\n onlyFollowerCanComment\n relativeHeight\n timestamp\n width\n height\n counts {\n displayView\n displayLike\n displayComment\n __typename\n }\n user {\n id\n eid\n name\n avatar\n __typename\n }\n expTag\n __typename\n }\n __typename\n }\n }\n", 104 | 'variables': {'principalId': str(self.uid), 'pcursor': "", 'count': 512}} 105 | 106 | res = requests.post(self.__DATA_URL, headers=self.__headersWeb, json=payload1) 107 | 108 | # print(res.content) 109 | works = json.loads(res.content.decode(encoding='utf-8'))['data']['privateFeeds']['list'] 110 | 111 | # with open("./" + uid + "2.json", "w") as fp: 112 | # fp.write(json.dumps(works, indent=2)) 113 | 114 | if works != []: 115 | if works[0]['id'] is None: 116 | works.pop(0) 117 | 118 | 119 | print('Video Count:%s ' %len(works)) 120 | print(works) 121 | for work in works: 122 | type = work['workType'] 123 | if type == 'video': 124 | work_id = work['id'] 125 | sleep(3) 126 | self.__parseVideo(work_id) 127 | 128 | print('Parse Successful ^-^ \n') 129 | 130 | else: 131 | print(works) 132 | sleep(3) 133 | self.user() 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /91user/user.py: -------------------------------------------------------------------------------- 1 | import requests, re 2 | import random 3 | import urllib.parse as up 4 | from time import sleep 5 | from bs4 import BeautifulSoup 6 | from fake_useragent import UserAgent 7 | import sqlite3 8 | 9 | ua = UserAgent() 10 | proxies = { 11 | 'http': 'http://127.0.0.1:1080', 12 | 'https': 'http://127.0.0.1:1080' 13 | } 14 | 15 | def random_headers(): 16 | ip = str(random.choice(list(range(255)))) + '.' + str(random.choice(list(range(255)))) + '.' + str( 17 | random.choice(list(range(255)))) + '.' 
--------------------------------------------------------------------------------
/91user/user.py:
--------------------------------------------------------------------------------
1 | import requests, re
2 | import random
3 | import urllib.parse as up
4 | from time import sleep
5 | from bs4 import BeautifulSoup
6 | from fake_useragent import UserAgent
7 | import sqlite3
8 | 
9 | ua = UserAgent()
10 | proxies = {
11 |     'http': 'http://127.0.0.1:1080',
12 |     'https': 'http://127.0.0.1:1080'
13 | }
14 | 
15 | def random_headers():
16 |     # fabricate a random IPv4-looking address for the spoofing headers below
17 |     ip = '.'.join(str(random.choice(range(255))) for _ in range(4))
18 | 
19 |     headers = {
20 |         'X-Client-IP': ip,
21 |         'X-Remote-IP': ip,
22 |         'X-Remote-Addr': ip,
23 |         'X-Originating-IP': ip,
24 |         'x-forwarded-for': ip,
25 |         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
26 |         'Accept-Encoding': 'gzip, deflate',
27 |         'Accept-Language': 'zh-CN,zh;q=0.9',
28 |         'Cache-Control': 'max-age=0',
29 |         # 'Cookie': '__cfduid=dda7b976a0a240beb0968fd6673951c471618894642; CLIPSHARE=jkaaau6k1p151iqhto35fsgrnl; mode=d',
30 |         'Host': '91porn.com',
31 |         'Referer': 'http://91porn.com/', # fe7dCN6lNv5VirM8tSKWVndvRtHMSVyeHRRNQDEbKvUfjKzE
32 |         'User-Agent': ua.random
33 |     }
34 |     return headers
35 | 
36 | def get_page(url):
37 |     page = requests.get(url, headers=random_headers(), proxies=proxies)
38 |     page.encoding = page.apparent_encoding
39 |     html = BeautifulSoup(page.text, 'lxml')
40 |     if html is not None:
41 |         return html
42 |     else:
43 |         return None
44 | 
45 | 
46 | class User:
47 | 
48 |     def __init__(self, uid):
49 |         self.uid = uid
50 |         self.start_url = 'http://91porn.com/uvideos.php?UID={}'.format(self.uid)
51 |         self.public_datas = self.public_data()
52 | 
53 |     def public_data(self):
54 |         page = get_page(self.start_url)
55 |         page_num = page.select('ul.nav.navbar-nav.navbar-right > a')[-1].get_text()
56 |         public_video = re.findall(r'\d+', page_num)[0]
57 |         page_num = int(public_video) // 8
58 |         if page_num == 0:
59 |             page_num = 1
60 |             data = {
61 |                 'page_num': page_num,
62 |                 'public_video': int(public_video)
63 |             }
64 |             return data
65 |         else:
66 |             page_num += 1
67 |             data = {
68 |                 'page_num': page_num,
69 |                 'public_video': int(public_video)
70 |             }
71 |             return data
72 | 
73 |     def __parse_user(self):
74 |         end_num = self.public_datas['page_num']
75 |         urls = ['http://91porn.com/uvideos.php?UID={}&page={}'.format(str(self.uid), str(i)) for i in range(1, int(end_num + 1))]
76 |         # page = get_page(self.start_url)
77 |         for url in urls:
78 |             print(url)
79 |             page = get_page(url)
80 |             video_urls = page.select('div.well.well-sm > a')
81 |             video_ids = page.select('div.thumb-overlay > img')
82 |             video_names = page.select('span.video-title.title-truncate.m-t-5')
83 |             for video_url, vid, name in zip(video_urls, video_ids, video_names):
84 |                 data = {
85 |                     'url': video_url.get('href'),
86 |                     'id': vid.get('src').split('/')[-1].split('.')[0],
87 |                     'title': name.get_text()
88 |                 }
89 |                 yield data
90 | 
91 |     def parse_video(self):
92 |         video_data = []
93 |         for user_data in self.__parse_user():
94 |             print('Current task: %s' % user_data['url'])
95 |             page = get_page(user_data['url'])
96 |             m3u8 = page.find(text=re.compile('.*"%.*"'))
97 |             temp = m3u8.split('"')[-2]
98 |             m3u8_url = up.unquote(temp).split("'")[1]
99 |             new_data = {
100 |                 'url': user_data['url'],
101 |                 'id': user_data['id'],
102 |                 'title': user_data['title'],
103 |                 'm3u8': m3u8_url
104 |             }
105 |             video_data.append(new_data)
106 |             sleep(random.randint(1, 3))
107 |         up_users = page.select('span.title-yakov > a > span')[0].get_text()
108 |         all_data = {'uid': self.uid, 'name': up_users, 'data': video_data}
109 |         return all_data
110 | 
111 | class ClientSqlite:
112 | 
113 |     def __init__(self, dbName="./91user.db"):
114 |         self.conn = sqlite3.connect(dbName)
115 |         self.cur = self.conn.cursor()
116 |         self.create_table()
117 | 
118 |     def close_conn(self):
119 |         self.cur.close()
120 |         self.conn.close()
121 | 
122 |     def create_table(self):
123 |         sql = '''CREATE TABLE users(
124 |                         id INTEGER PRIMARY KEY AUTOINCREMENT,
125 |                         uid varchar(255) NOT NULL,
126 |                         name varchar(255) DEFAULT NULL,
127 |                         data text
128 |                         )'''
129 |         try:
130 |             self.cur.execute(sql)
131 |             self.conn.commit()
132 |             return True
133 |         except Exception as e:
134 |             # print('[ERROR]:%s' % e)
135 |             return False
136 | 
137 |     def fetchall_table(self, sql, limit_flag=True):
138 |         try:
139 |             self.cur.execute(sql)
140 |             if limit_flag:
141 |                 result = self.cur.fetchall()
142 |                 if len(result) > 0:
143 |                     return result
144 |                 else:
145 |                     return None
146 |             else:
147 |                 result = self.cur.fetchone()
148 |                 if len(result) > 0:
149 |                     return result
150 |                 else:
151 |                     return None
152 |         except Exception as e:
153 |             print('[SELECT TABLE ERROR]:%s' % e)
154 |             return None
155 | 
156 |     def insert_update_table(self, sql):
157 |         try:
158 |             self.cur.execute(sql)
159 |             self.conn.commit()
160 |             return True
161 |         except Exception as e:
162 |             print('[INSERT/UPDATE TABLE ERROR]:%s' % e)
163 |             return False
164 | 
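# --- Illustrative usage sketch (not part of the original user.py) ---
# 91user/run.py presumably wires User and ClientSqlite together roughly like
# this; the INSERT statement is only meant to show the intended flow.
#
#     import json
#     from config.uids import USERS_UID   # the list of target uids
#
#     db = ClientSqlite()
#     for uid in USERS_UID:
#         all_data = User(uid).parse_video()
#         db.insert_update_table(
#             "INSERT INTO users (uid, name, data) VALUES ('{}', '{}', '{}')".format(
#                 all_data['uid'], all_data['name'], json.dumps(all_data['data'])))
#     db.close_conn()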
--------------------------------------------------------------------------------
/proxy_pool/README.md:
--------------------------------------------------------------------------------
1 | 
2 | Crawler IP proxy pool
3 | =======
4 | [![Build Status](https://travis-ci.org/jhao104/proxy_pool.svg?branch=master)](https://travis-ci.org/jhao104/proxy_pool)
5 | [![](https://img.shields.io/badge/Powered%20by-@j_hao104-green.svg)](http://www.spiderpy.cn/blog/)
6 | [![Requirements Status](https://requires.io/github/jhao104/proxy_pool/requirements.svg?branch=master)](https://requires.io/github/jhao104/proxy_pool/requirements/?branch=master)
7 | [![Packagist](https://img.shields.io/packagist/l/doctrine/orm.svg)](https://github.com/jhao104/proxy_pool/blob/master/LICENSE)
8 | [![GitHub contributors](https://img.shields.io/github/contributors/jhao104/proxy_pool.svg)](https://github.com/jhao104/proxy_pool/graphs/contributors)
9 | [![](https://img.shields.io/badge/language-Python-green.svg)](https://github.com/jhao104/proxy_pool)
10 | 
11 |       ______  ______  _
12 |      | ___ \_ | ___ \ | |
13 |      | |_/ / \__ __ __ _  __ _ | |_/ /___  ___ | |
14 |      |  __/|  _// _ \ \ \/ /| | | || __// _ \ / _ \ | |
15 |      | |   | | | (_) |  >  < \ |_| || |  | (_) | (_) || |___
16 |      \_|   |_|  \___/ /_/\_\  \__ |\_|  \___/ \___/ \_____\
17 |                               __ / /
18 |                              /___ /
19 | 
20 | ##### [Introduction document](https://github.com/jhao104/proxy_pool/blob/master/doc/introduce.md)
21 | 
22 | * Supported versions: ![](https://img.shields.io/badge/Python-2.x-green.svg) ![](https://img.shields.io/badge/Python-3.x-blue.svg)
23 | 
24 | * Test deployment: http://118.24.52.95 (single machine, please don't stress-test it, thanks; malicious visitors end up on the [block list](https://github.com/jhao104/proxy_pool/blob/bff423dffe6e2881ee45d5b66d8a6ad682c8e4ab/doc/block_ips.md))
25 | 
26 | ### Download and install
27 | 
28 | * Get the source:
29 | 
30 | ```shell
31 | git clone git@github.com:jhao104/proxy_pool.git
32 | 
33 | # or download a zip file from https://github.com/jhao104/proxy_pool/releases
34 | ```
35 | 
36 | * Install the dependencies:
37 | 
38 | ```shell
39 | pip install -r requirements.txt
40 | ```
41 | 
42 | * Configure Config/setting.py:
43 | 
44 | ```python
45 | # Config/setting.py is the project configuration file
46 | 
47 | # database configuration
48 | DATABASES = {
49 |     "default": {
50 |         "TYPE": "SSDB",        # SSDB and REDIS are currently supported
51 |         "HOST": "127.0.0.1",   # db host
52 |         "PORT": 8888,          # db port, e.g. SSDB usually uses 8888, REDIS defaults to 6379
53 |         "NAME": "proxy",       # db name (default)
54 |         "PASSWORD": ""         # db password
55 | 
56 |     }
57 | }
58 | 
59 | 
60 | # ProxyGetter configuration
61 | 
62 | PROXY_GETTER = [
63 |     "freeProxy01",      # names of the enabled proxy-fetching functions; extend them in ProxyGetter/getFreeProxy.py
64 |     "freeProxy02",
65 |     ....
66 | ]
67 | 
68 | 
69 | # API server configuration
70 | 
71 | SERVER_API = {
72 |     "HOST": "0.0.0.0",  # listen address, 0.0.0.0 listens on all IPs
73 |     "PORT": 5010  # listen port
74 | }
75 | 
76 | # with the settings above, the proxy pool is reachable at http://127.0.0.1:5010 once started
77 | 
78 | ```
79 | 
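  Before starting anything it can be worth checking that the database configured in `DATABASES` is actually reachable. SSDB speaks a subset of the Redis protocol, so a quick `ping` through `redis-py` works for either backend; the sketch below simply reuses the host/port values from the example above, so adjust them to your own setting.py:

```python
import redis

# host/port taken from the DATABASES example above (SSDB default: 8888, Redis default: 6379)
client = redis.Redis(host="127.0.0.1", port=8888)

try:
    client.ping()
    print("proxy database is reachable")
except redis.ConnectionError as exc:
    print("cannot reach the proxy database: %s" % exc)
```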
80 | * Start it up:
81 | 
82 | ```shell
83 | # Once the dependencies are installed and everything is in place, start it from the cli directory via proxyPool.py.
84 | # The program has two parts: schedule (the scheduler) and webserver (the API service).
85 | 
86 | # start the scheduler first
87 | >>>python proxyPool.py schedule
88 | 
89 | # then start the web API service
90 | >>>python proxyPool.py webserver
91 | 
92 | 
93 | ```
94 | 
95 | ### Docker
96 | 
97 | ```bash
98 | docker pull jhao104/proxy_pool
99 | 
100 | # remote database
101 | docker run --env db_type=REDIS --env db_host=x.x.x.x --env db_port=6379 --env db_password=pwd_str -p 5010:5010 jhao104/proxy_pool
102 | 
103 | # database running on the Docker host
104 | docker run --env db_type=REDIS --env db_host=host.docker.internal --env db_port=6379 --env db_password=pwd_str -p 5010:5010 jhao104/proxy_pool
105 | 
106 | ```
107 | 
108 | 
109 | ### Usage
110 | 
111 |   A few minutes after start-up you should see the proxies that have been fetched. You can look at them directly in the database; the [SSDB admin tool](https://github.com/jhao104/SSDBAdmin) is recommended for that.
112 | 
113 |   You can also check them through the API at http://127.0.0.1:5010.
114 | 
115 | * Api
116 | 
117 | | api | method | description | arg |
118 | | ----| ---- | ---- | ----|
119 | | / | GET | API introduction | None |
120 | | /get | GET | get one proxy at random | None|
121 | | /get_all | GET | get all proxies |None|
122 | | /get_status | GET | show the number of proxies |None|
123 | | /delete | GET | delete a proxy |proxy=host:ip|
124 | 
125 | * Use in a spider
126 | 
127 |   To use the pool from spider code, wrap the API in small helper functions and call them directly, for example:
128 | 
129 | ```python
130 | import requests
131 | 
132 | def get_proxy():
133 |     return requests.get("http://127.0.0.1:5010/get/").json()
134 | 
135 | def delete_proxy(proxy):
136 |     requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
137 | 
138 | # your spider code
139 | 
140 | def getHtml():
141 |     # ....
142 |     retry_count = 5
143 |     proxy = get_proxy().get("proxy")
144 |     while retry_count > 0:
145 |         try:
146 |             html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)})
147 |             # request made through the proxy
148 |             return html
149 |         except Exception:
150 |             retry_count -= 1
151 |     # failed 5 times: remove the proxy from the pool
152 |     delete_proxy(proxy)
153 |     return None
154 | ```
155 | 
156 | ### Extending the proxy getters
157 | 
158 |   The project ships with a few free proxy sources, but free proxies are not of great quality, so the proxies you get out of the box may be disappointing. For that reason the proxy-fetching methods are designed to be extended.
159 | 
160 |   To add a new proxy-fetching method:
161 | 
162 | * 1. First add your own static method to the [GetFreeProxy](https://github.com/jhao104/proxy_pool/blob/b9ccdfaada51b57cfb1bbd0c01d4258971bc8352/ProxyGetter/getFreeProxy.py#L32) class.
163 | The method must be a generator that yields proxies in `host:ip` format, for example:
164 | 
165 | ```python
166 | 
167 | class GetFreeProxy(object):
168 |     # ....
169 | 
170 |     # your own method
171 |     @staticmethod
172 |     def freeProxyCustom():  # any name that does not clash with an existing one
173 | 
174 |         # fetch proxies from some website, API or database, in whatever way you like
175 |         # suppose you end up with a list of proxies
176 |         proxies = ["139.129.166.68:3128", "139.129.166.61:3128", ...]
177 |         for proxy in proxies:
178 |             yield proxy
179 |         # just make sure every proxy is in the correct host:ip format
180 | ```
181 | 
182 | * 2. After adding the method, edit the `PROXY_GETTER` item in Config/setting.py:
183 | 
184 |   Add the name of your custom method under `PROXY_GETTER`:
185 | 
186 | ```python
187 | PROXY_GETTER = [
188 |     "freeProxy01",
189 |     "freeProxy02",
190 |     ....
191 |     "freeProxyCustom"  # make sure this matches the name of the method you added
192 | ]
193 | ```
194 | 
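  Putting the two steps together: a custom getter will usually fetch a page or an API and pull `host:port` pairs out of it. The sketch below is only illustrative; `free-proxy.example.com` is a made-up URL and the regular expression has to match whatever source you actually scrape:

```python
import re
import requests

class GetFreeProxy(object):
    # ....

    @staticmethod
    def freeProxyCustom():
        # hypothetical source page that lists proxies as host:port
        resp = requests.get("http://free-proxy.example.com/list", timeout=10)
        for host, port in re.findall(r"(\d+\.\d+\.\d+\.\d+):(\d+)", resp.text):
            yield "{}:{}".format(host, port)
```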
195 | 
196 |   `ProxySchedule` fetches proxies at a fixed interval; on its next run it automatically picks up and calls the method you defined.
197 | 
198 | ### Proxy sources
199 | 
200 | The free-proxy sites currently implemented are listed below (in no particular order; this only describes the free proxies they publish. For a review of paid proxies see [here](https://zhuanlan.zhihu.com/p/33576641)):
201 | 
202 | | Provider | Status | Update frequency | Usable rate | Blocked in China | Link |
203 | | ----- | ---- | -------- | ------ | --------- | ----- |
204 | | 无忧代理 | available | every few minutes | * | no | [link](http://www.data5u.com/free/index.html) |
205 | | 66代理 | available | very slow updates | * | no | [link](http://www.66ip.cn/) |
206 | | 西刺代理 | available | every few minutes | * | no | [link](http://www.xicidaili.com)|
207 | | 全网代理 | available | every few minutes | * | no | [link](http://www.goubanjia.com/)|
208 | | 训代理 | free proxies discontinued | * | * | no | [link](http://www.xdaili.cn/)|
209 | | 快代理 | available | every few minutes | * | no | [link](https://www.kuaidaili.com/)|
210 | | 云代理 | available | every few minutes | * | no | [link](http://www.ip3366.net/)|
211 | | IP海 | available | every few hours | * | no | [link](http://www.iphai.com/)|
212 | | 免费IP代理库 | available | fast | * | no | [link](http://ip.jiangxianli.com/)|
213 | | 中国IP地址 | available | every few minutes | * | yes | [link](http://cn-proxy.com/)|
214 | | Proxy List | available | every few minutes | * | yes | [link](https://proxy-list.org/chinese/index.php)|
215 | | ProxyList+ | available | every few minutes | * | yes | [link](https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1)|
216 | 
217 | If you know of other good free-proxy sites, please submit them in [issues](https://github.com/jhao104/proxy_pool/issues/71); support for them will be considered in a future update.
218 | 
219 | ### Feedback
220 | 
221 |   Feel free to report any problem in [Issues](https://github.com/jhao104/proxy_pool/issues). If you don't have a GitHub account you can leave a message on my [blog](http://www.spiderpy.cn/blog/message).
222 | 
223 |   Your feedback makes this project better.
224 | 
225 | ### Contributing
226 | 
227 |   This project is meant to stay a basic, general-purpose proxy-pool architecture, so it does not take on niche features (particularly good ideas are, of course, an exception).
228 | 
229 |   The project is still far from perfect. If you find a bug or want to add a feature, please describe it in [Issues](https://github.com/jhao104/proxy_pool/issues) first, and submit your code once it has been confirmed.
230 | 
231 |   Many thanks to the following contributors for their work:
232 | 
233 |   [@kangnwh](https://github.com/kangnwh)| [@bobobo80](https://github.com/bobobo80)| [@halleywj](https://github.com/halleywj)| [@newlyedward](https://github.com/newlyedward)| [@wang-ye](https://github.com/wang-ye)| [@gladmo](https://github.com/gladmo)| [@bernieyangmh](https://github.com/bernieyangmh)| [@PythonYXY](https://github.com/PythonYXY)| [@zuijiawoniu](https://github.com/zuijiawoniu)| [@netAir](https://github.com/netAir)| [@scil](https://github.com/scil)| [@tangrela](https://github.com/tangrela)| [@highroom](https://github.com/highroom)| [@luocaodan](https://github.com/luocaodan)| [@vc5](https://github.com/vc5)| [@1again](https://github.com/1again)| [@obaiyan](https://github.com/obaiyan)
234 | 
235 | 
236 | ### Release Notes
237 | 
238 | [release notes](https://github.com/jhao104/proxy_pool/blob/master/doc/release_notes.md)
239 | 
240 | 
--------------------------------------------------------------------------------
/proxy_pool/ProxyGetter/getFreeProxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 | """
4 | -------------------------------------------------
5 |    File Name:     GetFreeProxy.py
6 |    Description :  fetch free proxies
7 |    Author :       JHao
8 |    date:          2016/11/25
9 | -------------------------------------------------
10 |    Change Activity:
11 |                    2016/11/25:
12 | -------------------------------------------------
13 | """
14 | import re
15 | import sys
16 | import requests
17 | from time import sleep
18 | 
19 | sys.path.append('..')
20 | 
21 | from Util.WebRequest import WebRequest
22 | from Util.utilFunction import getHtmlTree
23 | 
24 | # for debug to disable insecureWarning
25 | requests.packages.urllib3.disable_warnings()
26 | 
27 | 
28 | class 
GetFreeProxy(object): 29 | """ 30 | proxy getter 31 | """ 32 | 33 | @staticmethod 34 | def freeProxy01(): 35 | """ 36 | 无忧代理 http://www.data5u.com/ 37 | 几乎没有能用的 38 | :return: 39 | """ 40 | url_list = [ 41 | 'http://www.data5u.com/', 42 | 'http://www.data5u.com/free/gngn/index.shtml', 43 | 'http://www.data5u.com/free/gnpt/index.shtml' 44 | ] 45 | key = 'ABCDEFGHIZ' 46 | for url in url_list: 47 | html_tree = getHtmlTree(url) 48 | ul_list = html_tree.xpath('//ul[@class="l2"]') 49 | for ul in ul_list: 50 | try: 51 | ip = ul.xpath('./span[1]/li/text()')[0] 52 | classnames = ul.xpath('./span[2]/li/attribute::class')[0] 53 | classname = classnames.split(' ')[1] 54 | port_sum = 0 55 | for c in classname: 56 | port_sum *= 10 57 | port_sum += key.index(c) 58 | port = port_sum >> 3 59 | yield '{}:{}'.format(ip, port) 60 | except Exception as e: 61 | print(e) 62 | 63 | @staticmethod 64 | def freeProxy02(count=20): 65 | """ 66 | 代理66 http://www.66ip.cn/ 67 | :param count: 提取数量 68 | :return: 69 | """ 70 | urls = [ 71 | "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=", 72 | "http://www.66ip.cn/nmtq.php?getnum={}&isp=0&anonymoustype=0&s" 73 | "tart=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip" 74 | ] 75 | 76 | try: 77 | import execjs 78 | import requests 79 | 80 | headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 81 | 'Accept': '*/*', 82 | 'Connection': 'keep-alive', 83 | 'Accept-Language': 'zh-CN,zh;q=0.8'} 84 | session = requests.session() 85 | src = session.get("http://www.66ip.cn/", headers=headers).text 86 | src = src.split("")[0] + '}' 87 | src = src.replace("