├── .idea ├── .gitignore ├── crawlProject.iml ├── inspectionProfiles │ ├── Project_Default.xml │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── README.md ├── README_img ├── alipay.JPG └── wechatPay.JPG ├── feapder篇 └── air-spider │ └── RedBook │ ├── MYSQLDB.py │ ├── README.md │ ├── __init__.py │ ├── custom_pipeline │ ├── __init__.py │ └── csvPipeline.py │ ├── items │ ├── __init__.py │ └── items.py │ ├── main.py │ ├── midware │ ├── __init__.py │ ├── add_cookie.py │ ├── get_XsXt.py │ ├── get_sign.py │ └── jsCode │ │ └── jsss.js │ ├── setting.py │ ├── spiders │ ├── __init__.py │ ├── get_comment.py │ └── get_homefeed.py │ └── types │ ├── QueryJsonType.py │ └── __init__.py ├── note.txt ├── 基础篇 ├── lxml&re │ ├── 4k图片爬取.py │ ├── 58同城分页爬取.py │ ├── GetFakeUA.py │ ├── bs4案例.py │ ├── bs4解析基础.py │ ├── test.html │ ├── xpath基础.py │ ├── 正则练习.py │ ├── 正则解析.py │ └── 简历爬取.py ├── request │ ├── 01-Request.py │ ├── 02-(UA)网页采集器.py │ ├── 03-(POST)百度翻译.py │ ├── 04-豆瓣电影爬取.py │ └── 05-肯德基餐厅位置查询.py ├── scrapy │ ├── bossjob │ │ ├── __init__.py │ │ ├── bossjob │ │ │ ├── __init__.py │ │ │ ├── fakeCookie.py │ │ │ ├── fake_useragent.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── requset.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── boss.py │ │ ├── scrapy.cfg │ │ └── vimm_chrome_proxyauth_plugin.zip │ ├── caipiao │ │ ├── __init__.py │ │ ├── caipiao │ │ │ ├── __init__.py │ │ │ ├── fake_useragent.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── seq.py │ │ └── scrapy.cfg │ ├── imgsPro │ │ ├── __init__.py │ │ ├── imgsPro │ │ │ ├── __init__.py │ │ │ ├── fake_useragent.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── img.py │ │ └── scrapy.cfg │ ├── paper │ │ ├── __init__.py │ │ ├── paper │ │ │ ├── __init__.py │ │ │ ├── fake_useragent.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ └── page.py │ │ └── scrapy.cfg │ ├── sunPro │ │ ├── __init__.py │ │ ├── scrapy.cfg │ │ └── sunPro │ │ │ ├── __init__.py │ │ │ ├── fake_useragent.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── sun.py │ ├── wangyi │ │ ├── __init__.py │ │ ├── news.txt │ │ ├── scrapy.cfg │ │ └── wangyi │ │ │ ├── __init__.py │ │ │ ├── fake_useragent.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── news.py │ ├── xiaohua │ │ ├── __init__.py │ │ ├── scrapy.cfg │ │ └── xiaohua │ │ │ ├── __init__.py │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ ├── Xiaohua.py │ │ │ └── __init__.py │ └── yiche │ │ ├── 1.txt │ │ ├── __init__.py │ │ ├── scrapy.cfg │ │ ├── test.py │ │ └── yiche │ │ ├── __init__.py │ │ ├── fake_useragent.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── spiders │ │ ├── __init__.py │ │ └── car.py │ │ └── test.py └── 高性能异步爬虫 │ ├── flask_server.py │ ├── meinv.py │ ├── minxing.py │ ├── 协程.py │ ├── 多任务协程01.py │ ├── 多任务异步协程02.py │ ├── 线程池的基本使用.py │ └── 线程池的应用.py ├── 自动化篇 ├── playwright │ ├── 使用本地浏览器创建debug模式 │ │ ├── README.md │ │ └── auto.py │ ├── 反检测浏览器 │ │ ├── README.md │ │ ├── demo.py │ │ └── 
stealth.min.js │ ├── 起点vip │ │ ├── 10086.png │ │ ├── README.md │ │ ├── demo.py │ │ └── demo2.py │ ├── 邮政编码 │ │ └── hello.py │ └── 采集资料 │ │ └── auto.py └── selenium │ ├── 12306模拟登录.py │ ├── Twisted-20.3.0-cp39-cp39-win_amd64.whl │ ├── chromedriver.exe │ ├── damai.py │ ├── selenium其他自动化操作.py │ ├── selenium模拟登录.py │ ├── 动作链和iframe的处理.py │ └── 谷歌无头浏览器+反检测.py ├── 进阶篇 ├── js逆向 │ ├── wasm │ │ └── air │ │ │ ├── Flight.py │ │ │ ├── README.md │ │ │ ├── acw_tc_3.py │ │ │ ├── ddd.js │ │ │ ├── demo.js │ │ │ └── refer_1306.js │ ├── webPack │ │ └── 五矿 │ │ │ ├── crwalBase.py │ │ │ ├── demo.js │ │ │ ├── encode.py │ │ │ ├── webPack.js │ │ │ └── 五矿.py │ ├── 浏览器指纹检测 │ │ └── 易九批 │ │ │ ├── demo.js │ │ │ └── test.py │ ├── 环境检测 │ │ ├── BossJob │ │ │ ├── BossJob.py │ │ │ ├── chaojiying.py │ │ │ ├── cityCode.json │ │ │ ├── demo.js │ │ │ └── 点选.py │ │ ├── RedBook │ │ │ ├── README.md │ │ │ ├── RedBook.py │ │ │ ├── demo.py │ │ │ └── new │ │ │ │ └── jssss.js │ │ ├── pdd │ │ │ ├── demo.py │ │ │ └── hello.js │ │ ├── 猿人学2023第一题 │ │ │ ├── demo.js │ │ │ └── test.py │ │ └── 饿了么 │ │ │ ├── eleme.py │ │ │ ├── env.js │ │ │ ├── etSign.js │ │ │ └── hello.html │ └── 请求头请求体加密 │ │ ├── b站登录 │ │ └── demo.py │ │ ├── fjs │ │ ├── demo.js │ │ ├── fjs.py │ │ └── sign.js │ │ ├── football │ │ ├── 599_info.py │ │ └── js │ │ │ ├── demo.js │ │ │ └── sss.js │ │ ├── weather │ │ ├── getParams.js │ │ ├── weather_class.py │ │ └── 请求.py │ │ ├── 唯一艺术 │ │ ├── demo.js │ │ └── test.py │ │ ├── 娱乐指数 │ │ ├── demo.js │ │ └── ylzs.py │ │ ├── 广东省公共资源交易 │ │ ├── demo.js │ │ └── guang.py │ │ ├── 有道翻译 │ │ ├── demo.js │ │ └── youdao.py │ │ ├── 烯牛数据 │ │ ├── demo.js │ │ └── xiniu.py │ │ ├── 网易云音乐 │ │ ├── comment │ │ │ ├── comment_of_1297486027.csv │ │ │ └── comment_of_488249475.csv │ │ ├── decrpyo.py │ │ ├── demo.js │ │ └── music.py │ │ ├── 艺恩数据 │ │ ├── demo.js │ │ └── yien.py │ │ └── 行行查 │ │ ├── demo.js │ │ └── hanghangcha.py ├── 基础综合 │ ├── Instagram │ │ └── downloader.py │ ├── bilibili │ │ └── checkLike.py │ ├── dandanzan │ │ ├── M3U8.py │ │ ├── drama.py │ │ ├── main.py │ │ ├── movie.py │ │ └── variety.py │ ├── douyin全站爬取 │ │ ├── douyin.py │ │ └── x-b.js │ ├── weibo全站爬取 │ │ ├── base.py │ │ └── weibo.py │ ├── 语音爬虫 │ │ ├── baidu.py │ │ ├── main.py │ │ ├── media │ │ │ ├── baidu │ │ │ │ ├── kr │ │ │ │ │ └── all.mp3 │ │ │ │ └── wenku.txt │ │ │ │ │ └── all.mp3 │ │ │ └── youdao │ │ │ │ ├── kr │ │ │ │ └── all.mp3 │ │ │ │ ├── kr_exam │ │ │ │ └── all.mp3 │ │ │ │ └── wenku.txt │ │ │ │ └── all.mp3 │ │ ├── mix_media.py │ │ ├── text │ │ │ ├── en │ │ │ ├── kr │ │ │ ├── kr_exam.txt │ │ │ ├── wenku.txt │ │ │ ├── zh.txt │ │ │ └── zh_kr_exam │ │ └── youdao.py │ └── 验证码相关 │ │ ├── 古诗文网验证码识别.py │ │ ├── 模拟登录.py │ │ ├── 模拟登录之后的数据爬取.py │ │ └── 验证码测试.py └── 爬虫轮子 │ ├── README.md │ └── crawl.py └── 验证码篇 ├── 滑块篇 ├── JD滑块 │ ├── JD_Slide.py │ └── demo.js ├── 阿里系226 │ ├── 226.py │ └── README.md └── 飞瓜登录验证码定制阿里系滑块 │ ├── README.md │ ├── send.py │ └── test.py └── 点选篇 └── 极验三代文字点选 ├── demo.py └── main.js /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # 默认忽略的文件 2 | /shelf/ 3 | /workspace.xml 4 | # 基于编辑器的 HTTP 客户端请求 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /.idea/crawlProject.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- 
/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README_img/alipay.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/README_img/alipay.JPG -------------------------------------------------------------------------------- /README_img/wechatPay.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/README_img/wechatPay.JPG -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/MYSQLDB.py: -------------------------------------------------------------------------------- 1 | from feapder.db.mysqldb import MysqlDB 2 | 3 | ############################################################################## 4 | # SQL语句 5 | # 创建笔记表 6 | create_note_table_sql = ''' 7 | CREATE TABLE IF NOT EXISTS note ( 8 | note_id varchar(255) PRIMARY KEY, 9 | note_type varchar(255), 10 | display_title TEXT, 11 | note_cover TEXT, 12 | liked_count INTEGER, 13 | user_name varchar(255), 14 | user_id varchar(255), 15 | avatar TEXT 16 | ) 17 | ''' 18 | create_note_comment_table_sql = ''' 19 | CREATE TABLE IF NOT EXISTS comment ( 20 | comment_id varchar(255) PRIMARY KEY, 21 | note_id varchar(255), 22 | target_comment varchar(255), 23 | content TEXT, 24 | like_count varchar(100), 25 | user_name varchar(255), 26 | user_id varchar(255) 27 | ) 28 | ''' 29 | ############################################################################## 30 | 31 | if __name__ == '__main__': 32 | db = MysqlDB() 33 | # 创建note表 34 | # db.execute(create_note_table_sql) 35 | # 创建评论表 36 | db.execute(create_note_comment_table_sql) -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/README.md: -------------------------------------------------------------------------------- 1 | # RedBook爬虫文档 2 | 3 | ## 数据库设计 4 | 见MYSQLDB文件 5 | ## 爬虫逻辑 6 | 暂时分为两个部分,获取主页帖子以及获取评论帖子 7 | 使用时需要下载feapder 8 | 需要nodejs环境 9 | ```bash 10 | npm install jsdom 11 | npm install touch-cookie 12 | ``` 13 | ## 项目架构 14 | 支持数据库存储 15 | 支持csv存储 16 | 如果使用mysql需要配置数据库 17 | 如果使用csv需要配置csv文件名,路径已经做好 18 | 并且需要按照js代码的指示完成加密参数生成 19 | 20 | ## 之后新增 21 | 1. 更多功能 22 | 2. 用户池 23 | 3. ... 
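## Configuration sketch

The storage options above are driven by `RedBook/setting.py`, which the README only mentions in passing. The sketch below shows just the storage-related entries, assuming feapder's standard MySQL setting names (`MYSQL_IP`, `MYSQL_PORT`, `MYSQL_DB`, `MYSQL_USER_NAME`, `MYSQL_USER_PASS`) and the custom `CSV_PATH` key read by `csvPipeline.py`; all values are placeholders, not the project's real configuration.

```python
# Storage-related entries of RedBook/setting.py (sketch; values are placeholders)

# MySQL connection used by MysqlDB in MYSQLDB.py and by feapder's MysqlPipeline
MYSQL_IP = "127.0.0.1"
MYSQL_PORT = 3306
MYSQL_DB = "redbook"
MYSQL_USER_NAME = "root"
MYSQL_USER_PASS = ""

# File name written by the custom CSV pipeline (under RedBook/custom_pipeline/csv_data/)
CSV_PATH = "notes.csv"

# Enable one or both pipelines
ITEM_PIPELINES = [
    "feapder.pipelines.mysql_pipeline.MysqlPipeline",
    "RedBook.custom_pipeline.csvPipeline.CsvPipeline",
]
```

Note that `csvPipeline.py` opens `RedBook/custom_pipeline/csv_data/<CSV_PATH>` directly, so that directory has to exist before a run.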
-------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/feapder篇/air-spider/RedBook/__init__.py -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/custom_pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/feapder篇/air-spider/RedBook/custom_pipeline/__init__.py -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/custom_pipeline/csvPipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from feapder.pipelines import BasePipeline 4 | from feapder.utils.log import log 5 | 6 | from RedBook.setting import CSV_PATH 7 | import csv 8 | 9 | 10 | class CsvPipeline(BasePipeline): 11 | def __init__(self): 12 | path = os.path.abspath(os.path.dirname(__file__) + '/csv_data/' + CSV_PATH) 13 | self.f = open(path, "w", encoding="utf-8", newline='') 14 | self.len = 0 15 | 16 | def save_items(self, table, items) -> bool: 17 | try: 18 | header = list(items[0].keys()) 19 | writer = csv.DictWriter(self.f, header) 20 | # 如果文件为空,写入表头 21 | if self.len == 0: 22 | writer.writeheader() 23 | self.len = 1 24 | writer.writerows(items) 25 | self.f.flush() # 刷新缓冲区到磁盘 26 | log.info(f"CSV管道 ===> {table},写入csv文件成功") 27 | return True 28 | except Exception as e: 29 | log.error(f"CSV管道 ===> 错误信息: {e}") 30 | log.error(f"CSV管道 ===> {table},写入csv文件失败") 31 | return False 32 | 33 | def close(self): 34 | self.f.close() 35 | -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/items/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/feapder篇/air-spider/RedBook/items/__init__.py -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/items/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2023-09-09 12:22:13 4 | --------- 5 | @summary: 6 | --------- 7 | @author: dongxishan 8 | """ 9 | 10 | from feapder import Item 11 | 12 | 13 | class NoteItem(Item): 14 | """ 15 | 帖子实体 16 | """ 17 | 18 | __table_name__ = "note" 19 | __unique_key__ = "note_id" 20 | 21 | def __init__(self, **kwargs): 22 | super().__init__(**kwargs) 23 | self.avatar = kwargs.get('avatar') 24 | self.display_title = kwargs.get('display_title') 25 | self.liked_count = kwargs.get('liked_count') 26 | self.note_cover = kwargs.get('note_cover') 27 | self.note_id = kwargs.get('note_id') 28 | self.note_type = kwargs.get('note_type') 29 | self.user_id = kwargs.get('user_id') 30 | self.user_name = kwargs.get('user_name') 31 | 32 | 33 | class Comment(Item): 34 | """ 35 | 评论实体 36 | """ 37 | __table_name__ = "comment" 38 | __unique_key__ = "comment_id" 39 | 40 | def __init__(self, **kwargs): 41 | super().__init__(**kwargs) 42 | self.note_id = kwargs.get('note_id') 43 | self.comment_id = kwargs.get('comment_id') 44 | self.content = kwargs.get('content') 45 | self.target_comment = 
kwargs.get('target_comment') 46 | self.user_id = kwargs.get('user_id') 47 | self.user_name = kwargs.get('user_name') -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2023-09-05 22:01:40 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: dong xi shan 8 | """ 9 | 10 | if __name__ == "__main__": 11 | pass 12 | 13 | # main.py作为爬虫启动的统一入口,提供命令行的方式启动多个爬虫,若只有一个爬虫,可不编写main.py 14 | # 将上面的xxx修改为自己实际的爬虫名 15 | # 查看运行命令 python main.py --help 16 | # AirSpider与Spider爬虫运行方式 python main.py --crawl_xxx 17 | # BatchSpider运行方式 18 | # 1. 下发任务:python main.py --crawl_xxx 1 19 | # 2. 采集:python main.py --crawl_xxx 2 20 | # 3. 重置任务:python main.py --crawl_xxx 3 21 | -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/midware/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/feapder篇/air-spider/RedBook/midware/__init__.py -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/midware/add_cookie.py: -------------------------------------------------------------------------------- 1 | def add_cookie(request): 2 | # 这个中间件是用来添加cookie的,注意此处的cookie需要和js补环境的cookie一致 3 | request.cookies = { 4 | 5 | } 6 | 7 | return request 8 | -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/midware/get_XsXt.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from urllib.parse import urlparse, urlencode 4 | 5 | import execjs 6 | 7 | 8 | def get_XsXt(api: str, payload: dict = None) -> dict: 9 | # 获取RedBook/midware/jsCode/jsss.js的绝对路径 10 | js_path: str = os.path.abspath(os.path.dirname(__file__) + '/jsCode/jsss.js') 11 | js = execjs.compile(open(js_path, 'r', encoding='utf-8').read()) 12 | # 加密参数 13 | ctx: dict = js.call('XsXt', api, payload) 14 | return { 15 | 'x-s': ctx['X-s'], 16 | 'x-t': str(ctx['X-t']), 17 | 'content-type': 'application/json;charset=UTF-8', 18 | 'accept': 'application/json, text/plain, */*', 19 | } 20 | 21 | 22 | def add_XsXt(request): 23 | url = request.url 24 | parsed_url = urlparse(url) 25 | path = parsed_url.path 26 | method = request.method 27 | 28 | if method == 'GET': 29 | # GET请求需要拼接params和url 30 | params = request.params 31 | path = path + '?' + urlencode(params) 32 | request.headers = get_XsXt(path) 33 | elif method == 'POST': 34 | # POST请求需要加密payload 35 | json_data = request.json 36 | request.data = json.dumps(json_data, ensure_ascii=False, 37 | separators=(',', ':')).encode() 38 | request.headers = get_XsXt(path, json_data) 39 | return request 40 | -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/midware/get_sign.py: -------------------------------------------------------------------------------- 1 | from hashlib import md5 2 | from urllib.parse import urlparse, urlencode 3 | 4 | 5 | def add_sign(request): 6 | url = request.url 7 | parsed_url = urlparse(url) 8 | path = parsed_url.path 9 | params = request.params 10 | if params: 11 | path += '?' 
+ urlencode(params) 12 | obj = md5() 13 | obj.update(f'{path}WSUDD'.encode('utf-8')) 14 | request.headers = { 15 | 'x-sign': 'X' + obj.hexdigest() 16 | } 17 | 18 | return request 19 | -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "get_homefeed", 3 | "test_spider" 4 | ] -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/spiders/get_homefeed.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2023-09-08 19:08:21 4 | --------- 5 | @summary: 6 | --------- 7 | @author: dongxishan 8 | """ 9 | 10 | import feapder 11 | from feapder.utils.log import log 12 | 13 | from RedBook.items.items import NoteItem 14 | from RedBook.midware.add_cookie import add_cookie 15 | from RedBook.midware.get_XsXt import add_XsXt 16 | from RedBook.types.QueryJsonType import QueryHomeFeedNote 17 | from RedBook.setting import CSV_PATH, ITEM_PIPELINES 18 | 19 | 20 | class GetHomeFeed(feapder.AirSpider): 21 | MAPPING: dict = dict(zip( 22 | range(10), 23 | [ 24 | 'homefeed_recommend', 'homefeed.fashion_v3', 'homefeed.food_v3', 'homefeed.cosmetics_v3', 25 | 'homefeed.movie_and_tv_v3', 'homefeed.career_v3', 'homefeed.love_v3', 26 | 'homefeed.household_product_v3', 'homefeed.gaming_v3', 'homefeed.travel_v3', 'homefeed.fitness_v3' 27 | ] 28 | )) 29 | # 默认采集主页推荐 30 | QUERY = QueryHomeFeedNote( 31 | cursor_score='', 32 | num=36, 33 | refresh_type=1, 34 | note_index=29, 35 | unread_begin_note_id='', 36 | unread_end_note_id='', 37 | unread_note_count=0, 38 | category='homefeed_recommend' 39 | ) 40 | 41 | def __init__(self, types: int = 0): 42 | super().__init__() 43 | self.QUERY['category'] = self.MAPPING[types] 44 | if "RedBook.custom_pipeline.csvPipeline.CsvPipeline" in ITEM_PIPELINES: 45 | log.info("csvPipeline已启用, 保存文件路径为: RedBook/custom/csv_data/{}".format(CSV_PATH)) 46 | 47 | def start_requests(self): 48 | yield feapder.Request( 49 | url="https://edith.xiaohongshu.com/api/sns/web/v1/homefeed", 50 | method='POST', 51 | json=self.QUERY, 52 | download_midware=[add_cookie, add_XsXt] 53 | ) 54 | 55 | def parse(self, request, response): 56 | resp = response.json 57 | cursor_score = resp.get('data', {}).get('cursor_score') 58 | if cursor_score: 59 | self.QUERY['cursor_score'] = cursor_score 60 | yield feapder.Request( 61 | url="https://edith.xiaohongshu.com/api/sns/web/v1/homefeed", 62 | method='POST', 63 | json=self.QUERY, 64 | download_midware=[add_cookie, add_XsXt] 65 | ) 66 | data = resp.get('data', {}).get('items') 67 | for note in data: 68 | if note.get('hot_query'): 69 | continue 70 | else: 71 | yield self.parse_dict(note) 72 | 73 | @staticmethod 74 | def parse_dict(note: dict) -> NoteItem: 75 | id = note.get('id') 76 | note = note.get('note_card') 77 | user_name = note.get('user', {}).get('nick_name') 78 | if not user_name: 79 | user_name = note.get('user', {}).get('nickname') 80 | 81 | return NoteItem( 82 | note_id=id, 83 | note_type=note.get('type'), 84 | display_title=note.get('display_title'), 85 | note_cover=note.get('cover', {}).get('url'), 86 | liked_count=note.get('interact_info', {}).get('liked_count'), 87 | user_name=user_name, 88 | user_id=note.get('user', {}).get('user_id'), 89 | avatar=note.get('user', {}).get('avatar') 90 | ) 91 | 92 | 93 | if __name__ == "__main__": 94 | 
GetHomeFeed(0).start() 95 | -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/types/QueryJsonType.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict, Literal 2 | 3 | 4 | class QueryTopicNote(TypedDict): 5 | """ 6 | 话题请求类型 7 | """ 8 | page_size: int # 每页大小 9 | sort: Literal['hot', 'time'] # 话题类型 10 | page_id: str # 话题的id 11 | cursor: str # 话题位移 12 | sid: str # 暂时不清楚 13 | 14 | 15 | class QueryHomeFeedNote(TypedDict): 16 | """ 17 | 首页请求类型 18 | """ 19 | category: Literal[ 20 | 'homefeed_recommend', 'homefeed.fashion_v3', 'homefeed.food_v3', 'homefeed.cosmetics_v3', 21 | 'homefeed.movie_and_tv_v3', 'homefeed.career_v3', 'homefeed.love_v3', 22 | 'homefeed.household_product_v3','homefeed.gaming_v3', 'homefeed.travel_v3', 'homefeed.fitness_v3' 23 | ] 24 | cursor_score: str 25 | num: int 26 | refresh_type: int 27 | note_index: Literal[29] 28 | unread_begin_note_id: Literal[''] 29 | unread_end_note_id: Literal[''] 30 | unread_note_count: Literal[0] 31 | 32 | 33 | class QueryUserInfo(TypedDict): 34 | """ 35 | 用户信息请求类型 36 | """ 37 | pass 38 | 39 | 40 | class QueryNoteDetail(TypedDict): 41 | """ 42 | 笔记详情请求类型 43 | """ 44 | pass 45 | 46 | 47 | class QueryUserNote(TypedDict): 48 | """ 49 | 用户笔记请求类型 50 | """ 51 | pass 52 | 53 | 54 | class QueryNoteComment(TypedDict): 55 | """ 56 | 笔记评论请求类型 57 | """ 58 | note_id: str 59 | cursor: str 60 | 61 | 62 | class QueryNoteSubComment(TypedDict): 63 | """ 64 | 笔记评论请求类型 65 | """ 66 | note_id: str 67 | cursor: str 68 | root_comment_id: str 69 | num: str 70 | -------------------------------------------------------------------------------- /feapder篇/air-spider/RedBook/types/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/feapder篇/air-spider/RedBook/types/__init__.py -------------------------------------------------------------------------------- /基础篇/lxml&re/4k图片爬取.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | import urllib3 # 禁用安全请求警告,当目标使用htpps时使用 4 | import os 5 | 6 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 7 | 8 | 9 | # 解决爬取网页出现中文乱码的情况 10 | def rebuilt_Language(url, headers): 11 | response = requests.get(url=url, headers=headers, verify=False) 12 | # response.encoding = response.apparent_encoding 13 | return response 14 | 15 | 16 | if __name__ == "__main__": 17 | # UA伪装 18 | headers = { 19 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 20 | } 21 | # 建立一个文件夹存储照片 22 | i = -1 23 | if not os.path.exists('./picLibs'): 24 | os.mkdir('./picLibs') 25 | # 设置一个通用的url 26 | url = 'https://pic.netbian.com/4kmeinv/index_%d.html' 27 | pageNum = 1 28 | src_list = [] # 存储图片的src 29 | img_name_list = [] # 存储图片的名字 30 | for pageNum in range(1, 3): 31 | new_url = format(url % pageNum) 32 | page_text = rebuilt_Language(url=new_url, headers=headers).text 33 | tree = etree.HTML(page_text) 34 | # 解析src的属性值,解析alt属性值 35 | li_list = tree.xpath('//div[@class="wrap clearfix"]//li') 36 | for li in li_list: 37 | src = ' https://pic.netbian.com' + li.xpath('./a/img/@src')[0] 38 | src_list.append(src) 39 | img_name = li.xpath('./a/img/@alt')[0] + '.jpg' 40 | # 解决中文乱码的方法 41 | img_name = img_name.encode('iso-8859-1').decode('gbk') 42 | 
img_name_list.append(img_name) 43 | # 请求图片并持续化存储 44 | for img_url in src_list: 45 | i = i + 1 46 | img_data = requests.get(url=img_url, headers=headers).content 47 | img_path = 'picLibs/' + img_name_list[i] 48 | with open(img_path, 'wb') as fp: 49 | fp.write(img_data) 50 | print(img_name_list[i] + '下载成功!') 51 | -------------------------------------------------------------------------------- /基础篇/lxml&re/58同城分页爬取.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | 4 | if __name__ == '__main__': 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 7 | } 8 | # 设置一个通用的url 9 | url = 'https://sh.58.com/ershoufang/p%d/?PGTID=0d30000c-0000-2e04-d18a-9af183e2d6a4&ClickID=1' 10 | pageNum = 1 11 | fp = open('58.txt', 'w', encoding='utf-8') 12 | for pageNum in range(1, 9): 13 | new_url = format(url % pageNum) # 拼接成完整的url 14 | page_text = requests.get(url=new_url, headers=headers).text 15 | tree = etree.HTML(page_text) 16 | tongji_list = tree.xpath('//section[@class="list"]/div') 17 | for li in tongji_list: 18 | title = li.xpath('./a/div[2]//h3/text()')[0] 19 | print(title) 20 | fp.write(title + '\n') 21 | print('over!') -------------------------------------------------------------------------------- /基础篇/lxml&re/GetFakeUA.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | 4 | headers = { 5 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 6 | } 7 | 8 | url = 'https://useragentstring.com/pages/useragentstring.php?name=Chrome' 9 | 10 | resp = requests.get(url=url, headers=headers).text 11 | 12 | tree = etree.HTML(resp) 13 | 14 | ul_list = tree.xpath('//*[@id="liste"]/ul') 15 | 16 | USER_AGENT = [] 17 | 18 | fp = open('./fake_UA.txt', 'a', encoding='utf-8') 19 | 20 | for ul in ul_list: 21 | UA = ul.xpath('./li/a/text()') 22 | for i in range(1, len(UA)): 23 | ua = '"' + UA[i] + '",\n' 24 | print(ua) 25 | fp.write(ua) 26 | 27 | fp.close() 28 | 29 | -------------------------------------------------------------------------------- /基础篇/lxml&re/bs4案例.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import urllib3 # 禁用安全请求警告,当目标使用htpps时使用 4 | 5 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 6 | 7 | 8 | # 解决爬取网页出现中文乱码的情况 9 | def rebuilt_Language(url, headers): 10 | response = requests.get(url=url, headers=headers, verify=False) 11 | response.encoding = response.apparent_encoding 12 | return response 13 | 14 | 15 | # 爬取三国演义小说所有的章节标题和章节内容 16 | if __name__ == "__main__": 17 | headers = { 18 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 19 | } 20 | url = 'https://www.shicimingju.com/book/sanguoyanyi.html' 21 | page_text = rebuilt_Language(url, headers).text 22 | # 创建BeautifulSoup对象 23 | soup = BeautifulSoup(page_text, 'lxml') 24 | li_list = soup.select('.book-mulu > ul >li') 25 | fp = open('./sanguo.txt', 'w', encoding='utf-8') 26 | for li in li_list: 27 | title = li.a.string 28 | detail_url = 'https://www.shicimingju.com' + li.a['href'] 29 | detail_page_text = rebuilt_Language(detail_url, headers).text 30 | # 解析详情页相关章节内容 31 | detail_soup = BeautifulSoup(detail_page_text, 'lxml') 
32 | div_tag = detail_soup.find('div', class_='chapter_content') 33 | content = div_tag.text 34 | fp.write(title + ":" + content + "\n") 35 | print(title, '爬取成功') 36 | -------------------------------------------------------------------------------- /基础篇/lxml&re/bs4解析基础.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | if __name__ == "__main__": 5 | # 将本地的html文件中的数据加载到该对象中 6 | fp = open('test.html', 'r', encoding='utf-8') 7 | soup = BeautifulSoup(fp, 'lxml') 8 | # print(soup) 9 | # print(soup.a) # soup.tagName 返回的是html中第一次出现的tagName标签 10 | # print(soup.find('div')) # 相当于soup.div 11 | # print(soup.find('div', class_='song')) 12 | # print(soup.find_all('a')) 13 | # print(soup.select('.tang')) 14 | # print(soup.select('.tang > ul > li > a')[0]) 15 | # print(soup.select('.tang > ul a')[0]) 16 | # print(soup.select('.tang > ul a')[0].text) 17 | tag = soup.find('div', class_='song') 18 | print(tag.text) -------------------------------------------------------------------------------- /基础篇/lxml&re/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 测试bs4 6 | 7 | 8 |
[test.html: the HTML markup was lost during extraction; only text fragments survive — the page title "测试bs4", "百里守约", "李清照", "王安石", "苏轼", "柳宗元", "this is span", "宋朝是最强大的王朝,不是军队的强大,而是经济很强大,国民都很有钱", "总为浮云能避日,长安不见使人愁". The div/ul/li structure (classes "song" and "tang", referenced by xpath基础.py, bs4解析基础.py and bs4案例.py) is not recoverable.]
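Since the markup of test.html is lost, the structure the parsing scripts expect can only be inferred from their selectors. The snippet below is a hypothetical minimal stand-in — not the original file — that mirrors the div/ul/li layout and the "song" / "tang" classes those scripts query; every tag, URL and attribute value in it is invented for illustration.

```python
# Hypothetical stand-in for the stripped test.html -- NOT the original file.
# It only reproduces the structure that xpath基础.py and bs4解析基础.py query:
# a <div class="song"> with <p>/<img> children and a <div class="tang"> with <ul><li><a>.
from lxml import etree
from bs4 import BeautifulSoup

html = """
<html>
  <head><title>测试bs4</title></head>
  <body>
    <div class="song">
      <p>李清照</p><p>王安石</p><p>苏轼</p><p>柳宗元</p>
      <img src="http://example.com/cover.jpg" alt="placeholder"/>
    </div>
    <div class="tang">
      <ul>
        <li><a href="http://example.com/1">总为浮云能避日,长安不见使人愁</a></li>
      </ul>
    </div>
  </body>
</html>
"""

tree = etree.HTML(html)
print(tree.xpath('//div[@class="song"]/p/text()'))   # poet names
print(tree.xpath('//div[@class="song"]/img/@src'))   # image src

soup = BeautifulSoup(html, 'lxml')
print(soup.select('.tang > ul > li > a')[0].text)    # first link text
print(soup.find('div', class_='song').text)          # all text inside .song
```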
34 | 35 | -------------------------------------------------------------------------------- /基础篇/lxml&re/xpath基础.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | 3 | if __name__ == "__main__": 4 | # 实例化一个etree对象 5 | tree = etree.parse('./test.html') 6 | # r = tree.xpath('/html//title') 7 | # r = tree.xpath('//div[@class="song"]') 8 | # r = tree.xpath('//div[@class="song"]/p[3]') 9 | # r = tree.xpath('//div[@class="tang"]/ul/li[4]/a/text()') 10 | # r = tree.xpath('//div[@class="tang"]//text()') 11 | # r = tree.xpath('//div[@class="song"]/img/@src') 12 | r = tree.xpath('//div[@class="song"]/p/text()') 13 | print(r) 14 | -------------------------------------------------------------------------------- /基础篇/lxml&re/正则练习.py: -------------------------------------------------------------------------------- 1 | import re 2 | # 提取出python 3 | key = "java python c++ php" 4 | s = re.findall('python', key)[0] 5 | print(s) 6 | # key = 'https://scpic.chinaz.net/files/default/imgs/2023-01-04/610de886ffc6b37d_s.jpg' 7 | # s = re.sub('_s', '', key) 8 | # print(s) 9 | # 提取出hello world 10 | # key = "

<stripped tag>hello world</stripped tag>" 11 | # s = re.findall('<stripped tag>(.*)</stripped tag>
', key)[0] 12 | # print(s) 13 | # 提取出170 14 | # string = '我喜欢身高为170的女生' 15 | # s = re.findall('\d+', string)[0] 16 | # print(s) 17 | # 提取出http:// 和 https:// 18 | # key = 'http://www.baidu.com and https://dong.com' 19 | # s = re.findall('https?://', key) 20 | # print(s) 21 | # 提取出hello 22 | # key = 'lalalahellohahaha' 23 | # s = re.findall('<[Hh][Tt][mM][lL]>(.*)', key) 24 | # print(s) 25 | # 提取出hit. 26 | # key = 'bobo@hit.edu.cn' 27 | # s = re.findall('h.*?\.', key)[0] 28 | # print(s) 29 | # 提取出saas 和 sas 30 | # key = 'saas and sas and saaas' 31 | # s = re.findall('sa{1,2}s', key) 32 | # print(s) 33 | -------------------------------------------------------------------------------- /基础篇/lxml&re/正则解析.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | import os 4 | 5 | # 爬取图片 6 | if __name__ == "__main__": 7 | # 创建一个文件夹,用来保存所有的图片 8 | if not os.path.exists('./imgLibs'): 9 | os.mkdir('./imgLibs') 10 | url = 'https://www.douban.com/' 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 13 | } 14 | # 使用通用爬虫对url对应的一整张页面进行爬取 15 | page_text = requests.get(url=url, headers=headers).text 16 | # 使用聚焦爬虫将页面中所有的图片进行解析、提取 17 | ex = '
.*?.*?</div' 18 | img_src_list = re.findall(ex, page_text, re.S) 19 | # print(img_src_list) 20 | for src in img_src_list: 21 | # 将图片信息以二进制存储 22 | img_data = requests.get(url=src, headers=headers).content 23 | # 生成图片名称 24 | img_name = src.split('/')[-1] 25 | imgPath = './imgLibs/' + img_name 26 | with open(imgPath, 'wb') as fp: 27 | fp.write(img_data) 28 | print(img_name, '下载成功') 29 | -------------------------------------------------------------------------------- /基础篇/lxml&re/简历爬取.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | import os 4 | 5 | if __name__ == '__main__': 6 | # UA伪装 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36' 9 | } 10 | url0 = 'https://sc.chinaz.com/jianli/free.html' # 访问第一页的链接,这里因为直接用free_1无法打开网页 11 | url = 'https://sc.chinaz.com/jianli/free_%d.html' 12 | pageNum = 1 13 | 14 | download_list = [] 15 | download_name_list = [] 16 | # 新建文件夹可持续化存储 17 | if not os.path.exists('./CV_moban'): 18 | os.mkdir('./CV_moban') 19 | # 分页爬取 20 | for pageNum in range(1, 3): 21 | if pageNum == 1: 22 | new_url = url0 23 | else: 24 | new_url = format(url % pageNum) 25 | # 实例化对象的构建 26 | page_text = requests.get(url=new_url, headers=headers).text 27 | tree = etree.HTML(page_text) 28 | # 爬取需要下载的页面信息 29 | CV_infor_list = tree.xpath('//div[@class="main_list jl_main"]/div') 30 | for cv in CV_infor_list: 31 | CV_src = cv.xpath('./a/@href')[0] 32 | CV_text = requests.get(url=CV_src, headers=headers).text 33 | ctree = etree.HTML(CV_text) 34 | # 爬取简历下载链接 35 | download_src = ctree.xpath('//div[@class="down_wrap"]/div[2]/ul/li/a/@href')[0] 36 | download_list.append(download_src) 37 | # 爬取简历名称 38 | download_name = ctree.xpath('//div[@class="bgwhite"]/div//h1/text()')[0] 39 | download_name = download_name.encode('iso-8859-1').decode('utf-8') + '.rar' 40 | download_name_list.append(download_name) 41 | 42 | # 批量下载简历模板 43 | i = -1 44 | for cvv in download_list: 45 | i = i + 1 46 | cvv = download_list[i] 47 | cv_content = requests.get(url=cvv, headers=headers).content 48 | cv_path = 'CV_moban/' + download_name_list[i] 49 | with open(cv_path, 'wb') as fp: 50 | fp.write(cv_content) 51 | print(download_name_list[i] + '下载完成!') 52 | -------------------------------------------------------------------------------- /基础篇/request/01-Request.py: -------------------------------------------------------------------------------- 1 | # requests模块的使用 2 | import requests 3 | 4 | if __name__ == "__main__": 5 | # 指定url 6 | url = 'https://wz.sun0769.com/political/index/politicsNewest' 7 | # 发起请求 8 | # get方法会返回一个响应对象 9 | response = requests.get(url=url) 10 | # 获取响应数据 11 | page_txt = response.text 12 | # 持久化存储 13 | with open('./sogou.html', 'w', encoding='utf-8') as fp: 14 | fp.write(page_txt) 15 | print('爬取数据结束!') 16 | 17 | -------------------------------------------------------------------------------- /基础篇/request/02-(UA)网页采集器.py: -------------------------------------------------------------------------------- 1 | # UA检测(反爬机制):门户网站的服务器会检测对应请求的载体身份标识,如果检测到请求的载体身份为某一浏览器,说明该请求是一个正常请求。 2 | # 但是如果检测到不是某一浏览器,则表示该请求为非正常请求。服务器端拒绝该次请求。 3 | # UA:User-Agent(请求载体的身份标识) 4 | # UA伪装:让爬虫身份标识伪装成浏览器 5 | import requests 6 | if __name__ == '__main__': 7 | # UA伪装 8 | headers = { 9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 10 | } 11 | url = 'https://www.sogou.com/web?' 
12 | # 处理url携带的参数:封装到字典中 13 | kw = input('enter a word:') 14 | param = { 15 | 'query': kw, 16 | } 17 | # 对指定的url发起的请求对应的url是携带参数的,并且请求过程中处理了参数 18 | response = requests.get(url=url, params=param, headers=headers) 19 | 20 | page_text = response.text 21 | fileName = kw+ '.html' 22 | with open(fileName, 'w', encoding='utf-8') as fp: 23 | fp.write(page_text) 24 | print(fileName, '保存成功!') 25 | -------------------------------------------------------------------------------- /基础篇/request/03-(POST)百度翻译.py: -------------------------------------------------------------------------------- 1 | # post请求(携带了参数) 2 | # 响应数据是一组json数据 3 | import requests 4 | import json 5 | if __name__ == '__main__': 6 | # 指定url 7 | post_url = 'https://fanyi.baidu.com/sug' 8 | # UA伪装 9 | headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 11 | } 12 | # post请求参数处理 13 | word = input('enter a word:') 14 | data = { 15 | 'kw': word 16 | } 17 | # 请求发送 18 | response = requests.post(url=post_url, data=data, headers=headers) 19 | # 获取响应数据:json方法返回的是obj(如果确认响应数据是json类型的,才可以使用jason() 20 | dic_obj = response.json() 21 | # 持久化存储 22 | filename = word + '.json' 23 | fp = open(filename, 'w', encoding='utf-8') 24 | json.dump(dic_obj, fp=fp, ensure_ascii=False) 25 | 26 | print('over!!') 27 | -------------------------------------------------------------------------------- /基础篇/request/04-豆瓣电影爬取.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | if __name__ == '__main__': 4 | # 指定url 5 | url = 'https://movie.douban.com/j/chart/top_list' 6 | param = { 7 | 'type': '24', 8 | 'interval_id': '100:90', 9 | 'action': '', 10 | 'start': '1', 11 | 'limit': '20', 12 | } 13 | # UA伪装 14 | headers = { 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 16 | } 17 | response = requests.get(url=url, params=param, headers=headers) 18 | list_data = response.json() 19 | print(list_data) 20 | fp = open('./douban.json', 'w', encoding='utf-8') 21 | json.dump(list_data, fp=fp, ensure_ascii=False) 22 | 23 | print('Over!!') 24 | -------------------------------------------------------------------------------- /基础篇/request/05-肯德基餐厅位置查询.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | if __name__ == '__main__': 4 | url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword' 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' 7 | } 8 | 9 | place = input('enter a place:') 10 | page = 1 # 从第1页开始 11 | fileName = place + 'KFC餐厅位置信息' + '.json' 12 | for i in range(0, 20): # 设置一个较大参数直到爬完所有页码 13 | param = { 14 | 'cname': '', 15 | 'pid': '', 16 | 'keyword': place, # 查询地点 17 | 'pageIndex': page, # 查询页码 18 | 'pageSize': '10', # 每页最多显示10个 19 | } 20 | response = requests.post(url=url, params=param, headers=headers) 21 | page_text = response.text 22 | # print(page_text) 23 | with open(fileName, 'a', encoding='utf-8') as fp: 24 | json.dump(page_text, fp=fp, ensure_ascii=False) 25 | fp.write('\n') # 注意这里还是在for循环当中,每爬取完一页内容,就敲个回车 26 | page = page + 1 # 佛如循环的循环变量,注意前文默认为1 27 | print('over!!!') -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/bossjob/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/bossjob/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/bossjob/bossjob/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/bossjob/fakeCookie.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | COOKIE_LIST = [ 4 | 'wd_guid=544d13f9-f072-4fdc-9989-84452f1ecd52; historyState=state; _bl_uid=XtlO5cqLjv05qpj3t0d0nna8msI4; lastCity=101020100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1673095377,1673165470,1673257271,1673333037; boss_login_mode=sms; __fid=c58f56b0daac21ec5273e9b4b258f537; wt2=DY4IX_Pe18l5jPqD0AYgnA-G9UnTNtDaZ_zMhCpK7UovHjn5bKxYiZ6NtwTrfsFzsgpxFtIBCopvwd7HdvXTGrg~~; wbg=0; __zp_stoken__=887aefCE3dDAxC0wecFokLmdqeARKZz80V3cWbnglEDsONSs%2FVCMzL295aWdxVWw6Ry4PehcuLyROcX4mdTpZXyFXVEtiREADYGooaVQmYhwcSUtZVAQoNVpLLXZRQkdxBRc9G0QGUFhyNA0%3D; geek_zp_token=V1RN0kEOL031ZiVtRvyB4bLCuw6zrQxCo~; __l=l=%2Fwww.zhipin.com%2Fshanghai%2F&r=&g=&s=3&friend_source=0&s=3&friend_source=0; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1673349533; __c=1673333037; __a=68265253.1672926940.1673257271.1673333037.431.9.106.431' 5 | ] 6 | 7 | 8 | def cookie_dic(): 9 | cookie_string = random.choice(COOKIE_LIST) 10 | cookie_dict = {} 11 | for kv in cookie_string.split(';'): 12 | k = kv.split('=')[0] 13 | v = kv.split('=')[1] 14 | cookie_dict[k] = v 15 | return cookie_dict 16 | -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/bossjob/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class BossjobItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pay = scrapy.Field() # 薪资 13 | job_name = scrapy.Field() # 岗位 14 | detail_url = scrapy.Field() # 职位详情链接 15 | company_name = scrapy.Field() # 公司名称 16 | requirement = scrapy.Field() # 要求 17 | detail = scrapy.Field() 18 | -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/bossjob/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | import pymysql 10 | 11 | 12 | class BossjobPipeline: 13 | def process_item(self, item, spider): 14 | print(item['detail']) 15 | return item 16 | 17 | 18 | class mysqlPipeLine(object): 19 | # 数据库连接 20 | conn = None 21 | cursor = None 22 | 23 | def open_spider(self, spider): 24 | self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Spider', 25 | charset='utf8') 26 | 27 | def process_item(self, item, spider): 28 | self.cursor = 
self.conn.cursor() 29 | 30 | try: 31 | self.cursor.execute('insert into bossjob values("%s", "%s", "%s", "%s", "%s")' % ( 32 | item["company_name"], item["detail_url"], item["job_name"], item["pay"], item["requirement"])) 33 | self.conn.commit() 34 | print('成功插入', item['job_name'], '的工作信息到数据库中!') 35 | except Exception as e: 36 | print(e) 37 | self.conn.rollback() 38 | 39 | return item 40 | 41 | def close_spider(self, spider): 42 | if self.cursor: 43 | self.cursor.close() 44 | if self.conn: 45 | self.conn.close() 46 | -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/bossjob/requset.py: -------------------------------------------------------------------------------- 1 | from scrapy import Request 2 | 3 | 4 | class SeleniumRequest(Request): 5 | pass 6 | -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/bossjob/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/bossjob/spiders/boss.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import scrapy 4 | from lxml import etree 5 | 6 | from ..items import BossjobItem 7 | 8 | 9 | class BossSpider(scrapy.Spider): 10 | name = 'boss' 11 | 12 | def start_requests(self): 13 | for pageNum in range(51, 90): 14 | url = f'https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json?page={pageNum}&city=101020100&query=' 15 | yield scrapy.Request(url=url, callback=self.parse) 16 | 17 | def parse(self, response, **kwargs): 18 | res = json.loads(response.text) 19 | it = {'html': res['zpData']['html']} 20 | tree = etree.HTML(it['html']) 21 | li_list = tree.xpath('//li') 22 | 23 | for li in li_list: 24 | item = BossjobItem() 25 | job_name = li.xpath('./a/div[1]/span[1]/text()')[0] 26 | item['job_name'] = job_name 27 | detail_url = 'https://www.zhipin.com' + li.xpath('./a/@href')[0] 28 | item['detail_url'] = detail_url 29 | pay = li.xpath('a/div[1]/span[2]/text()')[0] 30 | item['pay'] = pay 31 | company_name = li.xpath('./a/div[2]/span[1]/text()')[0] 32 | item['company_name'] = company_name 33 | requirement = li.xpath('./a/div[3]//text()') 34 | re = '' 35 | for i in range(1, len(requirement)): 36 | re = re + requirement[i].strip() + ' ' 37 | item['requirement'] = re 38 | 39 | yield item 40 | -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = bossjob.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bossjob 12 | -------------------------------------------------------------------------------- /基础篇/scrapy/bossjob/vimm_chrome_proxyauth_plugin.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/bossjob/vimm_chrome_proxyauth_plugin.zip 
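A side note on the mysqlPipeLine above: it interpolates item fields straight into the INSERT string with `%`, which breaks as soon as a field contains a double quote and is open to SQL injection. A minimal sketch of the same insert using pymysql parameter binding follows; the five-column bossjob table is assumed from the code above, and this helper is not part of the original project.

```python
import pymysql

# Sketch (not part of the original project): the same insert as mysqlPipeLine,
# but letting pymysql bind the parameters instead of formatting the SQL string.
def insert_job(conn: pymysql.connections.Connection, item: dict) -> None:
    sql = "INSERT INTO bossjob VALUES (%s, %s, %s, %s, %s)"
    with conn.cursor() as cursor:
        try:
            cursor.execute(sql, (
                item["company_name"], item["detail_url"], item["job_name"],
                item["pay"], item["requirement"],
            ))
            conn.commit()
        except Exception as exc:
            conn.rollback()
            print("insert failed:", exc)
```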
-------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/caipiao/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/caipiao/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/caipiao/caipiao/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/caipiao/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class CaipiaoItem(scrapy.Item): 10 | qihao = scrapy.Field() 11 | red_ball = scrapy.Field() 12 | blue_ball = scrapy.Field() 13 | -------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/caipiao/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | import random 6 | from time import sleep 7 | 8 | from scrapy import signals 9 | 10 | # useful for handling different item types with a single interface 11 | from itemadapter import is_item, ItemAdapter 12 | from scrapy.http import HtmlResponse 13 | 14 | from .fake_useragent import USER_AGENTS 15 | 16 | 17 | class CaipiaoDownloaderMiddleware: 18 | # Not all methods need to be defined. If a method is not defined, 19 | # scrapy acts as if the downloader middleware does not modify the 20 | # passed objects. 
21 | 22 | def process_request(self, request, spider): 23 | # UA伪装 24 | request.headers['User-Agent'] = random.choice(USER_AGENTS) 25 | return None 26 | 27 | def process_response(self, request, response, spider): 28 | bro = spider.bro 29 | bro.get(request.url) 30 | sleep(0.5) 31 | click = bro.find_element_by_xpath('//*[@id="link248"]/img').click() 32 | start = bro.find_element_by_id('from') 33 | start.clear() 34 | start.send_keys('16001') 35 | end = bro.find_element_by_id('to') 36 | end.clear() 37 | end.send_keys('23004') 38 | find = bro.find_element_by_id('link176').click() 39 | page_text = bro.page_source 40 | new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request) 41 | 42 | return new_response 43 | 44 | def process_exception(self, request, exception, spider): 45 | 46 | pass 47 | -------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/caipiao/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | import pymysql 10 | 11 | ''' 12 | 存储数据的方案: 13 | 1、数据要存在csv文件中 14 | 2、数据要存在mysql数据库中 15 | 3、数据要存在mongodb数据库中 16 | 4.文件的存储 17 | ''' 18 | 19 | 20 | class CaipiaoPipeline: 21 | 22 | def open_spider(self, spider): 23 | print('开始存储!') 24 | self.f = open('./双色球.csv', mode='w', encoding='utf-8') 25 | self.f.write("期数,红球号码,蓝球号码\n") 26 | 27 | def close_spider(self, spider): 28 | print('存储完毕!') 29 | if self.f: 30 | self.f.close() 31 | 32 | def process_item(self, item, spider): 33 | # print(item) 34 | self.f.write(f"{item['qihao']},{' '.join(item['red_ball'])},{item['blue_ball']}\n") 35 | return item 36 | 37 | 38 | class mySQLPipeline: 39 | 40 | def open_spider(self, spider): 41 | print('开始存储!') 42 | self.conn = pymysql.Connect( 43 | host="localhost", 44 | port=3306, 45 | user="root", 46 | password="", 47 | database="spider" 48 | ) 49 | 50 | def close_spider(self, spider): 51 | print('存储完毕!') 52 | if self.conn: 53 | self.conn.close() 54 | 55 | def process_item(self, item, spider): 56 | cur = self.conn.cursor() 57 | sql = "insert into caipiao values(%s, %s, %s)" 58 | try: 59 | cur.execute(sql, (item['qihao'], item['red_ball'], item['blue_ball'])) 60 | self.conn.commit() 61 | except Exception as e: 62 | print(e) 63 | self.conn.rollback() 64 | -------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/caipiao/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for caipiao project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'caipiao' 11 | 12 | SPIDER_MODULES = ['caipiao.spiders'] 13 | NEWSPIDER_MODULE = 'caipiao.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'caipiao (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | LOG_LEVEL = 'WARNING' 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 48 | # SPIDER_MIDDLEWARES = { 49 | # 'caipiao.middlewares.CaipiaoSpiderMiddleware': 543, 50 | # } 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 54 | DOWNLOADER_MIDDLEWARES = { 55 | 'caipiao.middlewares.CaipiaoDownloaderMiddleware': 543, 56 | } 57 | 58 | # Enable or disable extensions 59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | 'caipiao.pipelines.CaipiaoPipeline': 300, 68 | } 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | -------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/caipiao/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This 
package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/caipiao/spiders/seq.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from ..items import CaipiaoItem 3 | from selenium import webdriver 4 | from selenium.webdriver import ChromeOptions 5 | 6 | class SeqSpider(scrapy.Spider): 7 | name = 'seq' 8 | # allowed_domains = ['www.xxx.com'] 9 | start_urls = ['https://datachart.500.com/ssq/'] 10 | 11 | def __init__(self, **kwargs): 12 | # 实现让selenium规避被检测到的风险 13 | super().__init__(**kwargs) 14 | option = ChromeOptions() 15 | option.add_experimental_option('excludeSwitches', ['enable-automation']) 16 | option.add_experimental_option('excludeSwitches', ['enable-logging']) 17 | option.add_argument("--no-sandbox") 18 | option.add_argument("--disable-dev-shm-usage") 19 | option.add_argument("--window-size=1920,1080") # 建议设置窗口大小 20 | option.add_argument('--headless') 21 | option.add_argument('--disable-gpu') 22 | # option.add_argument('blink-settings=imagesEnabled=false') 23 | self.bro = webdriver.Chrome(executable_path='D:\爬虫\selenium\chromedriver.exe', options=option) 24 | 25 | def closed(self, spider): 26 | self.bro.quit() 27 | 28 | def parse(self, response): 29 | tr_list = response.xpath('//*[@id="tdata"]/tr') 30 | for tr in tr_list: 31 | item = CaipiaoItem() 32 | # 过滤掉没用的标签 33 | if tr.xpath('./@class').extract_first() == 'tdbck': 34 | continue 35 | qishu = tr.xpath('./td[1]/text()').extract_first().strip() 36 | # 也可以用xpath: red_ball = tr.xpath("./td[@class="chartBall01"]/text()").extract() 37 | red_ball = tr.css(".chartBall01::text").extract() 38 | blue_ball = tr.css(".chartBall02::text").extract_first() 39 | item['qihao'] = qishu 40 | item['red_ball'] = red_ball 41 | item['blue_ball'] = blue_ball 42 | 43 | yield item 44 | -------------------------------------------------------------------------------- /基础篇/scrapy/caipiao/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = caipiao.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = caipiao 12 | -------------------------------------------------------------------------------- /基础篇/scrapy/imgsPro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/imgsPro/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/imgsPro/imgsPro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/imgsPro/imgsPro/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/imgsPro/imgsPro/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 
8 | 9 | class ImgsproItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | img_name = scrapy.Field() 13 | img_src = scrapy.Field() 14 | -------------------------------------------------------------------------------- /基础篇/scrapy/imgsPro/imgsPro/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | import scrapy 9 | from itemadapter import ItemAdapter 10 | from scrapy.pipelines.images import ImagesPipeline 11 | 12 | 13 | class ImgsproPipeline: 14 | def process_item(self, item, spider): 15 | print(item) 16 | return item 17 | 18 | 19 | class imgsPipeLine(ImagesPipeline): 20 | 21 | # 根据图片地址进行图片数据的请求 22 | def get_media_requests(self, item, info): 23 | yield scrapy.Request(item['img_src']) 24 | 25 | # 指定图片存储的路径 26 | def file_path(self, request, response=None, info=None, *, item): 27 | imgName = item['img_name'] 28 | return imgName 29 | 30 | def item_completed(self, results, item, info): 31 | return item # 返回给下一个即将被执行的管道类 32 | -------------------------------------------------------------------------------- /基础篇/scrapy/imgsPro/imgsPro/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /基础篇/scrapy/imgsPro/imgsPro/spiders/img.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from ..items import ImgsproItem 3 | import re 4 | 5 | 6 | class ImgSpider(scrapy.Spider): 7 | name = 'img' 8 | # allowed_domains = ['www.xxx.com'] 9 | start_urls = ['https://sc.chinaz.com/tupian//'] 10 | page_num = 2 11 | 12 | def parse(self, response): 13 | 14 | div_list = response.xpath('/html/body/div[3]/div[2]/div') 15 | for div in div_list: 16 | item = ImgsproItem() 17 | img_name = div.xpath('./img/@alt').extract() 18 | img_name = ''.join(img_name) + '.jpg' 19 | item['img_name'] = img_name 20 | img_src = div.xpath('./img/@data-original').extract() 21 | img_src = 'https:' + ''.join(img_src) 22 | # 去掉_s以获取高清原图,如果链接里面有_s是缩略图 23 | s = re.sub('_s', '', img_src) 24 | item['img_src'] = s 25 | 26 | yield item 27 | # 另一种分页操作 28 | if self.page_num <= 3: 29 | new_url = f'https://sc.chinaz.com/tupian/index_{self.page_num}.html' 30 | self.page_num += 1 31 | 32 | yield scrapy.Request(new_url, callback=self.parse) -------------------------------------------------------------------------------- /基础篇/scrapy/imgsPro/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = imgsPro.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = imgsPro 12 | -------------------------------------------------------------------------------- /基础篇/scrapy/paper/__init__.py: -------------------------------------------------------------------------------- 
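Note on the imgsPro project: imgsPipeLine above subclasses Scrapy's ImagesPipeline, which only does anything when it is registered in ITEM_PIPELINES and given a storage root, and it also needs Pillow installed. imgsPro's settings.py is not reproduced in this excerpt, so the snippet below is only a hedged sketch of the two settings the pipeline relies on; the directory name is an illustrative assumption, not the project's actual value.

# Hypothetical excerpt from imgsPro/settings.py (that file is not shown in this excerpt).
ITEM_PIPELINES = {
    'imgsPro.pipelines.imgsPipeLine': 300,
}
# Root directory for the downloaded images; './imgLibs' is an assumed value.
IMAGES_STORE = './imgLibs'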
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/paper/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/paper/paper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/paper/paper/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/paper/paper/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class PaperItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /基础篇/scrapy/paper/paper/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | from .fake_useragent import get_ua 11 | 12 | 13 | class PaperDownloaderMiddleware: 14 | 15 | def process_request(self, request, spider): 16 | # UA伪装 17 | headers = get_ua() 18 | request.headers['User-Agent'] = headers 19 | return None 20 | 21 | def process_response(self, request, response, spider): 22 | return response 23 | 24 | def process_exception(self, request, exception, spider): 25 | pass 26 | 27 | 28 | class CookieDownloaderMiddleware(object): 29 | def process_request(self, request, spider): 30 | cookie_dict = self.get_cookies() 31 | request.cookies = cookie_dict 32 | 33 | def get_cookies(self): 34 | # cookie_string = '' 35 | cookie_string = '' 36 | cookie_dict = {} 37 | for kv in cookie_string.split(';'): 38 | k = kv.split('=')[0] 39 | v = kv.split('=')[1] 40 | cookie_dict[k] = v 41 | return cookie_dict 42 | -------------------------------------------------------------------------------- /基础篇/scrapy/paper/paper/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class PaperPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /基础篇/scrapy/paper/paper/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for paper project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'paper' 11 | 12 | SPIDER_MODULES = ['paper.spiders'] 13 | NEWSPIDER_MODULE = 'paper.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'paper (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | LOG_LEVEL = 'WARNING' 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'paper.middlewares.PaperSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'paper.middlewares.PaperDownloaderMiddleware': 543, 57 | } 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'paper.pipelines.PaperPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /基础篇/scrapy/paper/paper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain 
the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /基础篇/scrapy/paper/paper/spiders/page.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class PageSpider(scrapy.Spider): 5 | name = 'page' 6 | # allowed_domains = ['www.xxx.com'] 7 | start_urls = ['https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919'] 8 | 9 | def start_requests(self): 10 | url = 'https://passport.17k.com/ck/user/login' 11 | username = '' 12 | password = '' 13 | 14 | # 发送post的方案 15 | yield scrapy.FormRequest( 16 | url=url, 17 | formdata={ 18 | 'loginName': username, 19 | 'password': password 20 | }, 21 | callback=self.parse 22 | ) 23 | 24 | def parse(self, response, **kwargs): 25 | yield scrapy.Request(url=self.start_urls[0], callback=self.detail_parse) 26 | 27 | def detail_parse(self, response): 28 | print(response.json()) -------------------------------------------------------------------------------- /基础篇/scrapy/paper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = paper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = paper 12 | -------------------------------------------------------------------------------- /基础篇/scrapy/sunPro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/sunPro/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/sunPro/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sunPro.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sunPro 12 | -------------------------------------------------------------------------------- /基础篇/scrapy/sunPro/sunPro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/sunPro/sunPro/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/sunPro/sunPro/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class SunproItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | number = scrapy.Field() 13 | title = scrapy.Field() 14 | status = scrapy.Field() 15 | content = scrapy.Field() 16 | city = scrapy.Field() 17 | time = scrapy.Field() 18 | 19 | # class DetailItem(scrapy.Item): 20 | # # define the fields for your item here like: 21 | # # name = scrapy.Field() 22 | # id = scrapy.Field() 23 | # content = scrapy.Field() 
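Side note on items.py above: it keeps a second DetailItem class commented out. If that class were re-enabled, a single pipeline can route the two item types with isinstance checks instead of comparing __class__.__name__ strings. The class below is only a sketch under that assumption and is not part of the project.

# Sketch only: assumes the commented-out DetailItem in items.py is re-enabled.
from sunPro.items import SunproItem, DetailItem


class RoutingPipeline:
    def process_item(self, item, spider):
        # isinstance survives renames and subclassing, unlike name-string checks
        if isinstance(item, DetailItem):
            print(item['id'], item['content'])
        elif isinstance(item, SunproItem):
            print(item['number'], item['title'])
        return item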
-------------------------------------------------------------------------------- /基础篇/scrapy/sunPro/sunPro/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | import pymysql 9 | from itemadapter import ItemAdapter 10 | 11 | 12 | # class SunproPipeline: 13 | # def process_item(self, item, spider): 14 | # # 如何判断item的类型 15 | # # 将数据写入数据库中,如何保证数据的一致性 16 | # if item.__class__.__name__ == 'DetailItem': 17 | # print(item['id'], item['content']) 18 | # else: 19 | # print(item['number'], item['title']) 20 | # return item 21 | 22 | 23 | class mysqlPipeLine(object): 24 | # 数据库连接 25 | conn = None 26 | cursor = None 27 | 28 | def open_spider(self, spider): 29 | self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Bossjob', charset='utf8') 30 | 31 | def process_item(self, item, spider): 32 | self.cursor = self.conn.cursor() 33 | 34 | try: 35 | self.cursor.execute('insert into new values("%s", "%s", "%s", "%s", "%s", "%s")' % 36 | (item['number'], item['title'], item['content'], item['status'], item['city'], item['time'])) 37 | self.conn.commit() 38 | print('成功插入编号为', item['number'], '的数据!') 39 | except Exception as e: 40 | print(e) 41 | print('error!') 42 | self.conn.rollback() 43 | 44 | return item 45 | 46 | def close_spider(self, spider): 47 | self.cursor.close() 48 | self.conn.close() 49 | -------------------------------------------------------------------------------- /基础篇/scrapy/sunPro/sunPro/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for sunPro project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'sunPro' 11 | 12 | SPIDER_MODULES = ['sunPro.spiders'] 13 | NEWSPIDER_MODULE = 'sunPro.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'sunPro (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | LOG_LEVEL = 'ERROR' 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'sunPro.middlewares.SunproSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 54 | DOWNLOADER_MIDDLEWARES = { 55 | 'sunPro.middlewares.RandomuaDownloaderMiddleware': 543, 56 | 'sunPro.middlewares.CookieDownloaderMiddleware': 400, 57 | 'sunPro.middlewares.SunproDownloaderMiddleware': 300, 58 | } 59 | 60 | # Enable or disable extensions 61 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'sunPro.pipelines.mysqlPipeLine': 200, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- 
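The settings above wire in sunPro.middlewares.RandomuaDownloaderMiddleware, but sunPro/middlewares.py is not part of this excerpt. As a rough sketch of what such a random user-agent middleware typically looks like (the USER_AGENTS pool here is an assumption; the real project keeps its list in fake_useragent.py):

# Illustrative sketch, not the project's actual sunPro/middlewares.py.
import random

# Assumed pool; the project draws its list from sunPro/fake_useragent.py.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
]


class RandomuaDownloaderMiddleware:
    def process_request(self, request, spider):
        # process_request hooks run from the lowest priority number to the
        # highest, so with the 300/400/543 values above this one runs last.
        request.headers['User-Agent'] = random.choice(USER_AGENTS)
        return None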
/基础篇/scrapy/sunPro/sunPro/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /基础篇/scrapy/sunPro/sunPro/spiders/sun.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import scrapy 4 | from scrapy.linkextractors import LinkExtractor 5 | from scrapy.spiders import CrawlSpider, Rule 6 | from selenium import webdriver 7 | from selenium.webdriver import ChromeOptions 8 | from ..items import SunproItem 9 | 10 | 11 | class SunSpider(CrawlSpider): 12 | name = 'sun' 13 | # allowed_domains = ['www.xxx.com'] 14 | start_urls = ['https://wz.sun0769.com/political/index/politicsNewest'] 15 | 16 | # 实例化一个浏览器对象 17 | def __init__(self, **kwargs): 18 | # 实现让selenium规避被检测到的风险 19 | super().__init__(**kwargs) 20 | option = ChromeOptions() 21 | option.add_experimental_option('excludeSwitches', ['enable-automation']) 22 | option.add_experimental_option('excludeSwitches', ['enable-logging']) 23 | option.add_argument("--no-sandbox") 24 | option.add_argument("--disable-dev-shm-usage") 25 | option.add_argument("--window-size=1920,1080") # 建议设置窗口大小 26 | option.add_argument('--headless') 27 | option.add_argument('--disable-gpu') 28 | option.add_argument('blink-settings=imagesEnabled=false') 29 | self.bro = webdriver.Chrome(executable_path='D:\爬虫\selenium\chromedriver.exe', options=option) 30 | 31 | def closed(self, spider): 32 | self.bro.quit() 33 | 34 | # 链接提取器: 根据指定规则(allow=r'正则表达式')进行指定链接提取 35 | link = LinkExtractor(allow=r'id=1&page=\d', restrict_xpaths='/html/body/div[2]/div[3]/div[3]/div/a') 36 | # link_detail = LinkExtractor(restrict_xpaths='/html/body/div[2]/div[3]/ul[2]/li/span[3]/a') 37 | 38 | rules = ( 39 | # 规则解析器: 将链接提取器提取到的链接进行指定规则(callback)的解析操作 40 | # follow=True: 可以将链接提取器继续作用到链接提取器提取到的链接所对应的页面中 41 | Rule(link, callback='parse_item', follow=True), 42 | # Rule(link_detail, callback='parse_detail'), 43 | ) 44 | 45 | # 解析投诉的编号和标题 46 | def parse_item(self, response): 47 | li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li') 48 | for li in li_list: 49 | item = SunproItem() 50 | number = li.xpath('./span[1]/text()').extract_first() 51 | item['number'] = number 52 | status = li.xpath('./span[2]/text()').extract_first().strip() 53 | item['status'] = status 54 | title = li.xpath('./span[3]/a/text()').extract_first() 55 | item['title'] = title 56 | detail_url = 'https://wz.sun0769.com' + li.xpath('./span[3]/a/@href').extract_first() 57 | 58 | yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item}) 59 | 60 | # 解析投诉的内容 61 | def parse_detail(self, response): 62 | item = response.meta['item'] 63 | content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre//text()').extract() 64 | content = ''.join(content) 65 | item['content'] = content 66 | city = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[2]/text()').extract_first() 67 | c = re.sub(' 来自:', '', city) 68 | C = re.sub(' ', '', c) 69 | item['city'] = C 70 | time = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[3]/text()').extract_first() 71 | item['time'] = time 72 | # print(item) 73 | yield item 74 | -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/__init__.py: 
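Aside on sun.py above: it forwards the half-filled item to parse_detail through request meta. Since Scrapy 1.7 the same handoff can use cb_kwargs, which keeps meta free for scheduler and middleware flags. The stripped-down spider below is only a sketch of that variant: it reuses sun.py's list xpath, simplifies the detail xpath to //pre//text(), and uses a plain dict instead of SunproItem so it stays self-contained.

import scrapy


class DetailSketchSpider(scrapy.Spider):
    # Sketch of the meta handoff in sun.py rewritten with cb_kwargs (Scrapy >= 1.7).
    name = 'detail_sketch'
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest']

    def parse(self, response):
        for li in response.xpath('/html/body/div[2]/div[3]/ul[2]/li'):
            item = {'number': li.xpath('./span[1]/text()').get()}
            detail_url = response.urljoin(li.xpath('./span[3]/a/@href').get())
            # cb_kwargs injects the item straight into the callback's signature
            yield scrapy.Request(detail_url, callback=self.parse_detail,
                                 cb_kwargs={'item': item})

    def parse_detail(self, response, item):
        item['content'] = ''.join(response.xpath('//pre//text()').getall())
        yield item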
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/wangyi/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/news.txt: -------------------------------------------------------------------------------- 1 | (1)加拿大将为乌克兰购买美制防空系统 俄方:荒谬: 2 | 3 | 来源:环球网【环球网报道 见习记者 李律杉】据路透社报道,美加两国元首在墨西哥城会晤后,加拿大总理特鲁多办公室周二(10日)发表声明称,加拿大将为乌克兰购买美国制造的“国家先进地对空导弹系统”(NASAMS)。报道披露,当天特鲁多和拜登正在墨西哥参加第十届北美领导人峰会,两人在支持乌克兰方面进行了单独会晤。在此期间,特鲁多告诉拜登,加拿大将为乌克兰购买美制地空导弹系统一事。“这是加拿大首次向乌克兰捐赠防空系统。”加拿大国防部长安妮塔·阿南德在推特上写道。她还表示,乌克兰防长列兹尼科夫10日早些时候在电话中告诉她,得到防空系统是乌克兰的首要任务。阿南德介绍称,NASAMS是一种中短程地面防空系统,可抵御无人机、导弹和飞机的攻击。对于加拿大这一援乌决定,俄罗斯驻加拿大大使奥列格·斯捷潘诺夫作出回应。据俄罗斯卫星通讯社报道,斯捷潘诺夫在得知此事表示,“特鲁多总理的内阁把钱花在(进一步)激化战争上,支持一个距离加拿大上千公里之外的非法政权,这看起来很荒谬。”“尤其荒谬的是,(这是)在加拿大目前国内还面临着各种问题的背景下(做出的决定)。”另外,根据加拿大总理办公室的声明,特鲁多和拜登还就加拿大皇家空军采购F-35战斗机一事展开讨论。据央视新闻报道,加拿大国防部长安妮塔·阿南德当地时间1月9日宣布,加拿大已经签署了购买F-35战机的最终合同,初期购买金额达190亿加元。据悉,这88架战机中的第一架将在2026年之前交付,而第一批F-35中队将在2029年之前投入使用。 4 | 5 | (35)台媒:57架次解放军军机进入台岛周边 "异常紧张": 6 | 7 | 来源:环球网【环球网报道】“解放军对台打击军演 57架次共机‘三面围台’ 我战机与地面飞弹紧盯”,中国人民解放军东部战区1月8日位台岛周边海空域组织诸军兵种联合战备警巡和实战化演练第二天,台湾中时新闻网以此为题渲染“气氛异常紧张”。台防务部门9日的说法宣称,自8日上午6时至9日上午6时止,“侦获”解放军军机57架次(其中28架次逾越“台海中线”)、军舰4艘次,持续在台湾海峡周边活动。8日夜,东部战区新闻发言人施毅陆军大校表示,当天中国人民解放军东部战区位台岛周边海空域组织诸军兵种联合战备警巡和实战化演练,重点演练对陆打击、对海突击等内容,旨在检验部队联合作战能力,坚决反击外部势力、“台独”分裂势力勾连挑衅行径。中时新闻网9日称,解放军军机“扰台”范围明显扩大且集中在8日夜间,台空军战机整夜不断紧急升空,地面导弹部队更是进入高度警戒。台军还声称,运用任务机、舰艇及岸基导弹系统“严密监控”与“应处”。中时新闻网还称,台各空军基地8日晚气氛异常紧张,从北到南甚至东部,各基地战机接连紧急起飞,架次比平常多,状况如去年大陆军演一般,不少住在基地周边的民众都感觉到一丝不寻常的气氛,直到解放军东部战区发文,才知道原因是大陆进行演练。此次演习距东部战区位台岛周边海空域演习还不到半个月,2022年12月25日,中国人民解放军东部战区位台岛周边海空域组织诸军兵种联合战备警巡和联合火力打击演练。这是针对当前美台升级勾连挑衅的坚决回应。此前的12月23日,美国总统拜登签署“2023财年国防授权法案”,其中一项内容是未来5年将对台提供总额100亿美元、每年最多20亿美元的“军事援助”。该法案还要求“加速处理台湾军购请求”,并建议邀请台湾参与2024年“环太平洋军演”。这些严重违反一个中国原则和中美三个联合公报规定的恶性条款,给台海和平稳定造成严重损害。 8 | 9 | (34)德媒:柏林正疯狂寻找向基辅承诺的40辆步兵战车: 10 | 11 | 来源:中国新闻网中新网1月9日电 据德国《明镜》周刊报道,德国正在“疯狂地”寻找给乌克兰承诺的40辆“黄鼠狼”步兵战车,柏林将不得不从自己的武装力量储备中取出所承诺战车的大部分。报道称,德国总理朔尔茨此前曾向基辅承诺了40辆“黄鼠狼”步兵战车,目前联邦政府正在疯狂地寻找承诺的步兵战车。“德国政府尚未准备好供应此类军备,这就是为什么德国国防军必须清空其仓库,但它储备状态其实已经很差了。”德国联邦议院议员亨宁·奥特说道。报道指出,当政府决定将“黄鼠狼”步兵战车交付给乌克兰,德国军方、政界人士和安全专家都开始怀疑柏林将从哪里获得承诺的设备。朔尔茨的话“没那么容易实现”。消息显示,德国国防企业莱茵金属(Rheinmetall)公司库存有近60辆有缺陷的“黄鼠狼”步兵战车,但将其升级会需要很长时间。据报道,德国总理朔尔茨与美国总统拜登5日通电话,就向基辅运送重型军事装备达成一致。随后德国宣布,拟向乌克兰供应40辆“黄鼠狼”步兵战车和1枚“爱国者”防空导弹。乌克兰局势升级以来,德国已向乌克兰提供价值22.5亿欧元的武器和军事装备。 12 | 13 | -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = wangyi.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = wangyi 12 | -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/wangyi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/wangyi/wangyi/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/wangyi/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # 
https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class WangyiItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | title = scrapy.Field() 13 | content = scrapy.Field() 14 | number = scrapy.Field() 15 | 16 | -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/wangyi/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | import random 6 | 7 | from scrapy import signals 8 | 9 | # useful for handling different item types with a single interface 10 | from itemadapter import is_item, ItemAdapter 11 | 12 | from .fake_useragent import USER_AGENTS 13 | from scrapy.http import HtmlResponse 14 | from time import sleep 15 | 16 | 17 | class WangyiDownloaderMiddleware: 18 | # Not all methods need to be defined. If a method is not defined, 19 | # scrapy acts as if the downloader middleware does not modify the 20 | # passed objects. 21 | 22 | def process_request(self, request, spider): 23 | # UA伪装 24 | request.headers['User-Agent'] = random.choice(USER_AGENTS) 25 | return None 26 | 27 | def process_response(self, request, response, spider): 28 | # 挑选出指定的响应对象进行篡改 29 | # 通过url指定request,通过request指定response 30 | # 获取动态加载出的动态数据,基于selenium 31 | bro = spider.bro 32 | 33 | if request.url in spider.models_url: 34 | # 五大板块对应的响应对象 35 | # 针对定位到的这些response进行篡改 36 | # 实例化一个新响应对象,包含动态加载的新闻数据,用新的换旧的 37 | bro.get(request.url) 38 | sleep(0.5) 39 | bro.execute_script('window.scrollTo(0,10000)') 40 | page_text = bro.page_source 41 | # self.fp = open('./news.html', 'w', encoding='utf-8') 42 | # self.fp.write(page_text) 43 | # self.fp.close() 44 | new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request) 45 | 46 | return new_response 47 | else: 48 | # 其他请求对应的响应对象 49 | return response 50 | 51 | def process_exception(self, request, exception, spider): 52 | pass 53 | -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/wangyi/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class WangyiPipeline(object): 12 | fp = None 13 | 14 | # 重写父类的一个方法:该方法只在开始爬虫的时候被调用一次 15 | def open_spider(self, spider): 16 | print('开始爬虫!') 17 | self.fp = open('./news.txt', 'w', encoding='utf-8') 18 | 19 | # 专门用来处理item类型对象 20 | # 该方法可以接受爬虫文件提交过来的item对象 21 | # 该方法每接收到一个item就会被调用一次 22 | def process_item(self, item, spider): 23 | title = item['title'] 24 | content = item['content'] 25 | number = item['number'] 26 | print('正在下载第', number, '个新闻。。。') 27 | # 持久化存储 28 | self.fp.write('(' + str(number) + ')' + title + ':' + '\n' + content + '\n') 29 | 30 | return item # 就会传递给下一个即将被执行的管道类 31 | 32 | # 重写父类 33 | def close_spider(self, spider): 34 | print('结束爬虫!') 35 | self.fp.close() 36 | -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/wangyi/settings.py: -------------------------------------------------------------------------------- 1 | # 
Scrapy settings for wangyi project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'wangyi' 11 | 12 | SPIDER_MODULES = ['wangyi.spiders'] 13 | NEWSPIDER_MODULE = 'wangyi.spiders' 14 | 15 | LOG_LEVEL = 'ERROR' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | #USER_AGENT = 'wangyi (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'wangyi.middlewares.WangyiSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 54 | DOWNLOADER_MIDDLEWARES = { 55 | 'wangyi.middlewares.WangyiDownloaderMiddleware': 543, 56 | } 57 | 58 | # Enable or disable extensions 59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | 'wangyi.pipelines.WangyiPipeline': 300, 68 | } 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | -------------------------------------------------------------------------------- 
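The WangyiDownloaderMiddleware shown earlier scrolls the page and then relies on a fixed sleep(0.5) before grabbing page_source. A more robust variant waits until the news list has actually rendered. The helper below is only a sketch; the div.ndi_main selector is an assumption taken from the xpath used in news.py.

# Sketch: explicit wait instead of the fixed sleep used in WangyiDownloaderMiddleware.
from scrapy.http import HtmlResponse
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def render_with_selenium(bro, request):
    bro.get(request.url)
    # Wait (up to 10 s) until the dynamically loaded news container exists.
    WebDriverWait(bro, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.ndi_main'))
    )
    bro.execute_script('window.scrollTo(0, 10000)')
    return HtmlResponse(url=request.url, body=bro.page_source,
                        encoding='utf-8', request=request)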
/基础篇/scrapy/wangyi/wangyi/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /基础篇/scrapy/wangyi/wangyi/spiders/news.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from selenium import webdriver 3 | from selenium.webdriver import ChromeOptions 4 | from ..items import WangyiItem 5 | 6 | 7 | class NewsSpider(scrapy.Spider): 8 | name = 'news' 9 | # allowed_domains = ['www.xxx.com'] 10 | start_urls = ['https://news.163.com/'] 11 | models_url = [] # 存放板块的详情页url 12 | number = 1 13 | 14 | # 实例化一个浏览器对象 15 | def __init__(self, **kwargs): 16 | # 实现让selenium规避被检测到的风险 17 | super().__init__(**kwargs) 18 | option = ChromeOptions() 19 | option.add_experimental_option('excludeSwitches', ['enable-automation']) 20 | option.add_experimental_option('useAutomationExtension', False) 21 | option.add_experimental_option('excludeSwitches', ['enable-logging']) 22 | option.add_argument("--no-sandbox") 23 | option.add_argument("--disable-dev-shm-usage") 24 | option.add_argument("--window-size=1920,1080") # 建议设置窗口大小 25 | option.add_argument('--headless') 26 | option.add_argument('--disable-gpu') 27 | self.bro = webdriver.Chrome(executable_path='D:\爬虫\selenium\chromedriver.exe', options=option) 28 | 29 | def closed(self, spider): 30 | self.bro.quit() 31 | 32 | # 解析每一个板块对应的详情页url 33 | # 每一个板块对应新闻相关的内容都是动态加载出来的 34 | def detail_parse(self, response): 35 | div_list = response.xpath('//div[@class="ndi_main"]/div[@class="data_row news_article clearfix news_first"] | //div[@class="ndi_main"]/div[@class="data_row news_article clearfix "]') 36 | # print(div_list) 37 | for div in div_list: 38 | item = WangyiItem() 39 | title = div.xpath('./div/div/h3/a/text()').extract_first() 40 | item['title'] = title 41 | item['number'] = self.number 42 | self.number += 1 43 | content_url = div.xpath('./div/div/h3/a/@href').extract_first() 44 | 45 | yield scrapy.Request(url=content_url, callback=self.content_parse, meta={'item': item}) 46 | 47 | # 解析新闻内容 48 | def content_parse(self, response): 49 | item = response.meta['item'] 50 | content = response.xpath('//*[@id="content"]/div[2]//text()').extract() 51 | content = ''.join(content) 52 | item['content'] = content 53 | # print(item) 54 | yield item 55 | 56 | # 解析五大板块的详情页url 57 | def parse(self, response): 58 | li_list = response.xpath('//*[@id="index2016_wrap"]/div[3]/div[2]/div[2]/div[2]/div/ul/li') 59 | alist = [1, 2, 4, 5] # 存储各个领域的li标签编号 60 | 61 | for index in alist: 62 | model_url = li_list[index].xpath('./a/@href').extract_first() 63 | # print(model_url) 64 | self.models_url.append(model_url) 65 | 66 | # 依次对每个板块进行发起请求 67 | for url in self.models_url: 68 | yield scrapy.Request(url=url, callback=self.detail_parse) 69 | -------------------------------------------------------------------------------- /基础篇/scrapy/xiaohua/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/xiaohua/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/xiaohua/scrapy.cfg: 
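Aside on news.py above: it stores the five board URLs in spider.models_url so the middleware can recognise them by string comparison. An alternative is to tag the requests themselves. The middleware below is a sketch of that approach; it assumes the spider yields its board requests with meta={'use_selenium': True} and, like the original, reuses the browser created on the spider as spider.bro.

from scrapy.http import HtmlResponse


class SeleniumTagMiddleware:
    # Sketch: render only requests explicitly tagged by the spider, instead of
    # matching request.url against spider.models_url.
    def process_response(self, request, response, spider):
        if not request.meta.get('use_selenium'):
            return response
        bro = spider.bro                 # browser instantiated in the spider's __init__
        bro.get(request.url)
        return HtmlResponse(url=request.url, body=bro.page_source,
                            encoding='utf-8', request=request)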
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = xiaohua.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = xiaohua 12 | -------------------------------------------------------------------------------- /基础篇/scrapy/xiaohua/xiaohua/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/xiaohua/xiaohua/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/xiaohua/xiaohua/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class XiaohuaItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | author = scrapy.Field() 13 | content = scrapy.Field() 14 | -------------------------------------------------------------------------------- /基础篇/scrapy/xiaohua/xiaohua/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | import pymysql 10 | 11 | 12 | class XiaohuaPipeline: 13 | fp = None 14 | 15 | # 重写父类 16 | def open_spider(self, spider): 17 | print('开始爬虫。。。') 18 | self.fp = open('./xiaohua.txt', 'w', encoding='utf-8') 19 | 20 | def process_item(self, item, spider): 21 | author = item['author'] 22 | content = item['content'] 23 | 24 | # 持久化存储 25 | self.fp.write(author + '-->' + '\n' + content + '\n') 26 | 27 | return item 28 | 29 | # 重写父类 30 | def close_spider(self, spider): 31 | print('结束爬虫!') 32 | self.fp.close() 33 | 34 | 35 | class mysqlPipeLine(object): 36 | # 数据库连接 37 | conn = None 38 | cursor = None 39 | 40 | def open_spider(self, spider): 41 | self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='xioahua', charset='utf8') 42 | 43 | def process_item(self, item, spider): 44 | self.cursor = self.conn.cursor() 45 | 46 | try: 47 | self.cursor.execute('insert into xiaohua.xiaohua values("%s", "%s")' % (item["author"], item["content"])) 48 | self.conn.commit() 49 | except Exception as e: 50 | print(e) 51 | self.conn.rollback() 52 | 53 | return item 54 | 55 | def close_spider(self, spider): 56 | self.cursor.close() 57 | self.conn.close() -------------------------------------------------------------------------------- /基础篇/scrapy/xiaohua/xiaohua/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for xiaohua project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'xiaohua' 11 | 12 | SPIDER_MODULES = ['xiaohua.spiders'] 13 | NEWSPIDER_MODULE = 'xiaohua.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | LOG_LEVEL = 'ERROR' 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'xiaohua.middlewares.XiaohuaSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'xiaohua.middlewares.XiaohuaDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'xiaohua.pipelines.XiaohuaPipeline': 300, 69 | 'xiaohua.pipelines.mysqlPipeLine': 301, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- 
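A note on the mysqlPipeLine shown above: it interpolates the scraped strings into the INSERT with %-formatting, which breaks as soon as a post contains a double quote. Handing the values to pymysql as query parameters avoids that. The class below is a sketch of the same flow with a parameterized query; the connection values are copied from the pipeline above.

import pymysql


class ParamMysqlPipeline:
    # Sketch: parameterized variant of mysqlPipeLine from pipelines.py above.
    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='', db='xioahua', charset='utf8')

    def process_item(self, item, spider):
        with self.conn.cursor() as cursor:
            try:
                # pymysql escapes the values itself, so quotes inside the joke
                # text no longer break the statement.
                cursor.execute('INSERT INTO xiaohua.xiaohua VALUES (%s, %s)',
                               (item['author'], item['content']))
                self.conn.commit()
            except Exception as e:
                print(e)
                self.conn.rollback()
        return item  # pass the item on so any later pipeline still receives it

    def close_spider(self, spider):
        self.conn.close()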
/基础篇/scrapy/xiaohua/xiaohua/spiders/Xiaohua.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from ..items import XiaohuaItem 3 | 4 | 5 | class XiaohuaSpider(scrapy.Spider): 6 | name = 'Xiaohua' 7 | # allowed_domains = ['www.xiaohua.com'] 8 | start_urls = ['https://www.xiaohua.com/duanzi/'] 9 | 10 | # 生成一个通用的url模板 11 | url = 'https://www.xiaohua.com/duanzi?page=%d' 12 | page_num = 2 13 | 14 | def parse(self, response): 15 | div_list = response.xpath('/html/body/div/div[8]/div[2]/div[2]/div[@class="one-cont"]') 16 | all_data = [] 17 | for div in div_list: 18 | author = div.xpath('./div/div/a/i/text()')[0].extract() 19 | content = div.xpath('./p/a//text()').extract() 20 | # 将列表转化为字符串 21 | content = ''.join(content) 22 | item = XiaohuaItem() 23 | item['author'] = author 24 | item['content'] = content 25 | # 将item提交给管道 26 | yield item 27 | 28 | if self.page_num <= 3: 29 | new_url = format(self.url % self.page_num) 30 | self.page_num += 1 31 | # 手动请求发送;callback回调函数是专门用作数据解析 32 | yield scrapy.Request(url=new_url, callback=self.parse) 33 | -------------------------------------------------------------------------------- /基础篇/scrapy/xiaohua/xiaohua/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/yiche/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = yiche.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = yiche 12 | -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | fp = open('1.txt', 'r', encoding='utf-8').read() 5 | res = json.loads(fp) 6 | k = 0 7 | car = [] # 存储所有车辆的所有信息 8 | name_list = [] # 存储参数的名称 9 | 10 | while k < len(res['data']): 11 | if k >= 3: # 控制获取信息到目录的第几级 12 | break 13 | else: 14 | item_list = res["data"][k]["items"] 15 | 16 | value_list = [] 17 | car_list = [] 18 | 19 | for item in item_list: 20 | # 车辆颜色需要专门写 21 | if item['id'] == -30 or item['id'] == -31: 22 | break 23 | else: 24 | name_list.append(item['name']) 25 | value_list.append(item['paramValues']) 26 | 27 | for value in value_list: 28 | i = 0 29 | while i < len(value): 30 | va = value[i]['value'] 31 | if va == '-': 32 | va = value[i]['subList'][0]['value'] 33 | car_list.append(va) 34 | i += 1 35 | car.append(car_list) 36 | car_list = [] 37 | k += 1 38 | 39 | # 规范汽车参数格式 40 | forN = len(car) # 参数的个数 41 | carN = len(car[1]) # 车辆的个数 42 | car = sum(car, []) # 整合汽车信息 43 | time = 0 # 循环次数 44 | name0 = [] 45 | a = [] 46 | b = [] 47 | 48 | 49 | while time < carN: 50 | x = time 51 | k = 0 52 | 
for i in range(forN): 53 | if k == 0: 54 | name0.append(car[x]) 55 | else: 56 | c = name_list[k] + ':' + car[x] 57 | a.append(c) 58 | x += carN 59 | k += 1 60 | time += 1 61 | b.append(a) 62 | a = [] 63 | s = [] 64 | 65 | for k in b: 66 | k = ' '.join(k) 67 | s.append(k) 68 | 69 | print(s, name0) 70 | -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/yiche/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/yiche/yiche/__init__.py -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/yiche/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class YicheItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | brand = scrapy.Field() 13 | car_name = scrapy.Field() 14 | car_num = scrapy.Field() 15 | car_detail = scrapy.Field() 16 | car_name1 = scrapy.Field() 17 | 18 | -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/yiche/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | # class jsDownloaderMiddleware(object): 12 | # def process_requset(self, request, spider): 13 | 14 | -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/yiche/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | import time 9 | 10 | from itemadapter import ItemAdapter 11 | 12 | 13 | class YichePipeline: 14 | def process_item(self, item, spider): 15 | print(item) 16 | return item 17 | 18 | 19 | import pymysql 20 | 21 | 22 | class mysqlPipeLine(object): 23 | # 数据库连接 24 | conn = None 25 | cursor = None 26 | 27 | def open_spider(self, spider): 28 | self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Spider', 29 | charset='utf8') 30 | print('开始插入数据') 31 | 32 | def process_item(self, item, spider): 33 | self.cursor = self.conn.cursor() 34 | 35 | try: 36 | i = 0 37 | if item['car_detail'] == '参数配置暂未公开' or item['car_detail'] == '暂无在售车辆': 38 | self.cursor.execute('insert into cars values("%s", "%s", "%s", "%s", "%s")' % ( 39 | item["brand"], item['car_name'], item["car_name"] + item['car_name1'], item["car_num"], item['car_detail'])) 40 | self.conn.commit() 41 | print(item['car_name']) 42 | else: 43 | for k in item['car_detail']: 44 | v = item['car_name1'][i] 45 | i += 1 46 | self.cursor.execute('insert into cars values("%s","%s", "%s", "%s", "%s")' % ( 47 | item["brand"], item['car_name'], item["car_name"] + ' ' + v, item["car_num"], 
k)) 48 | self.conn.commit() 49 | print(item['car_name']) 50 | except Exception as e: 51 | # print(item) 52 | print(e) 53 | self.conn.rollback() 54 | 55 | return item 56 | 57 | def close_spider(self, spider): 58 | print('结束插入数据') 59 | if self.cursor: 60 | self.cursor.close() 61 | if self.conn: 62 | self.conn.close() 63 | -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/yiche/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for yiche project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'yiche' 11 | 12 | SPIDER_MODULES = ['yiche.spiders'] 13 | NEWSPIDER_MODULE = 'yiche.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'yiche (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | LOG_LEVEL = 'ERROR' 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | #CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | #DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'yiche.middlewares.YicheSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 54 | # DOWNLOADER_MIDDLEWARES = { 55 | # 'yiche.middlewares.FakeUADownloaderMiddleware': 543, 56 | # } 57 | 58 | # Enable or disable extensions 59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | 'yiche.pipelines.mysqlPipeLine': 300, 68 | # 'yiche.pipelines.YichePipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing 
throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/yiche/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /基础篇/scrapy/yiche/yiche/test.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def detail(fp): 5 | res = json.loads(fp) 6 | k = 0 7 | car = [] # 存储所有车辆的所有信息 8 | name_list = [] # 存储参数的名称 9 | 10 | while k < len(res['data']): 11 | if k >= 3: # 控制获取信息到目录的第几级 12 | break 13 | else: 14 | item_list = res["data"][k]["items"] 15 | 16 | value_list = [] 17 | car_list = [] 18 | 19 | for item in item_list: 20 | # 车辆颜色需要专门写 21 | if item['id'] == -30 or item['id'] == -31: 22 | break 23 | else: 24 | name_list.append(item['name']) 25 | value_list.append(item['paramValues']) 26 | 27 | for value in value_list: 28 | i = 0 29 | while i < len(value): 30 | va = value[i]['value'] 31 | if va == '-': 32 | va = value[i]['subList'][0]['value'] 33 | car_list.append(va) 34 | i += 1 35 | car.append(car_list) 36 | car_list = [] 37 | k += 1 38 | 39 | # 规范汽车参数格式 40 | forN = len(car) # 参数的个数 41 | carN = len(car[1]) # 车辆的个数 42 | car = sum(car, []) # 整合汽车信息 43 | time = 0 # 循环次数 44 | a = [] 45 | b = [] 46 | name0 = [] 47 | 48 | while time < carN: 49 | x = time 50 | k = 0 51 | for i in range(forN): 52 | if k == 0: 53 | name0.append(car[x]) 54 | else: 55 | c = name_list[k] + ':' + car[x] 56 | a.append(c) 57 | x += carN 58 | k += 1 59 | time += 1 60 | b.append(a) 61 | a = [] 62 | 63 | s = [] 64 | 65 | for k in b: 66 | k = ' '.join(k) 67 | s.append(k) 68 | sk = { 69 | 'detail': s, 70 | 'name': name0 71 | } 72 | return sk 73 | 74 | -------------------------------------------------------------------------------- /基础篇/高性能异步爬虫/flask_server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | import time 3 | 4 | app = Flask(__name__) 5 | 6 | 7 | @app.route('/dxs') 8 | def index_dxs(): 9 | time.sleep(2) 10 | return 'Hello dxs!' 11 | 12 | 13 | @app.route('/dxy') 14 | def index_dxy(): 15 | time.sleep(2) 16 | return 'Hello dxy!' 17 | 18 | 19 | @app.route('/date') 20 | def index_date(): 21 | time.sleep(2) 22 | return 'dxs date dxy!' 
23 | 24 | 25 | if __name__ == '__main__': 26 | app.run(threaded=True) 27 | -------------------------------------------------------------------------------- /基础篇/高性能异步爬虫/meinv.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aiofile 3 | import requests 4 | from lxml import html 5 | import os 6 | import aiohttp 7 | 8 | etree = html.etree 9 | cookies = { 10 | 'Hm_lvt_c8263f264e5db13b29b03baeb1840f60': '1676030483', 11 | 'Hm_lpvt_c8263f264e5db13b29b03baeb1840f60': '1676030939', 12 | } 13 | headers = { 14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 15 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 16 | 'Cache-Control': 'max-age=0', 17 | 'Connection': 'keep-alive', 18 | # 'Cookie': 'Hm_lvt_c8263f264e5db13b29b03baeb1840f60=1676030483; Hm_lpvt_c8263f264e5db13b29b03baeb1840f60=1676030939', 19 | 'Referer': 'https://www.3gbizhi.com/tag/meinv/2.html', 20 | 'Sec-Fetch-Dest': 'document', 21 | 'Sec-Fetch-Mode': 'navigate', 22 | 'Sec-Fetch-Site': 'same-origin', 23 | 'Sec-Fetch-User': '?1', 24 | 'Upgrade-Insecure-Requests': '1', 25 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', 26 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"', 27 | 'sec-ch-ua-mobile': '?0', 28 | 'sec-ch-ua-platform': '"Windows"', 29 | } 30 | 31 | 32 | def getUrl(page): 33 | all = [] 34 | response = requests.get(f'https://desk.3gbizhi.com/deskMV/index_{page}.html', cookies=cookies, headers=headers) 35 | tree = etree.HTML(response.text) 36 | li_list = tree.xpath('/html/body/div[5]/ul/li') 37 | for li in li_list: 38 | photo = { 39 | '标题': li.xpath('./a/img/@title')[0], 40 | 'url': li.xpath('./a/@href')[0] 41 | } 42 | all.append(photo) 43 | return all 44 | 45 | 46 | def getpic(data): 47 | response = requests.get(data['url'], headers, cookies=cookies).text 48 | tree = etree.HTML(response) 49 | url = tree.xpath('//*[@id="showimg"]/a[4]/img/@src')[0] 50 | return url 51 | 52 | 53 | async def thread(url, name): 54 | async with aiohttp.ClientSession() as session: 55 | async with session.get(url, ssl=False, headers=headers, cookies=cookies) as resp: 56 | datas = await resp.read() 57 | async with aiofile.async_open(f'./picLibs/{name}.jpg', 'wb') as fp: 58 | await fp.write(datas) 59 | print(name + '爬取成功!') 60 | 61 | 62 | if __name__ == '__main__': 63 | if not os.path.exists('./picLibs'): 64 | os.mkdir('./picLibs') 65 | loop = asyncio.get_event_loop() 66 | for page in range(1, 24): 67 | print(page) 68 | all = getUrl(page) 69 | URL = [] 70 | for data in all: 71 | url = getpic(data) 72 | name = data['标题'] 73 | URL.append(thread(url, name)) 74 | loop.run_until_complete(asyncio.wait(URL)) 75 | loop.close() 76 | 77 | -------------------------------------------------------------------------------- /基础篇/高性能异步爬虫/minxing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import requests 3 | from lxml import html 4 | 5 | headers = { 6 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', 7 | 'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 8 | 'cookie': '__yjs_duid=1_f064d94f3576b1069275a2e233974a2c1676030524524; PHPSESSID=1asobv9sgpl0sb0ian1dm9jcc7; sYQDUGqqzHsearch_history=%u7F8E%u5973', 9 | 'referer': 
'https://www.syt5.com/mingxing/mnmx', 10 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"', 11 | 'sec-ch-ua-mobile': '?0', 12 | 'sec-ch-ua-platform': '"Windows"', 13 | 'sec-fetch-dest': 'document', 14 | 'sec-fetch-mode': 'navigate', 15 | 'sec-fetch-site': 'same-origin', 16 | 'sec-fetch-user': '?1', 17 | 'upgrade-insecure-requests': '1', 18 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', 19 | } 20 | etree = html.etree 21 | url = 'https://www.syt5.com/mingxing/mnmx/index_%d.html' 22 | 23 | 24 | def rebuilt_Language(url, headers): 25 | response = requests.get(url=url, headers=headers) 26 | response.encoding = response.apparent_encoding 27 | return response 28 | 29 | 30 | def getDetailInfo(url): 31 | all = [] 32 | for page in range(2, 20): 33 | new_url = format(url % page) 34 | resp = rebuilt_Language(new_url, headers) 35 | tree = etree.HTML(resp.text) 36 | div_list = tree.xpath('//*[@id="body"]/main/div[4]/div/div') 37 | for div in div_list: 38 | info = { 39 | '标题': div.xpath('./div[1]/a/@title')[0], 40 | '链接': div.xpath('./div[1]/a/@href')[0] 41 | } 42 | all.append(info) 43 | return all 44 | 45 | 46 | def getPhotoUrl(data): 47 | resp = rebuilt_Language(data['链接'], headers) 48 | tree = etree.HTML(resp.text) 49 | li_list = tree.xpath('//*[@id="showimages"]/div[3]/div[2]/div[2]/ul/li') 50 | url = [] 51 | for li in li_list: 52 | s = li.xpath('./a/@href')[0] 53 | url.append(s) 54 | if not url: 55 | li_list = tree.xpath('//*[@id="showimages"]/div[3]/div[3]/div[2]/ul/li') 56 | for li in li_list: 57 | s = li.xpath('./a/@href')[0] 58 | url.append(s) 59 | info = { 60 | '标题': data['标题'], 61 | 'urls': url 62 | } 63 | return info 64 | 65 | 66 | def download(Name, url): 67 | resp = rebuilt_Language(url, headers) 68 | tree = etree.HTML(resp.text) 69 | src = tree.xpath('//*[@id="showpicsouutuIs2020"]/@src')[0] 70 | name = src.split('/')[-1] 71 | data = requests.get(src, headers).content 72 | with open(f'./{Name}/{name}', 'wb')as fp: 73 | fp.write(data) 74 | print('over!') 75 | 76 | 77 | if __name__ == '__main__': 78 | total = getDetailInfo(url) 79 | for data in total: 80 | Info = getPhotoUrl(data) 81 | # print('正在采集'+ Info["标题"]) 82 | # if not os.path.exists(f'./Piclib/{Info["标题"]}'): 83 | # os.mkdir(f'./Piclib/{Info["标题"]}') 84 | # for i in range(len(Info['urls'])): 85 | # download(Info['标题'],Info['urls'][i]) 86 | print(Info) 87 | -------------------------------------------------------------------------------- /基础篇/高性能异步爬虫/协程.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | async def request(url): 5 | print('正在请求', url) 6 | print(url, '请求成功!') 7 | return url 8 | 9 | # async修饰的函数,调用成功后返回的一个协程对象 10 | c = request('www.baidu.com') 11 | 12 | # # 创建一个协程对象 13 | # loop = asyncio.get_event_loop() 14 | # # 将协程对象注册到loop之中,然后启动loop 15 | # loop.run_until_complete(c) 16 | 17 | # # task的使用 18 | # loop = asyncio.get_event_loop() 19 | # # 基于loop创建了一个task对象 20 | # task = loop.create_task(c) 21 | # print(task) 22 | # loop.run_until_complete(task) 23 | # print(task) 24 | 25 | # # future的使用 26 | # loop = asyncio.get_event_loop() 27 | # task = asyncio.ensure_future(c) 28 | # print(task) 29 | # loop.run_until_complete(task) 30 | # print(task) 31 | 32 | 33 | def callback_func(task): 34 | print(task.result()) 35 | 36 | 37 | # 绑定回调 38 | loop = asyncio.get_event_loop() 39 | task = asyncio.ensure_future(c) 40 | # 将回调函数绑定到任务对象中 41 | 
task.add_done_callback(callback_func) 42 | loop.run_until_complete(task) 43 | 44 | -------------------------------------------------------------------------------- /基础篇/高性能异步爬虫/多任务协程01.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | 5 | async def request(url): 6 | print('正在下载', url) 7 | # 在异步协程中,如果出现了同步模块相关的代码,就无法实现异步 8 | # time.sleep(2) 9 | # 当在asyncio中遇到阻塞操作必须进行手动挂起 10 | await asyncio.sleep(2) 11 | print('下载完成', url) 12 | 13 | 14 | urls = [ 15 | 'www.baidu.com', 16 | 'www.douban.com', 17 | 'www.shu.edu.cn' 18 | ] 19 | # 任务列表:存放多个任务对象 20 | tasks = [] 21 | for url in urls: 22 | c = request(url) 23 | task = asyncio.ensure_future(c) 24 | tasks.append(task) 25 | 26 | start = time.time() 27 | 28 | loop = asyncio.get_event_loop() 29 | # 需要将任务列表封装到wait中 30 | loop.run_until_complete(asyncio.wait(tasks)) 31 | 32 | end = time.time() 33 | print(end - start) 34 | -------------------------------------------------------------------------------- /基础篇/高性能异步爬虫/多任务异步协程02.py: -------------------------------------------------------------------------------- 1 | # 使用aiohttp中的ClientSession 2 | import requests 3 | import asyncio 4 | import time 5 | import aiohttp 6 | 7 | urls = [ 8 | 'http://127.0.0.1:5000/dxs', 9 | 'http://127.0.0.1:5000/dxy', 10 | 'http://127.0.0.1:5000/date' 11 | ] 12 | 13 | 14 | async def get_page(url): 15 | print('正在下载', url) 16 | # requests发起的请求时基于同步的,必须使用基于异步的网络请求模块进行指定url的请求发送 17 | # aiohttp:基于异步的网络请求 18 | # response = requests.get(url=url).text 19 | async with aiohttp.ClientSession() as session: 20 | # headers,params/data,proxy='http://ip:port' 21 | async with await session.get(url=url) as response: 22 | # text()返回字符串型的响应对象 23 | # read()返回的二进制响应对象 24 | # json()返回的json对象 25 | # 注意在获取响应数据操作之前一定要使用await进行手动挂起 26 | page_text = await response.text() 27 | print('下载完成', url) 28 | return page_text 29 | 30 | 31 | def callback(task): 32 | print(task.result()) 33 | 34 | 35 | tasks = [] 36 | 37 | for url in urls: 38 | c = get_page(url) 39 | task = asyncio.ensure_future(c) 40 | task.add_done_callback(callback) 41 | tasks.append(task) 42 | 43 | start = time.time() 44 | loop = asyncio.get_event_loop() 45 | loop.run_until_complete(asyncio.wait(tasks)) 46 | end = time.time() 47 | 48 | print('总耗时', end - start) 49 | -------------------------------------------------------------------------------- /基础篇/高性能异步爬虫/线程池的基本使用.py: -------------------------------------------------------------------------------- 1 | import time 2 | from multiprocessing.dummy import Pool # 导入线程池模块对应的类 3 | 4 | # # 使用单线程串行方式执行 5 | # def get_page(str): 6 | # print('正在下载: ', str) 7 | # time.sleep(2) # 模拟阻塞操作 8 | # print('下载成功: ', str) 9 | # 10 | # 11 | # name_list = ['aa', 'bb', 'cc', 'dd'] 12 | # start_time = time.time() 13 | # for i in range(len(name_list)): 14 | # get_page(name_list[i]) 15 | # end_time = time.time() 16 | # print('%d second' % (end_time - start_time)) 17 | 18 | 19 | # 使用线程池的方式执行 20 | start_time = time.time() 21 | 22 | 23 | def get_page(str): 24 | print('正在下载:', str) 25 | time.sleep(2) # 模拟阻塞操作 26 | print('下载成功:', str) 27 | 28 | 29 | name_list = ['aa', 'bb', 'cc', 'dd'] 30 | # 实例化一个线程池对象 31 | pool = Pool(4) 32 | # 将列表中的每一个元素传递给get_page处理,返回值就是get_page的返回值 33 | pool.map(get_page, name_list) 34 | end_time = time.time() 35 | 36 | pool.close() 37 | pool.join() 38 | print(end_time - start_time, 'second') 39 | -------------------------------------------------------------------------------- /基础篇/高性能异步爬虫/线程池的应用.py: 
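The 线程池的应用.py file that follows stops right after collecting each video's title and detail-page link and never actually hands the work to the pool, so the "application" step is missing. A minimal sketch of that missing step is below; `fetch_detail` is a hypothetical helper (resolving the real video address would need an extra request that the original script does not show), and the headers dict stands in for the one the script defines.

```python
from multiprocessing.dummy import Pool

import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # same idea as the headers dict in the script below


def fetch_detail(item: dict) -> None:
    # hypothetical helper: just fetches the detail page; extracting the real video URL
    # would require an extra request that the original script does not include
    resp = requests.get(item['url'], headers=headers)
    print(item['name'], resp.status_code)


# inside the for-loop the script would append {'name': video_name, 'url': video_src} here
items: list[dict] = []

# blocking, I/O-bound downloads are exactly what the thread pool (multiprocessing.dummy) is for
pool = Pool(4)
pool.map(fetch_detail, items)
pool.close()
pool.join()
```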
-------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | from multiprocessing.dummy import Pool 4 | import time 5 | import os 6 | 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36' 9 | } 10 | # 原则: 线程池处理的是阻塞且耗时的操作 11 | url = 'https://www.pearvideo.com/category_1' 12 | 13 | page_text = requests.get(url=url, headers=headers).text 14 | tree = etree.HTML(page_text) 15 | video_src_list = tree.xpath('//*[@id="listvideoListUl"]/li') 16 | 17 | for li in video_src_list: 18 | video_src = 'https://www.pearvideo.com/' + li.xpath('./div[1]/a/@href')[0] 19 | # print(video_src) 20 | video_name = li.xpath('./div[1]/a/div[2]/text()')[0] 21 | # print(video_name) 22 | 23 | -------------------------------------------------------------------------------- /自动化篇/playwright/使用本地浏览器创建debug模式/README.md: -------------------------------------------------------------------------------- 1 | ## 说明 2 | 3 | 此方法是playwright与本地浏览器以ws方式通信 4 | 5 | 可以绕过基本上大部分浏览器检测,因为这就是一个真正的浏览器 6 | 7 | 两种使用方式: 8 | 9 | 1. 每次运行程序之后先打开浏览器 10 | 11 | > 1. 找到自己桌面chrome的快捷方式键 12 | > 2. 点击属性 13 | > 3. 在目标一栏的最后添加 --remote-debugging-port=9999 端口可自定义 14 | > 4. ``` 15 | > with sync_playwright() as p: 16 | > # 创建一个连接 17 | > browser = p.chromium.connect_over_cdp("http://localhost:9999") 18 | > content = browser.contexts[0] 19 | > page = content.new_page() 20 | > ``` 21 | > 5. 在上述page下进行浏览器操作即可 22 | 23 | 2. 不打开浏览器,自行打开 24 | > 在程序中添加下面的代码即可 25 | >``` 26 | >import subprocess 27 | ># 这个路径可以是Google浏览器的exe路径,也可以是快捷方式的路径 28 | >chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"' 29 | >debugging_port = "--remote-debugging-port=9999" 30 | > 31 | >command = f"{chrome_path} {debugging_port}" 32 | >subprocess.Popen(command, shell=True) 33 | >``` 34 | >之后就是 35 | > ``` 36 | > with sync_playwright() as p: 37 | > # 创建一个连接 38 | > browser = p.chromium.connect_over_cdp("http://localhost:9999") 39 | > content = browser.contexts[0] 40 | > page = content.new_page() 41 | > ``` 42 | > 在上述page下进行浏览器操作即可 43 | > 44 | > __注意__: 45 | > 此方法不可以在打开了普通版(非第一种情况)的浏览器使用 -------------------------------------------------------------------------------- /自动化篇/playwright/使用本地浏览器创建debug模式/auto.py: -------------------------------------------------------------------------------- 1 | from playwright.sync_api import sync_playwright 2 | 3 | import subprocess 4 | 5 | # 这个路径可以是Google浏览器的exe路径,也可以是快捷方式的路径 6 | chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"' 7 | debugging_port = "--remote-debugging-port=9999" 8 | 9 | command = f"{chrome_path} {debugging_port}" 10 | subprocess.Popen(command, shell=True) 11 | 12 | 13 | # 拦截请求 14 | def intercept_xhr(route, request): 15 | route.continue_() 16 | response = route.fetch() 17 | json = response.json() 18 | print(json) 19 | 20 | 21 | with sync_playwright() as p: 22 | # 创建一个连接 23 | browser = p.chromium.connect_over_cdp("http://localhost:9999") 24 | content = browser.contexts[0] 25 | page = content.new_page() 26 | 27 | # 设置拦截规则 28 | page.route("**/api/sns/web/v1/homefeed", lambda route, request: intercept_xhr(route, request)) 29 | page.goto('https://www.xiaohongshu.com/') 30 | page.wait_for_selector('.feeds-container') 31 | 32 | # 获取页面内容高度 33 | page_height = page.evaluate('() => document.body.scrollHeight') 34 | 35 | # 模拟鼠标滚动操作,向下滚动到底部 36 | while page.evaluate('() => window.scrollY + window.innerHeight') < page_height: 37 | page.mouse.wheel(0, 100) # 
这里的参数可以根据需要进行调整 38 | 39 | page.wait_for_timeout(5000) 40 | -------------------------------------------------------------------------------- /自动化篇/playwright/反检测浏览器/README.md: -------------------------------------------------------------------------------- 1 | ## 如何获取js文件? 2 | 3 | ```bash 4 | npx extract-stealth-evasions 5 | ``` 6 | 需要本机有node环境才可下载,之后在项目下就会出现这个文件了 7 | 8 | 此方法可以绕过90%浏览器检测 -------------------------------------------------------------------------------- /自动化篇/playwright/反检测浏览器/demo.py: -------------------------------------------------------------------------------- 1 | from playwright.sync_api import sync_playwright 2 | 3 | # stealth.min.js文件的存放路径 4 | STEALTH_PATH = 'stealth.min.js' 5 | 6 | with sync_playwright() as p: 7 | # 创建一个正常的浏览器窗口 8 | browser = p.chromium.launch( 9 | headless=False, 10 | chromium_sandbox=False, 11 | ignore_default_args=["--enable-automation"], 12 | channel="chrome", 13 | ) 14 | ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36' 15 | content = browser.new_context(user_agent=ua) 16 | # 添加初始化脚本 17 | content.add_init_script(path=STEALTH_PATH) 18 | # 创建页面 19 | page = content.new_page() 20 | page.goto('https://bot.sannysoft.com/') 21 | # 查看效果,和浏览器一致 22 | page.wait_for_timeout(5000) 23 | # 关闭所有 24 | page.close() 25 | content.close() 26 | browser.close() 27 | -------------------------------------------------------------------------------- /自动化篇/playwright/起点vip/10086.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/自动化篇/playwright/起点vip/10086.png -------------------------------------------------------------------------------- /自动化篇/playwright/起点vip/README.md: -------------------------------------------------------------------------------- 1 | ### 说明 2 | 3 | demo文件是我自己手写的截取固定元素下的所有内容 4 | 5 | demo2是使用playwright自带的api截取(可以说简直是方便了不少) 6 | 7 | 由于第一次写的时候不知道playwright自带了滚动截图所以手写了一段滚动 8 | 9 | > 注: 由于qidian的导航在滚动的时候会附着在顶部,所以我们在开始的时候用js把这个dom元素给删除掉就可以了 10 | > 11 | > 如果不删除会出现遮挡小说内容的情况 12 | > 13 | 可以说playwright自带的各种api太强大了 -------------------------------------------------------------------------------- /自动化篇/playwright/起点vip/demo.py: -------------------------------------------------------------------------------- 1 | import playwright.sync_api 2 | from PIL import Image 3 | 4 | 5 | def run(syncPlayWright: playwright.sync_api.Playwright, url: str, savePath: str, cookies: list[dict]): 6 | browser = syncPlayWright.chromium.launch( 7 | headless=False, 8 | chromium_sandbox=False, 9 | channel="chrome", 10 | ) 11 | ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36' 12 | content = browser.new_context(user_agent=ua) 13 | content.add_init_script(path=r'D://crawlProjects/stealth.min.js') 14 | content.add_cookies(cookies) 15 | page = content.new_page() 16 | 17 | page.goto(url) 18 | # 获取 main 标签的高度 19 | rectangle = page.wait_for_selector('main') 20 | box = rectangle.bounding_box() 21 | main_height = box['height'] + box['y'] 22 | main_left = box['x'] 23 | main_offset = box['y'] 24 | main_width = box['width'] 25 | # 初始化截图列表 26 | screenshots = [] 27 | # 逐步滚动并截取屏幕截图 28 | scroll_offset = main_offset 29 | prev = 0 30 | scroll_height = 500 31 | while True: 32 | # 滚动屏幕 33 | page.evaluate(f'window.scrollTo({prev}, {scroll_offset})') 34 | # 截个图 35 | page.wait_for_timeout(100) 36 | screenshots.append(page.screenshot( 37 | 
clip={"x": main_left, "y": 0, "width": main_width, "height": scroll_height} 38 | )) 39 | # 记录上一次的终点 40 | prev = scroll_offset 41 | # 判断边界 42 | if prev < main_height <= prev + scroll_height: 43 | page.evaluate(f'window.scrollTo(0, {prev})') 44 | page.wait_for_timeout(100) 45 | screenshots.append(page.screenshot( 46 | clip={"x": main_left, "y": 0, "width": main_width, "height": main_height - prev} 47 | )) 48 | break 49 | scroll_offset += scroll_height 50 | 51 | # 将截图拼接在一起 52 | full_screenshot = Image.new('RGB', (round(main_width), round(box['height']))) 53 | y_offset = 0 54 | for index, screenshot in enumerate(screenshots): 55 | with open(savePath, 'wb') as f: 56 | f.write(screenshot) 57 | img = Image.open(savePath) 58 | full_screenshot.paste(img, (0, y_offset)) 59 | y_offset += img.height 60 | # 保存完整截图 61 | full_screenshot.save(savePath) 62 | page.close() 63 | 64 | 65 | if __name__ == '__main__': 66 | cookies = [] 67 | cookie_string = '_csrfToken=;fu=;_yep_uuid=;ywguid=;ywkey=;ywopenid=' 68 | cookie_items = cookie_string.split(';') 69 | for item in cookie_items: 70 | name, value = item.split('=') 71 | cookies.append({'name': name, 'value': value, 'domain': '.qidian.com', 'path': '/'}) 72 | with playwright.sync_api.sync_playwright() as p: 73 | run(p, 'https://www.qidian.com/chapter/1036094942/764016875/', '10086.png', cookies) 74 | -------------------------------------------------------------------------------- /自动化篇/playwright/起点vip/demo2.py: -------------------------------------------------------------------------------- 1 | import playwright.sync_api 2 | 3 | 4 | def run(syncPlayWright: playwright.sync_api.Playwright, url: str, savePath: str, cookies: list[dict]): 5 | run_js = 'document.getElementById("navbar").remove();' 6 | browser = syncPlayWright.chromium.launch( 7 | headless=False, 8 | chromium_sandbox=False, 9 | channel="chrome", 10 | ) 11 | ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36' 12 | content = browser.new_context(user_agent=ua) 13 | content.add_init_script(path=r'D://crawlProjects/stealth.min.js') 14 | content.add_cookies(cookies) 15 | page = content.new_page() 16 | page.goto(url) 17 | page.evaluate(run_js) 18 | page.locator(".print").screenshot(path="screenshot.png", animations='disabled') 19 | page.close() 20 | 21 | 22 | if __name__ == '__main__': 23 | cookies = [] 24 | cookie_string = '' 25 | cookie_items = cookie_string.split(';') 26 | for item in cookie_items: 27 | name, value = item.split('=') 28 | cookies.append({'name': name, 'value': value, 'domain': '.qidian.com', 'path': '/'}) 29 | with playwright.sync_api.sync_playwright() as p: 30 | run(p, 'https://www.qidian.com/chapter/1035571469/733045990/', '10086.png', cookies) 31 | -------------------------------------------------------------------------------- /自动化篇/playwright/邮政编码/hello.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from playwright.sync_api import sync_playwright, Error 3 | 4 | 5 | def getCode(addr): 6 | # 用同步的方式打开一个浏览器 7 | with sync_playwright() as p: 8 | try: 9 | # 设置浏览器配置 10 | browser = p.chromium.launch(headless=True) 11 | # 打开一个新窗口 12 | page = browser.new_page() 13 | # 去往这个链接 14 | page.goto('https://www.youbianku.com/%E9%A6%96%E9%A1%B5') 15 | # 等待页面加载完毕 16 | page.wait_for_load_state('load') 17 | # 通过id选中input框 18 | search_input = page.query_selector("#mySearchInput") 19 | # 往input框输入数据 20 | search_input.type(addr) 21 | # 通过id选择按钮 22 | search_button = 
page.query_selector("#mySearchButton") 23 | # 按钮点击 24 | search_button.click() 25 | # 表格选择器 26 | table_selector = ".zipcode-datas" 27 | # 等待表格渲染完毕 28 | page.wait_for_selector(table_selector, timeout=5000) 29 | # 通过class选择表格 30 | table = page.query_selector(table_selector) 31 | # 根据表格的类选择不同的行 32 | if "top-space" in table.get_attribute("class"): 33 | postal_code_selector = "tr:nth-child(5) td a" 34 | else: 35 | postal_code_selector = "tr:nth-child(3) td a" 36 | # 获取邮政编码所在的行 37 | postal_code_element = table.query_selector(postal_code_selector) 38 | # 获取邮政编码 39 | return postal_code_element.inner_text() 40 | except Error as e: 41 | # 捕获异常,出现地址错误可能表格无法加载 42 | print(e) 43 | return '000000' 44 | finally: 45 | # 关闭浏览器 46 | browser.close() 47 | 48 | 49 | if __name__ == '__main__': 50 | # 定义一个地址存储器,每次先从里面查找,找不到再去请求 51 | storgeCode = { 52 | } 53 | # 打开文佳佳 54 | fileA = pd.read_excel('./file.xlsx') 55 | # 修改格式 56 | new_header = fileA.iloc[0] 57 | fileA.columns = new_header 58 | # 遍历文件 59 | for index, row in fileA.iterrows(): 60 | address = row['通讯地址'] 61 | # 判断地址是否为空 62 | if pd.notna(address) and address != '通讯地址': 63 | if storgeCode.get(address, None) is None: 64 | # 找不到,就去请求 65 | code = getCode(address) 66 | storgeCode[address] = code 67 | postal_code = storgeCode[address] 68 | fileA.at[index, '邮政编码'] = postal_code 69 | else: 70 | continue 71 | # 每次修改后打印一下 72 | print(fileA.iloc[index]['姓名'], fileA.iloc[index]['通讯地址'], fileA.iloc[index]['邮政编码']) 73 | # 保存修改 74 | fileA.to_excel('updated_file.xlsx', index=False) 75 | -------------------------------------------------------------------------------- /自动化篇/selenium/12306模拟登录.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from time import sleep 4 | from selenium.webdriver import ActionChains 5 | from selenium.webdriver import ChromeOptions 6 | 7 | # 实现让selenium规避被检测到的风险 8 | option = ChromeOptions() 9 | option.add_experimental_option('excludeSwitches', ['enable-logging']) 10 | option.add_experimental_option("excludeSwitches", ['enable-automation']) 11 | option.add_argument("--no-sandbox") 12 | option.add_argument("--disable-dev-shm-usage") 13 | option.add_argument("--window-size=1920,1080") # 建议设置窗口大小 14 | # option.add_argument('--headless') 15 | option.add_argument('--disable-gpu') 16 | 17 | bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option) 18 | 19 | 20 | 21 | 22 | # 去除特征识别 防止服务器识别到的selenium的特征从而阻止后续的滑动验证 23 | bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", { 24 | "source": """ 25 | Object.defineProperty(navigator, 'webdriver', { 26 | get: () => undefined 27 | }) 28 | """ 29 | }) 30 | 31 | bro.get('https://kyfw.12306.cn/otn/resources/login.html') 32 | bro.maximize_window() 33 | 34 | # 标签定位 35 | user = bro.find_element_by_id('J-userName') 36 | pwd = bro.find_element_by_id('J-password') 37 | 38 | # 传入数据 39 | user.send_keys('') 40 | sleep(1) 41 | pwd.send_keys('') 42 | sleep(1) 43 | 44 | # 登录 45 | login = bro.find_element_by_id('J-login') 46 | login.click() 47 | sleep(2) 48 | 49 | slide = bro.find_element('id', 'nc_1_n1z') 50 | 51 | # 验证码 52 | action = ActionChains(bro) 53 | action.click_and_hold(slide) 54 | action.move_by_offset(300, 0).perform() 55 | sleep(2) 56 | # 点击确定 57 | ok = bro.find_element_by_class_name('btn') 58 | ok.click() 59 | sleep(2) 60 | 61 | ticket = bro.find_element_by_id('link_for_ticket') 62 | ticket.click() 63 | sleep(2) 64 | 65 | # 输入查询车站 66 | From = 
bro.find_element_by_id('fromStationText') 67 | From.click() 68 | From.send_keys('泸州') 69 | From.send_keys(Keys.ENTER) 70 | sleep(0.5) 71 | 72 | To = bro.find_element_by_id('toStationText') 73 | To.click() 74 | To.send_keys('乐山') 75 | To.send_keys(Keys.ENTER) 76 | sleep(0.5) 77 | 78 | # 找到出发站、到达站的隐藏HTML标签 79 | js = "document.getElementById('train_date').removeAttribute('readonly')" # 去除日期栏只读属性 80 | bro.execute_script(js) 81 | 82 | # 选择日期 83 | data = bro.find_element_by_id('train_date') 84 | data.clear() 85 | data.send_keys('2022-12-31') 86 | data.send_keys(Keys.ENTER) 87 | sleep(0.5) 88 | 89 | # 查询 90 | find = bro.find_element_by_id('query_ticket') 91 | find.click() 92 | sleep(2) 93 | 94 | # 关闭浏览器 95 | # sleep(5) 96 | # bro.quit() 97 | -------------------------------------------------------------------------------- /自动化篇/selenium/Twisted-20.3.0-cp39-cp39-win_amd64.whl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/自动化篇/selenium/Twisted-20.3.0-cp39-cp39-win_amd64.whl -------------------------------------------------------------------------------- /自动化篇/selenium/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/自动化篇/selenium/chromedriver.exe -------------------------------------------------------------------------------- /自动化篇/selenium/selenium其他自动化操作.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from selenium import webdriver 4 | from time import sleep 5 | 6 | bro = webdriver.Chrome(executable_path='chromedriver.exe') 7 | 8 | if __name__ == '__main__': 9 | bro = webdriver.Chrome() 10 | bro.get("https://useragentstring.com/pages/useragentstring.php?name=Chrome") 11 | 12 | bro.quit() 13 | 14 | # 15 | # # 标签定位 16 | # search_input = bro.find_element_by_id('q') 17 | # # 标签交互 18 | # search_input.send_keys('iphone') 19 | # sleep(2) 20 | # # 执行一组js程序 21 | # 22 | # bro.execute_script('window.scrollTo(0,document.body.scrollHeight)') 23 | # sleep(2) 24 | # 25 | # # 点击搜索按钮 26 | # search = bro.find_element_by_class_name('btn-search') 27 | # search.click() 28 | # 29 | # # 切换页面 30 | # bro.get('https://www.baidu.com') 31 | # sleep(2) 32 | # # 回退 33 | # bro.back() 34 | # sleep(2) 35 | # # 前进 36 | # bro.forward() 37 | # 38 | # # 关闭浏览器 39 | # sleep(5) 40 | # bro.quit() 41 | -------------------------------------------------------------------------------- /自动化篇/selenium/selenium模拟登录.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from time import sleep 3 | from selenium.webdriver import ActionChains 4 | 5 | bro = webdriver.Chrome(executable_path='chromedriver.exe') 6 | 7 | bro.get('https://qzone.qq.com') 8 | 9 | # 切换作用域 10 | bro.switch_to.frame('login_frame') 11 | # 标签定位与点击 12 | pwdLogin = bro.find_element_by_id('switcher_plogin') 13 | pwdLogin.click() 14 | 15 | # 输入账号密码 16 | zhanghao = bro.find_element_by_id('u') 17 | zhanghao.send_keys('') 18 | pwd = bro.find_element_by_id('p') 19 | pwd.send_keys('') 20 | 21 | login = bro.find_element_by_id('login_button') 22 | login.click() 23 | 24 | sleep(5) 25 | bro.quit() 26 | -------------------------------------------------------------------------------- /自动化篇/selenium/动作链和iframe的处理.py: 
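The selenium scripts in this folder, including the iframe/ActionChains demo that follows, use the Selenium 3 style API (`executable_path=...`, `find_element_by_id(...)`). Those helpers were removed in Selenium 4.3+, so on a current install the equivalent calls look roughly like this sketch (the path, URL and element IDs are the ones used in the file below):

```python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Selenium 4 replacements for the removed Selenium 3 calls used in these scripts
driver = webdriver.Chrome(service=Service('chromedriver.exe'))  # instead of executable_path='chromedriver.exe'
driver.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

driver.switch_to.frame('iframeResult')         # unchanged between versions
div = driver.find_element(By.ID, 'draggable')  # instead of find_element_by_id('draggable')
```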
-------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from time import sleep 3 | from selenium.webdriver import ActionChains 4 | 5 | # 创建对象 6 | bro = webdriver.Chrome('chromedriver.exe') 7 | # 指定url 8 | bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable') 9 | 10 | # 想要定位的标签是存在于iframe之中,则必须通过如下操作再进行标签定位 11 | # div = bro.find_element_by_id('draggable') 错误的方法定位 12 | bro.switch_to.frame('iframeResult') # 切换到我们想要指定的iframe中 13 | div = bro.find_element_by_id('draggable') 14 | 15 | # 动作链 16 | action = ActionChains(bro) 17 | # 点击长按指定的标签 18 | action.click_and_hold(div) 19 | for i in range(5): 20 | # perform表示立即执行动作链操作 21 | action.move_by_offset(17, 0).perform() 22 | sleep(0.3) 23 | # 释放动作链 24 | action.release() 25 | 26 | # 退出浏览器 27 | sleep(5) 28 | bro.quit() 29 | -------------------------------------------------------------------------------- /自动化篇/selenium/谷歌无头浏览器+反检测.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver import ChromeOptions 3 | from time import sleep 4 | 5 | # 实现让selenium规避被检测到的风险 6 | option = ChromeOptions() 7 | option.add_experimental_option('excludeSwitches', ['enable-automation']) 8 | # 实现无可视化界面的操作 9 | option.add_argument('--headless') 10 | option.add_argument('--disable-gpu') 11 | 12 | bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option) 13 | 14 | # 无可视化界面(无头浏览器) phantomJs 15 | bro.get('https://www.baidu.com') 16 | print(bro.page_source) 17 | 18 | # 关闭浏览器 19 | sleep(5) 20 | bro.quit() 21 | 22 | -------------------------------------------------------------------------------- /进阶篇/js逆向/wasm/air/README.md: -------------------------------------------------------------------------------- 1 | # 某东航空 2 | 3 | 需要自己将滑块的html文档下载到本地,在acw文件中指定网页的路径,要绝对路径 4 | 5 | 之后按照acw的注释完成文件,即可 -------------------------------------------------------------------------------- /进阶篇/js逆向/wasm/air/acw_tc_3.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import re 4 | from urllib.parse import urlparse, parse_qs 5 | 6 | import playwright.sync_api 7 | import requests 8 | from playwright.sync_api import sync_playwright 9 | 10 | # 存放滑块的页面 11 | FILEPATH = '' 12 | 13 | # 拦截验证的路由,自己写一下url, 格式参照playwright官网 14 | INTERRUPT_ROUTE = '' 15 | 16 | # 指定谷歌浏览器路径,以debug模式打开,如果已经打开了debug,下面四行代码可以注释掉 17 | # chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"' 18 | # debugging_port = "--remote-debugging-port=9999" 19 | # 20 | # command = f"{chrome_path} {debugging_port}" 21 | # subprocess.Popen(command, shell=True) 22 | 23 | # 创建的ws链接 24 | WS_URL = 'http://localhost:your_port' 25 | 26 | headers = { 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36', 28 | } 29 | 30 | 31 | def replace_info(html: str): 32 | # 识别出requestInfo 33 | pattern = re.compile(r'requestInfo\s*=\s*\{.*?};', re.S) 34 | # 读取旧文件 35 | with open(FILEPATH, 'r', encoding='utf-8') as f: 36 | old_html = f.read() 37 | # 从新html中查找info, 如果有就做替换,没有就保留 38 | info = pattern.findall(html)[0] 39 | if info: 40 | new_html = pattern.sub(info, old_html) 41 | with open(FILEPATH, 'w', encoding='utf-8') as f: 42 | f.write(new_html) 43 | 44 | def get_226() -> dict: 45 | pattern = re.compile(r'\((.*)\)', re.S) 46 | result: dict = {} 47 | 48 | def intercept_xhr(route: playwright.sync_api.Route): 49 | params = 
parse_qs(urlparse(route.request.url).query) 50 | result['t'] = params['t'][0] 51 | resp = requests.get(url=route.request.url, headers=headers) 52 | data = json.loads(pattern.findall(resp.text)[0]) 53 | # 我们获取到了数据是不是应该返还给result 54 | print(data) 55 | route.abort() 56 | 57 | with sync_playwright() as p: 58 | # 使用强化脚本来过验证 59 | browser = p.chromium.launch( 60 | # headless=False, 61 | ) 62 | ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36' 63 | content = browser.new_context(user_agent=ua) 64 | content.add_init_script(path=r'D://crawlProjects/stealth.min.js') 65 | page = content.new_page() 66 | 67 | # # 创建一个ws链接 68 | # browser = p.chromium.connect_over_cdp(WS_URL) 69 | # # 使用浏览器的上下文创建页面 70 | # content = browser.contexts[0] 71 | # page = content.new_page() 72 | 73 | page.route(INTERRUPT_ROUTE, intercept_xhr) 74 | page.goto(FILEPATH) 75 | 76 | btn = page.locator('#nc_1_n1z') 77 | btn_position = btn.bounding_box() 78 | new_x = btn_position['x'] + random.randint(290, 310) 79 | new_y = btn_position['y'] 80 | page.mouse.click(btn_position['x'], btn_position['y']) 81 | 82 | page.mouse.down() 83 | page.mouse.move(new_x, new_y) 84 | page.mouse.up() 85 | 86 | page.close() 87 | content.close() 88 | browser.close() 89 | 90 | return result 91 | 92 | -------------------------------------------------------------------------------- /进阶篇/js逆向/wasm/air/ddd.js: -------------------------------------------------------------------------------- 1 | // 由于某些不可抗力此文件分享到百度网盘领取 2 | // 链接:https://pan.baidu.com/s/1DMzG2h0kwnWepxfzhQWBqA?pwd=jrsg 3 | // 提取码:jrsg 4 | // --来自百度网盘超级会员V3的分享 -------------------------------------------------------------------------------- /进阶篇/js逆向/webPack/五矿/crwalBase.py: -------------------------------------------------------------------------------- 1 | import json 2 | from random import uniform 3 | from time import sleep 4 | from typing import Union, Generator, Literal 5 | 6 | from curl_cffi import requests 7 | from ddddocr import DdddOcr 8 | from execjs import compile 9 | 10 | Method = Literal['get', 'post', 'POST', 'GET'] 11 | 12 | 13 | class Crawler: 14 | # 设置请求session 15 | session = requests.Session() 16 | # 请求方式 17 | methodProcessors = { 18 | 'get': requests.get, 19 | 'post': requests.post 20 | } 21 | sessionProcessors = { 22 | 'get': session.get, 23 | 'post': session.post 24 | } 25 | # 验证码识别 26 | ocr = DdddOcr() 27 | 28 | def ajax_requests( 29 | self, url: str, 30 | method: Method, 31 | params: dict = None, 32 | jsonData: dict = None, 33 | retryTimes: int = 10, 34 | timeOut: int = 20, 35 | headers: dict = None, 36 | isSession: bool = False, 37 | cookies: dict = None, 38 | ) -> requests.Response: 39 | methodProcessor = self.methodProcessors[method] if not isSession else self.sessionProcessors[method] 40 | for _ in range(retryTimes): 41 | try: 42 | response = methodProcessor( 43 | url=url, 44 | headers=headers, 45 | cookies=cookies, 46 | params=params, 47 | data=json.dumps(jsonData, ensure_ascii=False, separators=(',', ':')), 48 | json=jsonData, 49 | timeout=timeOut 50 | ) 51 | return response 52 | except Exception as e: 53 | sleep(uniform(5, 10)) 54 | print( 55 | f"错误链接: {url}", 56 | f"请求出现错误, 正在重试: {_}/{retryTimes}", 57 | f"错误信息为: {e}", 58 | sep='\n' 59 | ) 60 | else: 61 | raise '重试5次后仍然无法获取数据,可能是加密参数错误或者ip风控' 62 | 63 | def get_code(self, url: str, params: dict = None, jsonData: dict = None) -> str: 64 | imgBytes = self.ajax_requests( 65 | url=url, 66 | method='get', 67 | jsonData=jsonData, 68 | params=params 69 | 
).content 70 | return self.ocr.classification(imgBytes) 71 | 72 | @staticmethod 73 | def open_js(path: str): 74 | return compile(open(path, 'r', encoding='utf-8').read()) 75 | 76 | # 用于检查传入的键值是否正确 77 | @staticmethod 78 | def check_key(dic: dict, key: str) -> Union[str, int, list, dict]: 79 | if key not in dic: 80 | raise NameError(f'错误的初始化键值, key = {key}') 81 | return dic[key] 82 | 83 | # 在字典中搜索关键字,返回信息,可以搜索到字典中所有匹配的关键字 84 | @staticmethod 85 | def search_dict(items: dict, search_key: str) -> Generator: 86 | stack = [items] 87 | while stack: 88 | current_item = stack.pop() 89 | if isinstance(current_item, dict): 90 | for key, value in current_item: 91 | if search_key == key: 92 | yield value 93 | else: 94 | stack.append(value) 95 | elif isinstance(current_item, list): 96 | for value in current_item: 97 | stack.append(value) 98 | -------------------------------------------------------------------------------- /进阶篇/js逆向/webPack/五矿/demo.js: -------------------------------------------------------------------------------- 1 | const crypto = require('crypto'); 2 | 3 | function w(hexString) { 4 | const buffer = Buffer.from(hexString, 'hex'); 5 | return buffer.toString('base64'); 6 | } 7 | 8 | function md5(data) { 9 | return crypto.createHash('md5').update(data).digest('hex'); 10 | } 11 | 12 | function rsa(data, key) { 13 | const publicKey = `-----BEGIN PUBLIC KEY-----\n${key}\n-----END PUBLIC KEY-----`; 14 | const buffer = Buffer.from(data, 'utf-8'); 15 | const publicKeyBuffer = Buffer.from(publicKey, 'utf-8'); 16 | const encryptedData = crypto.publicEncrypt({ 17 | key: publicKeyBuffer, 18 | padding: crypto.constants.RSA_PKCS1_PADDING 19 | }, buffer); 20 | return encryptedData.toString('hex') 21 | } 22 | 23 | 24 | function getParams(data, key) { 25 | let a = JSON.stringify({ 26 | ...data, 27 | ...{ 28 | sign: md5(JSON.stringify(data)), 29 | timeStamp: +new Date 30 | } 31 | }) 32 | var r = ''; 33 | n = a.match(/.{1,50}/g); 34 | n.forEach((function (A) { 35 | var t = rsa(A, key); 36 | r += t 37 | })) 38 | return w(r) 39 | } 40 | 41 | -------------------------------------------------------------------------------- /进阶篇/js逆向/webPack/五矿/encode.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import binascii 3 | import hashlib 4 | import json 5 | import time 6 | from Crypto.PublicKey import RSA 7 | from Crypto.Cipher import PKCS1_v1_5 8 | 9 | 10 | def w(hex_string): 11 | byte_data = binascii.unhexlify(hex_string) # 将十六进制字符串转换为字节数组 12 | base64_data = base64.b64encode(byte_data) # 将字节数组转换为Base64编码 13 | return base64_data.decode('utf-8') 14 | 15 | 16 | def md5(data): 17 | return hashlib.md5(data.encode('utf-8')).hexdigest() 18 | 19 | 20 | def rsa(plaintext, key): 21 | publicKey = f'-----BEGIN PUBLIC KEY-----\n{key}\n-----END PUBLIC KEY-----' 22 | public_key = RSA.import_key(publicKey) 23 | cipher_rsa = PKCS1_v1_5.new(public_key) 24 | return cipher_rsa.encrypt(plaintext.encode('utf-8')).hex() 25 | 26 | 27 | def getParams(data, key): 28 | a = json.dumps({ 29 | **data, 30 | **{ 31 | 'sign': md5(json.dumps(data, separators=(',', ':'), ensure_ascii=False)), 32 | 'timeStamp': int(time.time() * 1000) 33 | } 34 | }, ensure_ascii=False, separators=(',', ':')) 35 | n = [rsa(a[i:i + 50], key) for i in range(0, len(a), 50)] 36 | return w(''.join(n)) 37 | 38 | -------------------------------------------------------------------------------- /进阶篇/js逆向/浏览器指纹检测/易九批/test.py: -------------------------------------------------------------------------------- 1 | import 
json 2 | import time 3 | 4 | import execjs 5 | from curl_cffi import requests 6 | 7 | URLS = [ 8 | 'https://www.yijiupi.com/v54/ProductCategory/ListCategoryTree', 9 | 'https://www.yijiupi.com/v54/PurchaseChannel/List', 10 | 'https://www.yijiupi.com/v54/ProductCategory/ListProductCategory' 11 | ] 12 | 13 | 14 | def get_data(json_data, url, sepUrl): 15 | timestamp = str(int(time.time())) 16 | headers = { 17 | 'Content-Type': 'application/json', 18 | 'token': '', 19 | } 20 | # 问题的关键是把中文好好处理!! 21 | data = json.dumps(json_data, ensure_ascii=False, separators=(',', ':')) 22 | x_ = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()) \ 23 | .call('setHeader', 'POST', sepUrl, data, timestamp) 24 | headers.update(x_) 25 | response = requests.post(url, headers=headers, 26 | data=data, impersonate='chrome110') 27 | print(response.json()) 28 | 29 | 30 | if __name__ == '__main__': 31 | set1 = { 32 | 'json_data': { 33 | 'data': { 34 | 'zoneId': '4932265882383941446', 35 | }, 36 | 'cityId': '701', 37 | 'userClassId': 1, 38 | 'userDisplayClass': 0, 39 | 'addressId': '', 40 | 'deviceType': 3, 41 | }, 42 | 'url': URLS[0], 43 | 'sepUrl': '/v54/ProductCategory/ListCategoryTree' 44 | } 45 | # get_data(**set1) 46 | set2 = { 47 | 'json_data': { 48 | 'cityId': '701', 49 | 'userClassId': 1, 50 | 'userDisplayClass': 0, 51 | 'addressId': '', 52 | 'deviceType': 3, 53 | }, 54 | 'url': URLS[1], 55 | 'sepUrl': '/v54/PurchaseChannel/List' 56 | } 57 | # get_data(**set2) 58 | set3 = { 59 | 'json_data': { 60 | 'data': { 61 | 'sonCategoryId': '', 62 | 'brandId': '', 63 | 'firstCategoryId': '', 64 | 'searchKey': '国台国酱', 65 | 'specialAreaId': '', 66 | 'categoryIds': [], 67 | 'brandIds': [], 68 | 'labelId': None, 69 | 'isAscending': '', 70 | 'searchModes': [ 71 | 2, 72 | ], 73 | 'sort': 0, 74 | 'shopId': '', 75 | 'currentPage': 1, 76 | 'pageSize': 60, 77 | 'filterSpecialArea': False, 78 | 'searchSource': 1, 79 | 'warehouseIds': [], 80 | 'searchKeyNotCorrect': False, 81 | 'couponTemplateId': '', 82 | 'channelId': '', 83 | }, 84 | 'cityId': '701', 85 | 'userClassId': 1, 86 | 'userDisplayClass': 0, 87 | 'addressId': '', 88 | 'deviceType': 3, 89 | }, 90 | 'url': URLS[2], 91 | 'sepUrl': '/v54/ProductCategory/ListProductCategory' 92 | } 93 | get_data(**set3) 94 | -------------------------------------------------------------------------------- /进阶篇/js逆向/环境检测/BossJob/chaojiying.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding:utf-8 3 | 4 | from hashlib import md5 5 | 6 | import requests 7 | 8 | 9 | class Chaojiying_Client(object): 10 | 11 | def __init__(self, username, password, soft_id): 12 | self.username = username 13 | password = password.encode('utf8') 14 | self.password = md5(password).hexdigest() 15 | self.soft_id = soft_id 16 | self.base_params = { 17 | 'user': self.username, 18 | 'pass2': self.password, 19 | 'softid': self.soft_id, 20 | } 21 | self.headers = { 22 | 'Connection': 'Keep-Alive', 23 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)', 24 | } 25 | 26 | def PostPic(self, im, codetype): 27 | """ 28 | im: 图片字节 29 | codetype: 题目类型 参考 http://www.chaojiying.com/price.html 30 | """ 31 | params = { 32 | 'codetype': codetype, 33 | } 34 | params.update(self.base_params) 35 | files = {'userfile': ('ccc.jpg', im)} 36 | r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, 37 | headers=self.headers) 38 | return r.json() 39 | 40 | def PostPic_base64(self, base64_str, 
codetype): 41 | """ 42 | im: 图片字节 43 | codetype: 题目类型 参考 http://www.chaojiying.com/price.html 44 | """ 45 | params = { 46 | 'codetype': codetype, 47 | 'file_base64': base64_str 48 | } 49 | params.update(self.base_params) 50 | r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, headers=self.headers) 51 | return r.json() 52 | 53 | def ReportError(self, im_id): 54 | """ 55 | im_id:报错题目的图片ID 56 | """ 57 | params = { 58 | 'id': im_id, 59 | } 60 | params.update(self.base_params) 61 | r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers) 62 | return r.json() 63 | 64 | 65 | if __name__ == '__main__': 66 | chaojiying = Chaojiying_Client('******', '******', '96001') # 用户中心>>软件ID 生成一个替换 96001 67 | im = open('a.jpg', 'rb').read() # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要// 68 | print(chaojiying.PostPic(im, 1902)) # 1902 验证码类型 69 | -------------------------------------------------------------------------------- /进阶篇/js逆向/环境检测/RedBook/README.md: -------------------------------------------------------------------------------- 1 | # 使用 2 | ## 补环境的完整版 3 | > 见RedBook.py文件。使用前需要修改new/jssss.js文件的localstorage以及cookie 4 | 5 | ## 无需补环境的部分ios端api 6 | > 见demo.py文件,基本上不需要cookie,需要cookie的函数我列出来了 7 | > 其中只有一个接口需要逆向一个x-sign参数,这个参数很好逆向 -------------------------------------------------------------------------------- /进阶篇/js逆向/环境检测/pdd/demo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | 4 | anti_content = execjs.compile(open('hello.js', 'r', encoding='utf-8').read()).call('dt') 5 | 6 | headers = { 7 | 'Accept': 'application/json, text/javascript', 8 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 10 | } 11 | 12 | params = { 13 | 'tf_id': 'TFRQ0v00000Y_13394', 14 | 'page': '1', 15 | 'size': '100', 16 | 'anti_content': anti_content 17 | } 18 | 19 | response = requests.get('https://apiv2.pinduoduo.com/api/gindex/tf/query_tf_goods_info', params=params, headers=headers) 20 | print(response.text) 21 | -------------------------------------------------------------------------------- /进阶篇/js逆向/环境检测/猿人学2023第一题/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | 4 | cookies = { 5 | } 6 | 7 | headers = { 8 | 'authority': 'match2023.yuanrenxue.cn', 9 | 'accept': 'application/json, text/javascript, */*; q=0.01', 10 | 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', 11 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36', 12 | } 13 | value = 0 14 | for page in range(1, 6): 15 | token = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('solve', page) 16 | response = requests.post('https://match2023.yuanrenxue.cn/api/match2023/1', cookies=cookies, headers=headers, 17 | data=token) 18 | data = response.json()['data'] 19 | for v in data: 20 | value += v['value'] 21 | print(value) 22 | -------------------------------------------------------------------------------- /进阶篇/js逆向/环境检测/饿了么/hello.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/fjs/demo.js: 
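The demo.js that follows is a plain AES-CBC/PKCS7 decrypt with a hard-coded key and IV, and fjs.py runs it through execjs. A rough pure-Python equivalent with pycryptodome is sketched below (an assumption-laden sketch, not part of the original project; it assumes the `Data` field really is a Base64 string, which is what crypto-js expects here):

```python
import base64

from Crypto.Cipher import AES          # pycryptodome, already used elsewhere in this repo
from Crypto.Util.Padding import unpad

KEY = b'EB444973714E4A40876CE66BE45D5930'  # Utf8.parse(c) -> the literal ASCII bytes (AES-256)
IV = b'B5A8904209931867'                   # Utf8.parse(b)


def decrypt(ciphertext_b64: str) -> str:
    cipher = AES.new(KEY, AES.MODE_CBC, IV)
    plain = unpad(cipher.decrypt(base64.b64decode(ciphertext_b64)), AES.block_size)
    return plain.decode('utf-8')
```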
-------------------------------------------------------------------------------- 1 | const Crypto = require('crypto-js') 2 | 3 | var c = 'EB444973714E4A40876CE66BE45D5930' 4 | var b = 'B5A8904209931867' 5 | function decrypt(t) { 6 | var e = Crypto.enc.Utf8.parse(c) 7 | , n = Crypto.enc.Utf8.parse(b) 8 | , a = Crypto.AES.decrypt(t, e, { 9 | iv: n, 10 | mode: Crypto.mode.CBC, 11 | padding: Crypto.pad.Pkcs7 12 | }); 13 | return a.toString(Crypto.enc.Utf8) 14 | } 15 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/fjs/fjs.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | pageNum = 1 4 | # 控制请求的页数 5 | while pageNum < 2: 6 | # 准备js逆向出请求头和表单签名 7 | ts = int(execjs.compile(open('sign.js', 'r', encoding='utf-8').read()).call('ts')) 8 | json_data = { 9 | 'pageNo': pageNum, 10 | 'pageSize': 40, 11 | 'total': 5770, 12 | 'AREACODE': '', 13 | 'M_PROJECT_TYPE': '', 14 | 'KIND': 'GCJS', 15 | 'GGTYPE': '1', 16 | 'PROTYPE': '', 17 | 'timeType': '6', 18 | 'BeginTime': '2022-07-18 00:00:00', 19 | 'EndTime': '2023-01-18 23:59:59', 20 | 'createTime': [], 21 | 'ts': ts, 22 | } 23 | sign = str(execjs.compile(open('sign.js', 'r', encoding='utf-8').read()).call('sign', json_data)) 24 | headers = { 25 | 'Accept': 'application/json, text/plain, */*', 26 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 27 | 'Connection': 'keep-alive', 28 | 'Content-Type': 'application/json;charset=UTF-8', 29 | 'Origin': 'https://ggzyfw.fujian.gov.cn', 30 | 'Referer': 'https://ggzyfw.fujian.gov.cn/business/list/', 31 | 'Sec-Fetch-Dest': 'empty', 32 | 'Sec-Fetch-Mode': 'cors', 33 | 'Sec-Fetch-Site': 'same-origin', 34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', 35 | 'portal-sign': sign, 36 | 'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"', 37 | 'sec-ch-ua-mobile': '?0', 38 | 'sec-ch-ua-platform': '"Windows"', 39 | } 40 | 41 | # 发起请求 42 | response = requests.post('https://ggzyfw.fujian.gov.cn/FwPortalApi/Trade/TradeInfo', headers=headers, json=json_data).json() 43 | data = response['Data'] 44 | 45 | # 解密文件 46 | ctx = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('decrypt', data) 47 | print(ctx) 48 | pageNum += 1 49 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/fjs/sign.js: -------------------------------------------------------------------------------- 1 | const Crypto = require('crypto-js') 2 | 3 | var d = "B3978D054A72A7002063637CCDF6B2E5" 4 | 5 | function sign(t) { 6 | for (var e in t) 7 | "" !== t[e] && void 0 !== t[e] || delete t[e]; 8 | var n = d + l(t); 9 | return s(n) 10 | } 11 | function s(e) { 12 | return md5(e) 13 | } 14 | 15 | function l(t) { 16 | for (var e = Object.keys(t).sort(u), n = "", a = 0; a < e.length; a++) 17 | if (void 0 !== t[e[a]]) 18 | if (t[e[a]] && t[e[a]]instanceof Object || t[e[a]]instanceof Array) { 19 | var i = JSON.stringify(t[e[a]]); 20 | n += e[a] + i 21 | } else 22 | n += e[a] + t[e[a]]; 23 | return n 24 | } 25 | 26 | // 创建标准md5算法 27 | function md5(text){ 28 | return Crypto.MD5(text).toString() 29 | } 30 | function u(t, e) { 31 | return t.toString().toUpperCase() > e.toString().toUpperCase() ? 1 : t.toString().toUpperCase() == e.toString().toUpperCase() ? 
0 : -1 32 | } 33 | 34 | // 测试数据 35 | data = { 36 | 'pageNo': 1, 37 | 'pageSize': 20, 38 | 'total': 0, 39 | 'AREACODE': '', 40 | 'M_PROJECT_TYPE': '', 41 | 'KIND': 'GCJS', 42 | 'GGTYPE': '1', 43 | 'PROTYPE': '', 44 | 'timeType': '6', 45 | 'BeginTime': '2022-07-18 00:00:00', 46 | 'EndTime': '2023-01-18 23:59:59', 47 | 'createTime': [], 48 | 'ts': ts(), 49 | } 50 | 51 | // 生成时间戳 52 | function ts(){ 53 | return (new Date).getTime() 54 | } 55 | 56 | console.log(ts()) 57 | console.log(sign(data)) 58 | 59 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/football/599_info.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | import time 4 | 5 | headers = { 6 | 'authority': 'api.599.com', 7 | 'accept': 'application/json, text/plain, */*', 8 | 'accept-language': 'zh-CN,zh;q=0.9', 9 | 'cache-control': 'no-cache', 10 | 'origin': 'https://599.com', 11 | 'pragma': 'no-cache', 12 | 'referer': 'https://599.com/', 13 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"', 14 | 'sec-ch-ua-mobile': '?0', 15 | 'sec-ch-ua-platform': '"Windows"', 16 | 'sec-fetch-dest': 'empty', 17 | 'sec-fetch-mode': 'cors', 18 | 'sec-fetch-site': 'same-site', 19 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', 20 | } 21 | ts = int(time.time() * 1000) 22 | pre_params = { 23 | "appType": "3", 24 | "channelNumber": "GF1001", 25 | "comId": "8", 26 | "lang": "zh", 27 | "platform": "pc", 28 | "st": ts, 29 | "timeZone": "8", 30 | "version": "671", 31 | "versionCode": "671" 32 | } 33 | sign = execjs.compile(open('js/sss.js', 'r', encoding='utf-8').read()).call('Z', '/footballapi/core/matchlist/v2/immediate', pre_params) 34 | params = { 35 | 'comId': '8', 36 | 'lang': 'zh', 37 | 'timeZone': '8', 38 | 'version': '671', 39 | 'versionCode': '671', 40 | 'channelNumber': 'GF1001', 41 | 'platform': 'pc', 42 | 'appType': '3', 43 | 'st': str(ts), 44 | 'sign': sign, 45 | } 46 | response = requests.get('https://api.599.com/footballapi/core/matchlist/v2/immediate', params=params, headers=headers) 47 | 48 | data = response.json()['data'] 49 | ctx = execjs.compile(open('js/demo.js', 'r', encoding='utf-8').read()).call('decrypt', data) 50 | print(ctx) 51 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/football/js/sss.js: -------------------------------------------------------------------------------- 1 | const crypto = require('crypto-js') 2 | 3 | function md5(text){ 4 | text = String(text) 5 | return crypto.MD5(text).toString() 6 | } 7 | 8 | var e = '/footballapi/core/matchlist/v2/immediate' 9 | var t = { 10 | "appType": "3", 11 | "channelNumber": "GF1001", 12 | "comId": "8", 13 | "lang": "zh", 14 | "platform": "pc", 15 | "st": 1678167676726, 16 | "timeZone": "8", 17 | "version": "671", 18 | "versionCode": "671" 19 | } 20 | 21 | function l() { 22 | return e 23 | } 24 | function Z(e, t) { 25 | var n = {} 26 | , o = e; 27 | for (var r in Object.keys(t).sort().map((function(e) { 28 | n[e] = t[e] 29 | } 30 | )), 31 | n) 32 | o = o + r + n[r]; 33 | return o += md5("wjj"), 34 | md5(o).toLowerCase() + "99" 35 | } 36 | 37 | console.log(Z(e, t)); -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/唯一艺术/demo.js: -------------------------------------------------------------------------------- 1 | const Crypto = 
require('crypto-js') 2 | 3 | var data = 'truiLeKm7AKyuie+33QCYVOB58uNUU9k+FEIeXVsr/ztKrMa9ytcHn11hxFo6XLAe2ye5nNmVQAAZ3zKiCcZZoPPcUBuypN/3xXg6+l98m38zldv8b2wlIVuy24U1PxbPFKGrQEbJTTwnoujMCcaeZfiOdyyjSMX24EXL8o244bbHdJm6UWRWxMux1ICO9tBg10IQxFo+j8/Cc3jAdGAlg==' 4 | 5 | window = { 6 | deciphering: function (t){ 7 | { 8 | e = "4tBlCLWFZ3eD93CvDE2lpw==" || 32; 9 | var o = "ABCDEFGHJKMNPQRSTWXYZabcdefhijkmnprstwxyz2345678" 10 | , r = o.length; 11 | for (let t = 0; t < e; t++) 12 | o.charAt(Math.floor(Math.random() * r)); 13 | return t 14 | } 15 | } 16 | } 17 | 18 | function encryptSelf(t, o) { 19 | var r = Crypto.enc.Base64.parse("4tBlCLWFZ3eD93CvDE2lpw=="); 20 | let i = JSON.stringify({ 21 | id: t.substr(0, t.length - 1), 22 | sum: o 23 | }); 24 | var s = Crypto.enc.Utf8.parse(i); 25 | return Crypto.AES.encrypt(s, r, { 26 | mode: Crypto.mode.ECB, 27 | padding: Crypto.pad.Pkcs7 28 | }).toString() 29 | } 30 | function decrypt(t) { 31 | var e = Crypto.enc.Base64.parse("5opkytHOggKj5utjZOgszg==") 32 | var o = Crypto.AES.decrypt(t, e, { 33 | mode: Crypto.mode.ECB, 34 | padding: Crypto.pad.Pkcs7 35 | }); 36 | return Crypto.enc.Utf8.stringify(o).toString() 37 | } 38 | 39 | function getSign(data){ 40 | let dataresult = decrypt(data) 41 | , dataResultFun = dataresult.split(",")[0].substr(4) 42 | , dataResultId = dataresult.split(",")[1].split("=")[1] 43 | , sigresult = eval(dataResultFun); 44 | console.log(sigresult) 45 | return encryptSelf(dataResultId, sigresult) 46 | // return sigresult 47 | } 48 | 49 | function design(data){ 50 | return encodeURIComponent(data) 51 | } 52 | 53 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/唯一艺术/test.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | 4 | 5 | def getSign(): 6 | url = 'https://api.theone.art/market/api/key/get' 7 | headers = { 8 | 'Accept': 'application/json, text/plain, */*', 9 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 10 | 'Connection': 'keep-alive', 11 | 'Origin': 'https://www.theone.art', 12 | 'Referer': 'https://www.theone.art/', 13 | 'Sec-Fetch-Dest': 'empty', 14 | 'Sec-Fetch-Mode': 'cors', 15 | 'Sec-Fetch-Site': 'same-site', 16 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 17 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"', 18 | 'sec-ch-ua-mobile': '?0', 19 | 'sec-ch-ua-platform': '"Windows"', 20 | } 21 | res = requests.get(url=url, headers=headers).json() 22 | data = str(res['data']) 23 | sign = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('getSign', data) 24 | return sign 25 | 26 | 27 | headers = { 28 | 'Accept': 'application/json, text/plain, */*', 29 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 30 | 'Connection': 'keep-alive', 31 | 'Content-Type': 'application/json;charset=UTF-8', 32 | 'Origin': 'https://www.theone.art', 33 | 'Referer': 'https://www.theone.art/', 34 | 'Sec-Fetch-Dest': 'empty', 35 | 'Sec-Fetch-Mode': 'cors', 36 | 'Sec-Fetch-Site': 'same-site', 37 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 38 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"', 39 | 'sec-ch-ua-mobile': '?0', 40 | 'sec-ch-ua-platform': '"Windows"', 41 | 'sig': 
'8hJWPRjfS7l%2Fj86OrejRjAZDLiwIzZfQcKKIuEWB3154u4wv3WeQIv2pV3nzAo3HnXEoW0t6Tmxp9nRUjnrGtA%3D%3D', 42 | } 43 | 44 | for pageNum in range(1, 20): 45 | sign = getSign() 46 | hsign = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('design', sign) 47 | json_data = { 48 | 'authorId': None, 49 | 'chainContract': None, 50 | 'commodityCategoryId': None, 51 | 'commodityCategoryIdList': [], 52 | 'commodityId': None, 53 | 'highPrice': '', 54 | 'lowPrice': '', 55 | 'pageCount': pageNum, 56 | 'pageSize': 20, 57 | 'seriesWorks': None, 58 | 'seriesWorksId': None, 59 | 'sort': { 60 | 'field': 2, 61 | 'upOrDown': 1, 62 | }, 63 | 'statusSell': 1, 64 | 'topicId': None, 65 | 'typeMarket': 1, 66 | 'commodityTraitList': [], 67 | 'sig': sign, 68 | } 69 | response = requests.post('https://api.theone.art/market/api/saleRecord/list/v2', headers=headers, json=json_data) 70 | res = response.json()["data"] 71 | print(res) 72 | 73 | 74 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/娱乐指数/ylzs.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | 4 | cookies = { 5 | 'mobile_iindex_uuid': '9f0ae384-2821-5797-8a76-87bb1cef4a1f', 6 | 'Hm_lvt_2873e2b0bdd5404c734992cd3ae7253f': '1674101222,1674103567', 7 | 'Hm_lpvt_2873e2b0bdd5404c734992cd3ae7253f': '1674103567', 8 | } 9 | 10 | headers = { 11 | 'authority': 'www.chinaindex.net', 12 | 'accept': 'application/json, text/plain, */*', 13 | 'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 14 | # 'cookie': 'mobile_iindex_uuid=9f0ae384-2821-5797-8a76-87bb1cef4a1f; Hm_lvt_2873e2b0bdd5404c734992cd3ae7253f=1674101222,1674103567; Hm_lpvt_2873e2b0bdd5404c734992cd3ae7253f=1674103567', 15 | 'funcid': 'undefined', 16 | 'incognitomode': '0', 17 | 'referer': 'https://www.chinaindex.net/ranklist/5/0', 18 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"', 19 | 'sec-ch-ua-mobile': '?0', 20 | 'sec-ch-ua-platform': '"Windows"', 21 | 'sec-fetch-dest': 'empty', 22 | 'sec-fetch-mode': 'cors', 23 | 'sec-fetch-site': 'same-origin', 24 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 25 | 'uuid': '9f0ae384-2821-5797-8a76-87bb1cef4a1f', 26 | } 27 | 28 | params = { 29 | 'keyWord': '李知恩', 30 | 'sign': 'b3776cdf6331ee0f6653d1de544291c3' 31 | } 32 | 33 | response = requests.get( 34 | 'https://www.chinaindex.net/iIndexMobileServer/mobile/comm/getSearchResult', 35 | params=params, 36 | cookies=cookies, 37 | headers=headers, 38 | ) 39 | 40 | r = response.json()['data'] 41 | lastFetchTime = response.json()['lastFetchTime'] 42 | 43 | ctx = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('decrypt', r, lastFetchTime) 44 | 45 | print(ctx) -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/广东省公共资源交易/demo.js: -------------------------------------------------------------------------------- 1 | const crypto = require('crypto') 2 | const Py = "zxcvbnmlkjhgfdsaqwertyuiop0987654321QWERTYUIOPLKJHGFDSAZXCVBNM" , jq = Py + "-@#$%^&*+!"; 3 | function Nonce(e) { 4 | return [...Array(e)].map(()=>Py[Vq(0, 61)]).join("") 5 | } 6 | function Vq(e, t) { 7 | switch (arguments.length) { 8 | case 1: 9 | return parseInt(Math.random() * e + 1, 10); 10 | case 2: 11 | return parseInt(Math.random() * (t - e + 1) + e, 10); 12 | default: 13 | return 0 14 | } 15 | } 16 | function lr(e=[]) { 17 | return 
e.map(t=>jq[t]).join("") 18 | } 19 | function Rg(e={}) { 20 | const {p: t, t: n, n: u, k: o} = e 21 | , r = zq(t); 22 | console.log(r) 23 | const hash = crypto.createHash('sha256') 24 | return hash.update(u + o + decodeURIComponent(r) + n).digest('hex') 25 | } 26 | function zq(e) { 27 | let t = ""; 28 | return typeof e == "object" ? t = Object.keys(e).map(n=>`${n}=${e[n]}`).sort().join("&") : typeof e == "string" && (t = e.split("&").sort().join("&")), 29 | t 30 | } 31 | function hash256(datas){ 32 | let c = lr([8, 28, 20, 42, 21, 53, 65, 6]) 33 | a = Date.now() 34 | let l = Nonce(16) 35 | let Signature = Rg({ 36 | p: JSON.stringify(datas).replace(/:/g, "=").replace(/["{}]/g, '').replace(/,/g, '&'), 37 | t: a, 38 | n: l, 39 | k: c 40 | }) 41 | text = { 42 | App: lr([11, 11, 0, 21, 62, 25, 24, 19, 20, 15, 7]), 43 | Nonce: l, 44 | Signature: Signature, 45 | Timestamp: a, 46 | } 47 | return text 48 | } 49 | 50 | data = { 51 | 'type': "trading-type", 52 | "publishStartTime": "", 53 | "publishEndTime": "", 54 | "siteCode": "44", 55 | "secondType": "A", 56 | "projectType": "", 57 | "thirdType": "", 58 | "dateType": "", 59 | "total": 189352, 60 | "pageNo": 5, 61 | "pageSize": 10, 62 | "openConvert": false 63 | } 64 | 65 | console.log(hash256(data)) -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/广东省公共资源交易/guang.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | 4 | cookies = { 5 | '_horizon_uid': 'd6a5d5ea-b057-4431-8d41-982f8bf12b08', 6 | '_horizon_sid': 'e2c9e3b6-2ee8-49e6-a54a-0a15a39ee1b7', 7 | } 8 | 9 | 10 | def fun(page): 11 | json_data = { 12 | 'type': 'trading-type', 13 | 'publishStartTime': '', 14 | 'publishEndTime': '', 15 | 'siteCode': '44', 16 | 'secondType': 'A', 17 | 'projectType': '', 18 | 'thirdType': '', 19 | 'dateType': '', 20 | 'total': 189836, 21 | 'pageNo': page, 22 | 'pageSize': 10, 23 | 'openConvert': False, 24 | } 25 | 26 | data = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('hash256', json_data) 27 | 28 | headers = { 29 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', 30 | 'X-Dgi-Req-App': data['App'], 31 | 'X-Dgi-Req-Nonce': data['Nonce'], 32 | 'X-Dgi-Req-Signature': data['Signature'], 33 | 'X-Dgi-Req-Timestamp': str(data['Timestamp']), 34 | } 35 | 36 | response = requests.post('https://ygp.gdzwfw.gov.cn/ggzy-portal/search/v1/items', cookies=cookies, headers=headers, 37 | json=json_data) 38 | print(response.json()) 39 | 40 | 41 | if __name__ == '__main__': 42 | for page in range(1, 5): 43 | fun(page) 44 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/有道翻译/demo.js: -------------------------------------------------------------------------------- 1 | // const Crypto = require('crypto-js逆向') 2 | // 3 | // 4 | // var data = 'Z21kD9ZK1ke6ugku2ccWuz4Ip5f4PLCoxWstZf_6UUyBoy8dpWc3NOXFRrnPMya7chcEL7e2Yz1xjFqcfdncOW4vOoJ66RTmRa8-dGZla_ExpWOUP0G1QJFtJ6Gj0ngir07R0ETWttaGO185v5rccLlZKqOCmJuChZSA-Dw9U6B2AOK4-RqYjAQEQ5vF7ph71eC5ZEvV6dm_xv0ywEOKi58R9xWx7fiJytxxlsz-oprAHdRXnI6kWszLLJJpr45DMBjoeArZfVssgWXzX_IlNUvTtj_1o95BpERVvV1FxGEeN-_TLgLaK9j7rjT4O-yPHpbuCk9q1BpLVSh3B4CPWCZPMIHwJiFtfQAC8_t-HWs45DWbW54DEny_doBItZ6v' 5 | // var key = 'ydsecret://query/key/B*RGygVywfNBwpmBaZg*WT7SIOUP2T0C9WHMZN39j^DAdaZhAnxvGcCY6VYFwnHl' 6 | // var iv = 
'ydsecret://query/iv/C@lZe2YzHtZ2CYgaXKSVfsb7Y4QWHjITPPZ0nQp87fBeJ!Iv6v^6fvi2WN@bYpJ4' 7 | // 8 | // var ax = [8, 20, 157, 167, 60, 89, 206, 98, 85, 91, 1, 233, 47, 52, 232, 56] 9 | // var b = [210, 187, 27, 253, 232, 59, 56, 195, 68, 54, 99, 87, 183, 156, 174, 28] 10 | 11 | let data01 = '08149da73c59ce62555b01e92f34e838'//十六进制 12 | 13 | let newdata = Buffer.from(data01,'hex');//先把数据存在buf里面 14 | 15 | console.log("newdata ",newdata); 16 | 17 | console.log(newdata.toString("utf-8"));//使用toString函数就能转换成字符串 18 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/有道翻译/youdao.py: -------------------------------------------------------------------------------- 1 | import json 2 | from Crypto.Cipher import AES 3 | import base64 4 | import time 5 | from hashlib import md5 6 | import requests 7 | 8 | 9 | def sign(): 10 | t = int(time.time() * 1000) 11 | n = f'client=fanyideskweb&mysticTime={t}&product=webfanyi&key=fsdsogkndfokasodnaso' 12 | obj = md5() 13 | obj.update(n.encode('utf-8')) 14 | sign = obj.hexdigest() 15 | return sign 16 | 17 | 18 | def decrypto(data): 19 | key = b'\x08\x14\x9d\xa7\x3c\x59\xce\x62\x55\x5b\x01\xe9\x2f\x34\xe8\x38' 20 | iv = b'\xd2\xbb\x1b\xfd\xe8\x3b\x38\xc3\x44\x36\x63\x57\xb7\x9c\xae\x1c' 21 | aes = AES.new(key, AES.MODE_CBC, iv) 22 | den_text = aes.decrypt(base64.urlsafe_b64decode(data)) 23 | return str(den_text, 'utf-8').strip() 24 | 25 | 26 | def post(w, f, t): 27 | cookies = { 28 | 'OUTFOX_SEARCH_USER_ID': '123456789@192.168.60.5', 29 | } 30 | headers = { 31 | 'Accept': 'application/json, text/plain, */*', 32 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 33 | 'Connection': 'keep-alive', 34 | # 'Cookie': 'OUTFOX_SEARCH_USER_ID_NCOO=340028215.7799288; OUTFOX_SEARCH_USER_ID=-1551186736@49.52.96.107; P_INFO=18608219667|1670406132|1|youdaonote|00&99|null&null&null#shh&null#10#0|&0||18608219667', 35 | 'Origin': 'https://fanyi.youdao.com', 36 | 'Referer': 'https://fanyi.youdao.com/', 37 | 'Sec-Fetch-Dest': 'empty', 38 | 'Sec-Fetch-Mode': 'cors', 39 | 'Sec-Fetch-Site': 'same-site', 40 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', 41 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"', 42 | 'sec-ch-ua-mobile': '?0', 43 | 'sec-ch-ua-platform': '"Windows"', 44 | } 45 | data = { 46 | 'i': w, 47 | 'from': f, 48 | 'to': t, 49 | 'dictResult': 'true', 50 | 'keyid': 'webfanyi', 51 | 'sign': sign(), 52 | 'client': 'fanyideskweb', 53 | 'product': 'webfanyi', 54 | 'appVersion': '1.0.0', 55 | 'vendor': 'web', 56 | 'pointParam': 'client,mysticTime,product', 57 | 'mysticTime': str(int(time.time() * 1000)), 58 | 'keyfrom': 'fanyi.web', 59 | } 60 | response = requests.post('https://dict.youdao.com/webtranslate', headers=headers, data=data, cookies=cookies) 61 | return response.text 62 | 63 | 64 | if __name__ == '__main__': 65 | while True: 66 | try: 67 | From = input('请输入开始语言(自动auto, 中文zh-CHS, 韩文ko, 英文en)\n') 68 | To = input('请输入翻译的语言(默认, 中文zh-CHS, 韩文ko, 英文en)\n') 69 | word = input('请输入单词:') 70 | enc = post(word, From, To) 71 | ctx = decrypto(enc) 72 | print(ctx) 73 | except: 74 | print('出现异常,请重新输入!') 75 | continue 76 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/烯牛数据/xiniu.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | 4 | 5 | cookies = { 6 | 'btoken': 
'89091VUM5EXO41RJFVJ7G478EIJV2990', 7 | 'hy_data_2020_id': '185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb', 8 | 'hy_data_2020_js_sdk': '%7B%22distinct_id%22%3A%22185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb%22%2C%22site_id%22%3A211%2C%22user_company%22%3A105%2C%22props%22%3A%7B%7D%2C%22device_id%22%3A%22185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb%22%7D', 9 | 'sajssdk_2020_cross_new_user': '1', 10 | 'Hm_lvt_42317524c1662a500d12d3784dbea0f8': '1674013672', 11 | 'Hm_lpvt_42317524c1662a500d12d3784dbea0f8': '1674021425', 12 | } 13 | 14 | headers = { 15 | 'authority': 'www.xiniudata.com', 16 | 'accept': 'application/json', 17 | 'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 18 | 'content-type': 'application/json', 19 | # 'cookie': 'btoken=89091VUM5EXO41RJFVJ7G478EIJV2990; hy_data_2020_id=185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb; hy_data_2020_js_sdk=%7B%22distinct_id%22%3A%22185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb%22%2C%22site_id%22%3A211%2C%22user_company%22%3A105%2C%22props%22%3A%7B%7D%2C%22device_id%22%3A%22185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb%22%7D; sajssdk_2020_cross_new_user=1; Hm_lvt_42317524c1662a500d12d3784dbea0f8=1674013672; Hm_lpvt_42317524c1662a500d12d3784dbea0f8=1674021425', 20 | 'origin': 'https://www.xiniudata.com', 21 | 'referer': 'https://www.xiniudata.com/industry/newest?from=data', 22 | 'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"', 23 | 'sec-ch-ua-mobile': '?0', 24 | 'sec-ch-ua-platform': '"Windows"', 25 | 'sec-fetch-dest': 'empty', 26 | 'sec-fetch-mode': 'cors', 27 | 'sec-fetch-site': 'same-origin', 28 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', 29 | } 30 | 31 | payload = { 32 | "sort": 1, 33 | "start": 0, 34 | "limit": 20 35 | } 36 | 37 | pl = str(execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('hhy', payload)) 38 | sig = str(execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('hy', payload)) 39 | 40 | json_data = { 41 | 'payload': pl, 42 | 'sig': sig, 43 | 'v': 1, 44 | } 45 | 46 | response = requests.post( 47 | 'https://www.xiniudata.com/api2/service/x_service/person_industry_list/list_industries_by_sort', 48 | # https://www.xiniudata.com/api2/service/x_service/person_industry_list/list_industries_by_sort 49 | cookies=cookies, 50 | headers=headers, 51 | json=json_data, 52 | ) 53 | 54 | res = response.json()['d'] 55 | 56 | ctx = str(execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('dy', res)) 57 | 58 | print(ctx) 59 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/网易云音乐/decrpyo.py: -------------------------------------------------------------------------------- 1 | import random 2 | from binascii import hexlify 3 | import base64 4 | from Crypto.Cipher import AES 5 | 6 | e = "010001" 7 | g = "0CoJUm6Qyw8W8jud" 8 | f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7" 9 | i3x = '{"csrf_token":"","cursor":"1672939386847","offset":"0","orderType":"1","pageNo":"3","pageSize":"20","rid":"R_SO_4_1835283134","threadId":"R_SO_4_1835283134"}' 10 | 11 | 12 | # 生成随机的16位字符传 13 | def RandomString(a): 14 | string = 
'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' 15 | randomStr = random.sample(string, a) 16 | return ''.join(randomStr) 17 | 18 | 19 | # AES加密算法 20 | def AESEncrypto(text, key): 21 | BS = 16 22 | pad = lambda s: s + (BS - len(s) % BS) * bytes([BS - len(s) % BS]) 23 | c = key.encode("utf-8") 24 | d = "0102030405060708".encode("utf-8") 25 | e = text.encode("utf-8") 26 | aes = AES.new(c, AES.MODE_CBC, d) 27 | enc = base64.b64encode(aes.encrypt(pad(e))).decode("utf-8") 28 | return enc 29 | 30 | 31 | # RSA加密 32 | def RSAEncrypto(text): 33 | text = text[::-1] # 表示文本倒序 34 | result = pow(int(hexlify(text.encode('utf-8')), 16), int(e, 16), int(f, 16)) 35 | return format(result, 'x').zfill(131) 36 | 37 | 38 | def d(text): 39 | i = RandomString(16) 40 | encText = AESEncrypto(text, g) 41 | encText = AESEncrypto(encText, i) 42 | encSecKey = RSAEncrypto(i) 43 | h = { 44 | "encText": encText, 45 | "encSecKey": encSecKey 46 | } 47 | return h 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/艺恩数据/yien.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | 4 | headers = { 5 | 'Accept': 'text/plain, */*; q=0.01', 6 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 7 | 'Connection': 'keep-alive', 8 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 9 | 'Origin': 'https://www.endata.com.cn', 10 | 'Sec-Fetch-Dest': 'empty', 11 | 'Sec-Fetch-Mode': 'cors', 12 | 'Sec-Fetch-Site': 'same-origin', 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36', 14 | 'X-Requested-With': 'XMLHttpRequest', 15 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"', 16 | 'sec-ch-ua-mobile': '?0', 17 | 'sec-ch-ua-platform': '"Windows"', 18 | } 19 | 20 | data = { 21 | 'year': '2023', 22 | 'MethodName': 'BoxOffice_GetYearInfoData', 23 | } 24 | 25 | response = requests.post('https://www.endata.com.cn/API/GetData.ashx', headers=headers, data=data) 26 | 27 | ctx = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('decrypt', response.text) 28 | print(ctx) 29 | -------------------------------------------------------------------------------- /进阶篇/js逆向/请求头请求体加密/行行查/hanghangcha.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import execjs 3 | 4 | cookies = { 5 | 'UM_distinctid': '185c4195bd7e6f-038d88d1a6e504-26021151-144000-185c4195bd8968', 6 | 'Hm_lvt_1521e0fb49013136e79181f2888214a7': '1674032275', 7 | 'Hm_lpvt_1521e0fb49013136e79181f2888214a7': '1674032275', 8 | 'JSESSIONID': 'F83DF5ABA6CAAEE674C850D3483CB550', 9 | '_ACCOUNT_': 'OTM0NmEzMDU1YmEzNGY4MDk3NjliZDI4NjUyNzhmNDElNDAlNDBtb2JpbGU6MTY3NTI0MzYxNzI2NjowYjBlNmMwYzJhZTFhYjFjNzFjZjIyYTQ5MDM1ZDA4Yg', 10 | } 11 | 12 | headers = { 13 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 14 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 15 | 'Auth-Plus': '', 16 | 'Connection': 'keep-alive', 17 | # 'Cookie': 'UM_distinctid=185c4195bd7e6f-038d88d1a6e504-26021151-144000-185c4195bd8968; Hm_lvt_1521e0fb49013136e79181f2888214a7=1674032275; Hm_lpvt_1521e0fb49013136e79181f2888214a7=1674032275; JSESSIONID=F83DF5ABA6CAAEE674C850D3483CB550; _ACCOUNT_=OTM0NmEzMDU1YmEzNGY4MDk3NjliZDI4NjUyNzhmNDElNDAlNDBtb2JpbGU6MTY3NTI0MzYxNzI2NjowYjBlNmMwYzJhZTFhYjFjNzFjZjIyYTQ5MDM1ZDA4Yg', 18 | 'Origin': 
'https://www.hanghangcha.com', 19 | 'Sec-Fetch-Dest': 'empty', 20 | 'Sec-Fetch-Mode': 'cors', 21 | 'Sec-Fetch-Site': 'same-site', 22 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36', 23 | 'X-Requested-With': 'XMLHttpRequest', 24 | 'clientInfo': 'web', 25 | 'clientVersion': '1.0.0', 26 | 'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"', 27 | 'sec-ch-ua-mobile': '?0', 28 | 'sec-ch-ua-platform': '"Windows"', 29 | } 30 | 31 | params = { 32 | 'filter': '{"title":null,"sortType":null,"limit":9,"skip":0,"userId":2636778}', 33 | } 34 | 35 | response = requests.get( 36 | 'https://api.hanghangcha.com/hhc/industry/articleWithTags', 37 | params=params, 38 | cookies=cookies, 39 | headers=headers, 40 | ) 41 | 42 | data = response.json()['data'] 43 | 44 | ctx = str(execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('decrypt', data)) 45 | 46 | print(ctx) 47 | -------------------------------------------------------------------------------- /进阶篇/基础综合/dandanzan/M3U8.py: -------------------------------------------------------------------------------- 1 | import time 2 | import m3u8, requests, os 3 | from urllib.parse import urljoin 4 | from Crypto.Cipher import AES 5 | from requests.exceptions import RequestException 6 | 7 | 8 | class M3U8: 9 | def __init__(self, url): 10 | self.decryptor = None 11 | self.url = url 12 | self.count = 0 13 | self.ts_urls = [] 14 | 15 | def get_ts_url(self): 16 | son_url = self.url 17 | key_url = None 18 | r = m3u8.load(son_url).data 19 | self.ts_urls = [urljoin(son_url, ts['uri']) for ts in r['segments']] 20 | try: 21 | if r['segments'][0]['key']['uri'].startswith('h' or 'H'): 22 | key_url = r['segments'][0]['key']['uri'] 23 | else: 24 | key_url = urljoin(son_url, r['segments'][0]['key']['uri']) 25 | except: 26 | pass 27 | if key_url: 28 | key = requests.get(key_url).content 29 | self.decryptor = AES.new(key, AES.MODE_CBC, b'\x00' * 16) 30 | else: 31 | self.decryptor = None 32 | 33 | def download(self, url, path, index, retry_times=5): 34 | for i in range(retry_times): 35 | try: 36 | resp = requests.get(url, timeout=60) 37 | if resp.status_code == 200: 38 | with open(path + f'\\{index}.ts', 'wb') as f: 39 | if self.decryptor: 40 | f.write(self.decryptor.decrypt(resp.content)) 41 | else: 42 | f.write(resp.content) 43 | self.count += 1 44 | if self.count % 100 == 0: 45 | print(f'已经下载{self.count}个分片了!') 46 | return True 47 | except RequestException as e: 48 | print(f"Download failed: {url}\n{e}\nretrying ({i + 1}/{retry_times})...") 49 | time.sleep(5) 50 | raise RequestException(f"Failed to download {url} after {retry_times} retries.") 51 | 52 | @staticmethod 53 | def merge(total, path): 54 | with open(path + '\\video.mp4', 'ab') as fp: 55 | for index in range(total): 56 | try: 57 | f = path + f'\\{index}.ts' 58 | content = open(f, 'rb').read() 59 | fp.write(content) 60 | os.remove(path + f'\\{index}.ts') 61 | except Exception as e: 62 | print(e) 63 | -------------------------------------------------------------------------------- /进阶篇/基础综合/dandanzan/main.py: -------------------------------------------------------------------------------- 1 | from drama import drama 2 | from movie import movie 3 | from variety import variety 4 | import os, subprocess 5 | 6 | 7 | def clear_screen(): 8 | subprocess.call('cls', shell=True) 9 | 10 | 11 | def drama_fun(): 12 | key = input('请输入电视剧名称: ') 13 | d = drama(pipe) 14 | d.search(key) 15 | if d.item: 16 | if 
input('请输入1进行下载, 其他任意键回到主页面: ') == '1': 17 | flag = input('输入选择下载的剧集序号: ') 18 | ji = int(input('输入开始集数: ')) 19 | jj = int(input('输入结束集数: ')) 20 | clear_screen() 21 | d.get_m3u8_url(flag, ji, jj) 22 | d.download_all(ji, jj) 23 | clear_screen() 24 | else: 25 | clear_screen() 26 | 27 | 28 | def movie_fun(): 29 | key = input('请输入电影名称: ') 30 | m = movie(pipe) 31 | m.search(key) 32 | if m.item: 33 | if input('请输入1进行下载, 其他任意键回到主页面: ') == '1': 34 | f = input('输入选择下载的电影序号: ') 35 | clear_screen() 36 | m.get_m3u8(f) 37 | m.download_movie() 38 | clear_screen() 39 | else: 40 | clear_screen() 41 | 42 | 43 | def variety_fun(): 44 | key = input('请输入综艺的名称: ') 45 | v = variety(pipe) 46 | v.search(key) 47 | if v.item: 48 | flag = input('输入选择查看的综艺序号: ') 49 | clear_screen() 50 | v.print_num(flag) 51 | if input('请输入1进行下载, 其他任意键回到主页面: ') == '1': 52 | num = input('输入选择下载的期数: ') 53 | v.get_m3u8_urls(flag, num) 54 | v.download(num) 55 | clear_screen() 56 | else: 57 | clear_screen() 58 | 59 | 60 | if __name__ == '__main__': 61 | if not os.path.exists('D:/m3u8视频'): 62 | os.mkdir('D:/m3u8视频') 63 | pipe = input('清输入选择的下载通道(0-5)\n建议通道0, 如果出现程序闪退可考虑更换通道,或者打开VPN\n') 64 | while True: 65 | choice = input('请输入想要搜索的类型(1 表示电视剧, 2 表示电影, 3 表示综艺, 其他任意键退出): ') 66 | clear_screen() 67 | if choice == '1': 68 | drama_fun() 69 | elif choice == '2': 70 | movie_fun() 71 | elif choice == '3': 72 | variety_fun() 73 | else: 74 | break 75 | -------------------------------------------------------------------------------- /进阶篇/基础综合/dandanzan/movie.py: -------------------------------------------------------------------------------- 1 | import m3u8, re 2 | from lxml import etree 3 | from M3U8 import M3U8 4 | from drama import drama 5 | import prettytable as pt 6 | from urllib.parse import urljoin 7 | from concurrent.futures import ThreadPoolExecutor, wait 8 | 9 | 10 | class movie(drama): 11 | def search(self, key): 12 | params = { 13 | 'q': key, 14 | } 15 | response = self.session.get('https://dandanzan.net/so', params=params, headers=self.headers) 16 | tree = etree.HTML(response.text) 17 | li_list = tree.xpath('//div[@class="lists-content"]/ul/li') 18 | for li in li_list: 19 | try: 20 | a = li.xpath('./a/@href')[0] 21 | id = re.findall(self.r, a)[0] 22 | length = li.xpath('./a/div[1]/span/text()')[0].strip() 23 | name = li.xpath('./h2/a//text()')[0].strip() 24 | if not length.startswith('第'): 25 | s = { 26 | 'id': id, 27 | 'length': length, 28 | 'name': name 29 | } 30 | self.item.append(s) 31 | except: 32 | pass 33 | tb = pt.PrettyTable() 34 | tb.field_names = ['序号', '电影名称', '清晰度'] 35 | tb.align = 'c' 36 | # 填充宽度 37 | tb.padding_width = 5 38 | count = 0 39 | for item in self.item: 40 | tb.add_row([count, item['name'], item['length']]) 41 | count += 1 42 | print(tb) 43 | 44 | def get_m3u8(self, flag): 45 | id = self.item[int(flag)]['id'] 46 | length = 'hd' 47 | self.dir_name = self.item[int(flag)]['name'] 48 | url = f'https://dandanzan.net/fetch_plays/{id}/{length}' 49 | response = self.session.get(url) 50 | try: 51 | father_url = response.json()['video_plays'][self.pipe]['play_data'] 52 | f_fata = m3u8.load(father_url).data 53 | son_url = urljoin(father_url, f_fata['playlists'][0]['uri']) 54 | self.m3u8_obj.append(M3U8(son_url)) 55 | print('下载链接已获取') 56 | except Exception: 57 | raise '出现错误,电影资源不存在' 58 | 59 | def download_movie(self): 60 | self.m3u8_obj[0].get_ts_url() 61 | total = len(self.m3u8_obj[0].ts_urls) 62 | self.create_dir(1) 63 | print(f'视频一共的分片是{total}个...') 64 | with ThreadPoolExecutor(max_workers=16) as executor: 65 | futures = [] 
66 | for j, data in enumerate(self.m3u8_obj[0].ts_urls): 67 | future = executor.submit(self.m3u8_obj[0].download, data, self.path, j) 68 | futures.append(future) 69 | wait(futures) 70 | executor.shutdown() 71 | self.m3u8_obj[0].merge(total, self.path) 72 | -------------------------------------------------------------------------------- /进阶篇/基础综合/dandanzan/variety.py: -------------------------------------------------------------------------------- 1 | import m3u8, re 2 | from movie import movie 3 | from lxml import etree 4 | import prettytable as pt 5 | from urllib.parse import urljoin 6 | from concurrent.futures import ThreadPoolExecutor, wait 7 | from M3U8 import M3U8 8 | 9 | 10 | class variety(movie): 11 | def search(self, key): 12 | params = { 13 | 'q': key, 14 | } 15 | response = self.session.get('https://dandanzan.net/so', params=params, headers=self.headers) 16 | tree = etree.HTML(response.text) 17 | li_list = tree.xpath('//div[@class="lists-content"]/ul/li') 18 | for li in li_list: 19 | try: 20 | a = li.xpath('./a/@href')[0] 21 | id = re.findall(self.r, a)[0] 22 | length = li.xpath('./a/div[1]/span/text()')[0].strip() 23 | name = li.xpath('./h2/a//text()')[0].strip() 24 | s = { 25 | 'url': 'https://dandanzan.net' + a, 26 | 'id': id, 27 | 'length': length, 28 | 'name': name 29 | } 30 | self.item.append(s) 31 | except: 32 | pass 33 | tb = pt.PrettyTable() 34 | tb.field_names = ['序号', '综艺名称', '最新一期'] 35 | tb.align = 'c' 36 | # 填充宽度 37 | tb.padding_width = 5 38 | count = 0 39 | for item in self.item: 40 | tb.add_row([count, item['name'], item['length']]) 41 | count += 1 42 | print(tb) 43 | 44 | def get_m3u8_urls(self, flag, num): 45 | id = self.item[int(flag)]['id'] 46 | length = num 47 | self.dir_name = self.item[int(flag)]['name'] 48 | url = f'https://dandanzan.net/fetch_plays/{id}/{length}' 49 | response = self.session.get(url) 50 | try: 51 | father_url = response.json()['video_plays'][self.pipe]['play_data'] 52 | f_fata = m3u8.load(father_url).data 53 | son_url = urljoin(father_url, f_fata['playlists'][0]['uri']) 54 | self.m3u8_obj.append(M3U8(son_url)) 55 | print('下载链接已获取') 56 | except Exception: 57 | raise '出现错误,电影资源不存在' 58 | 59 | def print_num(self, flag): 60 | url = self.item[int(flag)]['url'] 61 | resp = self.session.get(url, headers=self.headers) 62 | tree = etree.HTML(resp.text) 63 | li_list = tree.xpath('//ul[@id="eps-ul"]/li') 64 | num = [] 65 | for li in li_list: 66 | number = li.xpath('./@ep_slug')[0] 67 | num.append(number) 68 | table = pt.PrettyTable() 69 | table.field_names = ['期数1', '期数2', '期数3', '期数4', '期数5'] 70 | # 计算需要填充的空值数量 71 | num_padding = 5 - len(num) % 5 72 | # 填充空值 73 | num += [None] * num_padding 74 | for i in range(0, len(num), 5): 75 | table.add_row([*num[i:i + 5]]) 76 | print(table) 77 | 78 | def download(self, num): 79 | self.m3u8_obj[0].get_ts_url() 80 | total = len(self.m3u8_obj[0].ts_urls) 81 | self.create_dir(num) 82 | print(f'视频一共的分片是{total}个...') 83 | with ThreadPoolExecutor(max_workers=16) as executor: 84 | futures = [] 85 | for j, data in enumerate(self.m3u8_obj[0].ts_urls): 86 | future = executor.submit(self.m3u8_obj[0].download, data, self.path, j) 87 | futures.append(future) 88 | wait(futures) 89 | executor.shutdown() 90 | self.m3u8_obj[0].merge(total, self.path) 91 | -------------------------------------------------------------------------------- /进阶篇/基础综合/weibo全站爬取/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | from random import uniform 3 | from time import sleep 4 | from 
typing import Union, Literal 5 | 6 | from curl_cffi import requests 7 | 8 | # 类型控制 9 | Accept = Literal['json', 'text', 'contents'] 10 | Method = Literal['get', 'post', 'POST', 'GET'] 11 | 12 | 13 | class Base: 14 | # 设置请求session 15 | session = requests.Session() 16 | # 请求头 17 | headers: dict = {} 18 | # 用户cookie 19 | cookies: dict = {} 20 | # 返回指定数据类型 21 | dataProcessors = { 22 | 'json': lambda resp: resp.json(), 23 | 'text': lambda resp: resp.text, 24 | 'contents': lambda resp: resp.content 25 | } 26 | # 请求方式 27 | methodProcessors = { 28 | 'get': requests.get, 29 | 'post': requests.post 30 | } 31 | 32 | def ajax_requests( 33 | self, url: str, 34 | method: Method, 35 | dataType: Accept, 36 | params: Union[dict, str, None], 37 | jsonData: Union[dict, None], 38 | retryTimes: int = 5, 39 | timeOut: int = 20 40 | ) -> Union[dict, str, bytes]: 41 | # 初始化请求发送器以及数据获取器 42 | dataProcessor = self.dataProcessors[dataType] 43 | methodProcessor = self.methodProcessors[method] 44 | for _ in range(retryTimes): 45 | try: 46 | response = methodProcessor( 47 | url=url, 48 | headers=self.headers, 49 | cookies=self.cookies, 50 | params=params, 51 | data=json.dumps(jsonData, ensure_ascii=False, separators=(',', ':')), 52 | timeout=timeOut 53 | ) 54 | return dataProcessor(response) 55 | except json.decoder.JSONDecodeError: 56 | raise ValueError(f'无法被解析为json格式,错误链接为: {url}') 57 | except Exception as e: 58 | sleep(uniform(1, 5)) 59 | print( 60 | f"错误链接: {url}", 61 | f"请求出现错误, 正在重试: {_}/{retryTimes}", 62 | f"错误信息为: {e}", 63 | sep='\n' 64 | ) 65 | else: 66 | raise '重试5次后仍然无法获取数据,可能是加密参数错误或者ip风控' 67 | -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/baidu.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | from mix_media import get 4 | 5 | 6 | def func(text, lan): 7 | params = { 8 | 'lan': lan, 9 | 'text': text, 10 | 'spd': '3', 11 | 'source': 'web', 12 | } 13 | 14 | response = requests.get('https://fanyi.baidu.com/gettts', params=params) 15 | return response.content 16 | 17 | 18 | def baidu(filepath, lan): 19 | with open(filepath, 'r', encoding='utf-8') as file: 20 | list = file.readlines() 21 | name = os.path.basename(filepath) 22 | if not os.path.exists(f'./media/baidu/{name}'): 23 | os.mkdir(f'./media/baidu/{name}') 24 | flag = 1 25 | while flag <= len(list): 26 | text = list[flag - 1].replace('\n', '') 27 | if text is not None: 28 | print(text) 29 | resp = func(text, lan) 30 | with open(f'./media/baidu/{name}/{flag}.mp3', 'wb') as file: 31 | file.write(resp) 32 | flag += 1 33 | get(f'media/baidu/{name}') 34 | -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/main.py: -------------------------------------------------------------------------------- 1 | from baidu import baidu 2 | from youdao import youdao 3 | 4 | 5 | if __name__ == '__main__': 6 | while True: 7 | flag = input('请选择来源:(1.百度 2.有道 3.退出)\n') 8 | if flag == '3': 9 | break 10 | path = input('请输入文件路径:\n') 11 | lan = input('请输入文件语言:(zh, en, kr/kor[有道, 百度])\n') 12 | if flag == '1': 13 | baidu(path, lan) 14 | elif flag == '2': 15 | youdao(path, lan) 16 | -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/media/baidu/kr/all.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/baidu/kr/all.mp3 
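
The `Base.ajax_requests` wrapper defined in 进阶篇/基础综合/weibo全站爬取/base.py above centralizes retries, typed return values, and error reporting, but the repository only shows the class itself. A minimal usage sketch follows, assuming it is saved next to base.py so the import resolves; the `DemoSpider` subclass, the httpbin URL, and the query parameter are illustrative assumptions rather than code from the project.

```python
# Minimal usage sketch (assumption: placed alongside base.py); DemoSpider and the
# httpbin endpoint are illustrative and do not exist in the repository.
from base import Base


class DemoSpider(Base):
    # Class-level headers/cookies are picked up by ajax_requests automatically.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
    }

    def fetch(self) -> dict:
        # method must be lowercase: methodProcessors only defines 'get' and 'post' keys,
        # so the uppercase variants allowed by the Method literal would raise a KeyError.
        # dataType selects the return type: 'json' -> dict, 'text' -> str, 'contents' -> bytes.
        return self.ajax_requests(
            url='https://httpbin.org/get',  # stand-in endpoint for the sketch
            method='get',
            dataType='json',
            params={'page': 1},
            jsonData=None,
        )


if __name__ == '__main__':
    print(DemoSpider().fetch())
```
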
-------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/media/baidu/wenku.txt/all.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/baidu/wenku.txt/all.mp3 -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/media/youdao/kr/all.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/youdao/kr/all.mp3 -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/media/youdao/kr_exam/all.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/youdao/kr_exam/all.mp3 -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/media/youdao/wenku.txt/all.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/youdao/wenku.txt/all.mp3 -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/mix_media.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | 5 | def get(dir_path): 6 | file_list = os.listdir(dir_path) 7 | # file_list.sort(key=lambda i: int(re.match(r'(\d+)', i).group())) 8 | contents = [] 9 | for cur_file in file_list: 10 | path = os.path.join(os.path.abspath(dir_path), cur_file) 11 | with open(path, 'rb') as fp: 12 | content = fp.read() 13 | contents.append(content) 14 | # os.remove(path) 15 | print(path) 16 | with open(f'{dir_path}/all.mp4', 'wb') as f: 17 | for c in contents: 18 | f.write(c) 19 | 20 | 21 | if __name__ == '__main__': 22 | path = input('请输入目录:\n') 23 | get(path) 24 | -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/text/en: -------------------------------------------------------------------------------- 1 | SHA-256 uses a series of mathematical operations, 2 | including bitwise operations (such as XOR, AND, and OR), rotations, and additions, to transform the input message and the internal state of the algorithm through 64 rounds. 3 | Each round has its own specific set of constants and functions. 4 | The message is first padded to a multiple of 512 bits (the block size) with a 1 bit, 5 | followed by as many 0 bits as necessary to reach the end of the block, and then a 64-bit integer representing the original length of the message is appended. 6 | The resulting message is then split into 512-bit blocks, which are processed in sequence. 7 | Each block is first divided into 16 32-bit words, which are then expanded into 64 32-bit words using a function that involves XOR, rotations, and additions. 8 | The expanded words are then processed through a series of 64 rounds, each of which involves several steps: 9 | The round constant is added to one of the words. 10 | Several words are passed through a set of logical functions (such as XOR, AND, and OR) and then added to another word. 11 | The words are rotated by a certain number of bits. 
12 | The words are passed through another set of logical functions and added to another word. 13 | At the end of the 64 rounds, the resulting words are added to the initial hash values, and the resulting values are the final hash of the message. 14 | The specific constants and functions used in each round are carefully chosen to provide a high level of security and resistance to various attacks. 15 | The entire process of SHA-256 is designed to be computationally expensive and difficult to reverse, making it a strong cryptographic hash function. -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/text/kr: -------------------------------------------------------------------------------- 1 | 사람들은 모두 음식을 생각하면 즐거워질 것입니다 . 2 | “무엇을 , 어떻게 먹을까?” 3 | 아주 오랜 옛날부터 사람들은 이런 생각을 했습니다 . 그리고 음식과 음식을 먹는 방법은 나라마다 아주 다양합니다 . 4 | 한국 음식은 국물이 많기 때문에 숟가락과 젓가락을 사용하고 밥을 먹을 때도 그릇을 상 위에 놓고 숟가락으로 먹습니다 . 일본 사람들은 밥을 젓가락으로 먹기 때문에 밥그릇을 들고 먹어야 합니다 . 중국에는 튀긴 음식과 볶은 음식이 많아서 기름이 많고 뜨겁기 때문에 중국 젓가락은 한국 젓가락보다 더 깁니다 . 5 | 노르웨이는 겨울이 긴 나라입니다 . 그래서 사람들은 고기나 생선에 소금을 뿌려서 말린 후에 추운 경울에도 오랫동안 먹을 수 있는 음식을 만들었습니다 . 인도네시아는 날씨가 더워서 음식이 쉽게 상할 수 있기 때문에 볶은 음식이 많습니다 . 6 | 이렇게 나라마다 다른 식사 방법과 다양한 7 | 음식은 그 나라의 문화를 잘 보여 줍니다 . 다른 8 | 나라의 문화를 잘 알고 싶으면 그 나라의 9 | 음식을 드셔 보세요 . -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/text/kr_exam.txt: -------------------------------------------------------------------------------- 1 | 당신은 무슨 요리를 좋아합니까 2 | 나는 불고기를 좋아합니다 3 | 불고기 맛이 어때요 4 | 이 식당에서 무슨 음식이 제일 맛있어요 5 | 불고기 하나랑 비빔밥 하나 주세요 6 | 백화점과 지하철이 있습니다 7 | 우리 집은 아버지 어머니 누나 그리고 저, 모두 네 식구입니다 8 | 저는 한국어를 좋아합니다 9 | 고향에는 한국 친구가 있습니다 10 | 깨끗하다 11 | 더러 워 12 | 추 워 13 | 뜨 거 운 14 | 조용하다 15 | 시끄럽다 16 | 널찍하다 17 | 좁은 18 | 비 싼 19 | 값싼 20 | 우리가족은 다섯명입니다. 21 | 아버지, 어머니, 외할머니, 형님과 저입니다. 22 | 어머니와 아버지는 선생님이고 형은 의사입니다. 23 | 그리고 저는 대학생입니다.저의 고향은 중국 사천성입니다 24 | 우리 가족은 모두 불고기를 좋아한다. 25 | 그렇지만 형는 회를 더 좋아해요. 
26 | 내가 가장 좋아하는 한국음식은 비빔밥이다 27 | 내가 나중에 한국을 여행할 수 있기를 바랍니다 -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/text/wenku.txt: -------------------------------------------------------------------------------- 1 | 今天教给大家一种免费白嫖百度文库付费文档的方法 2 | 首先我们选择自己想要下载的文档 3 | 然后复制文档的链接 4 | 打开浏览器无痕模式 5 | 点击链接旁的小锁 6 | 将所有cookie禁用 7 | 右键检查 8 | 点击左上角的平板 9 | 切换模式再刷新页面 10 | 就可以看到完整的文档了 11 | 然后再点击平板 12 | 就可以复制所有文档 13 | 学会的朋友们记得一件三联哦 14 | 关注我 15 | 我是一个爱挖漏洞的up主 16 | 继续带你免费白嫖资源 -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/text/zh.txt: -------------------------------------------------------------------------------- 1 | 实现hash256算法的过程: 2 | 第一、初始化 3 | 将32位整数赋值给8个变量,这8个变量是工作变量,以及定义32位小常数 4 | 第二、填充 5 | 将需要加密的文本内容填充到能被512整除余448,填充方式是在文本末尾添加1和若干个0 6 | 如果数据模512大于448,则需要再添加一组512位填充 7 | 第三、处理消息分组 8 | 将填充后的数据分为若干个512位的数据块,每个数据块称为一个消息分组。 9 | 对于每个消息分组,需要进行一次消息扩展操作和64次压缩函数操作 10 | a.消息扩展 11 | 将一个消息分组扩展为64个32位整数 12 | 对于每一块,将块分解为16个32-bit的big-endian的字,记为w[0], …, w[15] 13 | 也就是说,前16个字直接由消息的第i个块分解得到 14 | b.压缩函数 15 | (1) 将8个工作变量的值赋给8个临时变量 16 | (2) 对扩展消息进行64轮的处理 17 | (3) 在完成64轮的压缩函数操作之后,将新的哈希值与当前消息分组之前的哈希值相加,生成最终的哈希值。 18 | 第四、输出 19 | 20 | -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/text/zh_kr_exam: -------------------------------------------------------------------------------- 1 | 姓名 老师 姓 印度人 人 朋友 国家 公司职员 学生 大学生 警察 歌手 医生 护士 秘书 律师 2 | 谁 学校 课本 字典 杂志 钱包 橡皮 护照 钥匙 饼干 教室 银行 学生会馆 图书馆 洗手间 3 | 食堂 书店 邮政局 办公室 电脑 牛奶 啤酒 冰箱 百货店 地下铁 剧场 医院 公司 4 | 你喜欢吃什么菜 5 | 我喜欢烤肉 6 | 烤肉味道怎么样 7 | 这家店什么菜最好吃 8 | 请给我一份烤肉和一碗拌饭 9 | 有百货店和地下铁 10 | 我们家有爸爸妈妈姐姐还有我一共四口人 11 | 我喜欢韩国语 12 | 故乡有韩国朋友 -------------------------------------------------------------------------------- /进阶篇/基础综合/语音爬虫/youdao.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | from mix_media import get 4 | 5 | 6 | def func(text, lan): 7 | cookies = { 8 | 'OUTFOX_SEARCH_USER_ID_NCOO': '1065325158.1443799', 9 | 'OUTFOX_SEARCH_USER_ID': '-527773617@180.168.188.248', 10 | } 11 | 12 | headers = { 13 | 'Accept': '*/*', 14 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7', 15 | 'Connection': 'keep-alive', 16 | # 'Cookie': 'OUTFOX_SEARCH_USER_ID_NCOO=1065325158.1443799; OUTFOX_SEARCH_USER_ID=-527773617@180.168.188.248', 17 | 'Range': 'bytes=0-', 18 | 'Referer': 'https://fanyi.youdao.com/', 19 | 'Sec-Fetch-Dest': 'audio', 20 | 'Sec-Fetch-Mode': 'no-cors', 21 | 'Sec-Fetch-Site': 'same-site', 22 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36', 23 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"', 24 | 'sec-ch-ua-mobile': '?0', 25 | 'sec-ch-ua-platform': '"Windows"', 26 | } 27 | 28 | params = { 29 | 'audio': text, 30 | 'le': lan, 31 | } 32 | 33 | response = requests.get('https://dict.youdao.com/dictvoice', params=params, cookies=cookies, headers=headers) 34 | return response.content 35 | 36 | 37 | def youdao(filepath, lan): 38 | with open(filepath, 'r', encoding='utf-8') as file: 39 | list = file.readlines() 40 | name = os.path.basename(filepath) 41 | if not os.path.exists(f'./media/youdao/{name}'): 42 | os.mkdir(f'./media/youdao/{name}') 43 | flag = 1 44 | while flag <= len(list): 45 | text = list[flag - 1].strip() 46 | if text is not None: 47 | print(text) 48 | resp = func(text, lan) 49 | with open(f'./media/youdao/{name}/{flag}.mp3', 'wb') as file: 50 | file.write(resp) 51 | flag += 
1 52 | get(f'media/youdao/{name}') 53 | -------------------------------------------------------------------------------- /进阶篇/基础综合/验证码相关/古诗文网验证码识别.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import ddddocr 3 | from lxml import etree 4 | 5 | if __name__ == '__main__': 6 | headers = { 7 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36' 8 | } 9 | url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx' 10 | page_text = requests.get(url=url, headers=headers).text 11 | tree = etree.HTML(page_text) 12 | # 将验证码图片保存到了本地 13 | code_img_src = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0] 14 | code_data = requests.get(url=code_img_src, headers=headers).content 15 | with open('./code.jpg', 'wb') as fp: 16 | fp.write(code_data) 17 | # 解析验证码 18 | ocr = ddddocr.DdddOcr() 19 | with open('code.jpg', 'rb') as f: 20 | img_bytes = f.read() 21 | res = ocr.classification(img_bytes) # 解析到的验证码数据 22 | -------------------------------------------------------------------------------- /进阶篇/基础综合/验证码相关/模拟登录.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import ddddocr 3 | from lxml import etree 4 | 5 | # 获取验证码信息以及页面的隐藏信息,在这里是viewstate和viewstategenerator 6 | if __name__ == "__main__": 7 | headers = { 8 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36' 9 | } 10 | url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx' 11 | page_text = requests.get(url=url, headers=headers).text 12 | tree = etree.HTML(page_text) 13 | # 获取验证码图片连接 14 | code_img_src = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0] 15 | # 获取viewstate的值 16 | viewstate = tree.xpath("//input[@id='__VIEWSTATE']/@value")[0] 17 | # 获取viewstategenerator的值 18 | viewstategenerator = tree.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0] 19 | 20 | # 将验证码图片保存到本地 21 | # code_data = requests.get(url=code_img_src, headers=headers).content 不可以这样使用,因为一旦请求,原本网页的验证码就会发生改变了 22 | # 这里我们使用requests中的Session()方法,将请求变成一个对象 23 | session = requests.Session() 24 | code_data = session.get(url=code_img_src, headers=headers).content 25 | with open('./code.jpg', 'wb') as fp: 26 | fp.write(code_data) 27 | # 解析验证码 28 | ocr = ddddocr.DdddOcr() 29 | with open('code.jpg', 'rb') as f: 30 | img_bytes = f.read() 31 | res = ocr.classification(img_bytes) # 解析到的验证码数据 32 | 33 | # 模拟登录发送post请求 34 | login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx' 35 | data = { 36 | '__VIEWSTATE': viewstate, 37 | '__VIEWSTATEGENERATOR': viewstategenerator, 38 | 'from': 'http://so.gushiwen.cn/user/collect.aspx', 39 | 'email': '@qq.com', 40 | 'pwd': '', 41 | 'code': res, 42 | 'denglu': '登录', 43 | } 44 | # 注意此处也应该用session不然验证码也会刷新 45 | login_page_text = session.post(url=login_url, data=data, headers=headers).text 46 | 47 | with open('gushiwen.html', 'w', encoding='utf-8') as fp: 48 | fp.write(login_page_text) 49 | -------------------------------------------------------------------------------- /进阶篇/基础综合/验证码相关/模拟登录之后的数据爬取.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import ddddocr 3 | from lxml import etree 4 | 5 | # 获取验证码信息以及页面的隐藏信息,在这里是viewstate和viewstategenerator 6 | if __name__ == "__main__": 7 | 
headers = { 8 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36' 9 | } 10 | url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx' 11 | page_text = requests.get(url=url, headers=headers).text 12 | tree = etree.HTML(page_text) 13 | 14 | # 获取验证码图片连接 15 | code_img_src = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0] 16 | 17 | # 获取viewstate的值 18 | viewstate = tree.xpath("//input[@id='__VIEWSTATE']/@value")[0] 19 | 20 | # 获取viewstategenerator的值 21 | viewstategenerator = tree.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0] 22 | 23 | # 将验证码图片保存到本地 24 | # code_data = requests.get(url=code_img_src, headers=headers).content 不可以这样使用,因为一旦请求,原本网页的验证码就会发生改变了 25 | # 这里我们使用requests中的session()方法,将请求变成一个对象 26 | session = requests.Session() 27 | code_data = session.get(url=code_img_src, headers=headers).content 28 | with open('./code.jpg', 'wb') as fp: 29 | fp.write(code_data) 30 | 31 | # 解析验证码 32 | ocr = ddddocr.DdddOcr() 33 | with open('code.jpg', 'rb') as f: 34 | img_bytes = f.read() 35 | res = ocr.classification(img_bytes) # 解析到的验证码数据 36 | 37 | # 模拟登录发送post请求 38 | login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx' 39 | data = { 40 | '__VIEWSTATE': viewstate, 41 | '__VIEWSTATEGENERATOR': viewstategenerator, 42 | 'from': 'http://so.gushiwen.cn/user/collect.aspx', 43 | 'email': '', 44 | 'pwd': '', 45 | 'code': res, 46 | 'denglu': '登录', 47 | } 48 | # 注意此处也应该用session不然验证码也会刷新 49 | session.post(url=login_url, data=data, headers=headers) 50 | 51 | detail_url = 'https://so.gushiwen.cn/user/collect.aspx?type=m&id=3760950&sort=t' 52 | detail_page_text = session.get(url=detail_url, headers=headers).text 53 | with open('infor.html', 'w', encoding='utf-8') as fp: 54 | fp.write(detail_page_text) 55 | -------------------------------------------------------------------------------- /进阶篇/基础综合/验证码相关/验证码测试.py: -------------------------------------------------------------------------------- 1 | import ddddocr 2 | 3 | ocr = ddddocr.DdddOcr() 4 | # 简单的图片数字英文识别 5 | with open('1.png', 'rb') as f: 6 | img_bytes = f.read() 7 | res = ocr.classification(img_bytes) 8 | 9 | print(res) -------------------------------------------------------------------------------- /进阶篇/爬虫轮子/README.md: -------------------------------------------------------------------------------- 1 | # 个人对requests库的二次封装 2 | 3 | > 对于爬虫常见的发送请求以及日志记录和响应校验进行了二次封装。 4 | > 5 | > 只需要在新的类继承CrawlBase即可,发送请求的函数使用do_request 6 | > 7 | > 可以设置中间件以及发生前校验和发送后校验 8 | > 9 | 10 | #### 未来设想 11 | 12 | 1. 增加用户池 13 | 2. 结构优化 14 | 3. 把请求响应都封装 15 | 4. 去重自动入库 16 | 5. 
等等 -------------------------------------------------------------------------------- /验证码篇/滑块篇/阿里系226/226.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import re 4 | from urllib.parse import urlparse, parse_qs 5 | 6 | import playwright.sync_api 7 | from playwright.sync_api import sync_playwright 8 | 9 | # 存放滑块的页面 10 | FILEPATH = '' 11 | 12 | # 拦截验证的路由,自己写一下url, 格式参照playwright官网 13 | INTERRUPT_ROUTE = '' 14 | 15 | # 指定谷歌浏览器路径,以debug模式打开,如果已经打开了debug,下面四行代码可以注释掉 16 | # chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"' 17 | # debugging_port = "--remote-debugging-port=9999" 18 | # 19 | # command = f"{chrome_path} {debugging_port}" 20 | # subprocess.Popen(command, shell=True) 21 | 22 | # 创建的ws链接 23 | WS_URL = 'http://localhost:your_port' 24 | 25 | 26 | pattern = re.compile(r'\((.*)\)', re.S) 27 | 28 | headers = { 29 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36', 30 | } 31 | 32 | 33 | def get_226() -> dict: 34 | result: dict = {} 35 | 36 | def intercept_xhr(route: playwright.sync_api.Route): 37 | params = parse_qs(urlparse(route.request.url).query) 38 | result['t'] = params['t'][0] 39 | # 这里不指定headers会出现意想不到的错误 40 | resp = route.fetch(headers=headers) 41 | data = json.loads(pattern.findall(resp.text())[0]) 42 | # 我们获取到了数据是不是应该返还给result 43 | print(data) 44 | route.fulfill(response=resp) 45 | 46 | with sync_playwright() as p: 47 | # 创建一个ws链接 48 | browser = p.chromium.connect_over_cdp(WS_URL) 49 | # 使用浏览器的上下文创建页面 50 | content = browser.contexts[0] 51 | page = content.new_page() 52 | # 设置拦截规则 53 | page.route(INTERRUPT_ROUTE, intercept_xhr) 54 | page.goto(FILEPATH) 55 | # 开始滑动,获取对应的东西,在滑动距离增加一些随机值 56 | btn = page.locator('#nc_1_n1z') 57 | btn_position = btn.bounding_box() 58 | new_x = btn_position['x'] + random.randint(290, 310) 59 | new_y = btn_position['y'] 60 | page.mouse.click(btn_position['x'], btn_position['y']) 61 | # 滑动了 62 | page.mouse.down() 63 | page.mouse.move(new_x, new_y) 64 | page.mouse.up() 65 | # 等待一段时间以观察拖动效果 66 | page.wait_for_timeout(1000) 67 | # 关闭所有 68 | page.close() 69 | content.close() 70 | browser.close() 71 | # 返回结果 72 | return result 73 | -------------------------------------------------------------------------------- /验证码篇/滑块篇/阿里系226/README.md: -------------------------------------------------------------------------------- 1 | ## 使用 2 | 3 | 我们需要修改自己的存放滑块的位置,创建的ws地址以及写一下拦截的url和自己把数据返还回去 4 | 5 | 如果没有以debug模式打开浏览器并指定端口需要运行注释掉的代码 6 | 7 | 仅作学习分享,严禁非法使用 8 | 9 | 注: 10 | > 我们需要把滑块的html保存到本地,并且要做一些调整,可以删除部分除开滑块的逻辑 11 | > 12 | > 比如一些无用的样式,无用的dom元素,保留需要触发的逻辑即可。 13 | > 14 | > 这样可以大幅度提高效率,如果对效率没有要求也可以全网页保存,记得把一些js文件路径和css文件 15 | > 路径补全即可 16 | > 17 | > 对于高并发的情况,这个方法有待测试,因为是和本地浏览器以websocket方式链接,所以会比使用 18 | > 有头模式的反检测浏览器效率快一些,高并发情况可以自行进行测试 19 | > 20 | > 此方法适用于绝大部分人机校验,比还原算法节省99%工作量,大家可以根据自己的需求,自行选择 21 | > 绕过方式 22 | -------------------------------------------------------------------------------- /验证码篇/滑块篇/飞瓜登录验证码定制阿里系滑块/README.md: -------------------------------------------------------------------------------- 1 | ## 使用 2 | 3 | 我们需要修改自己的存放滑块的位置,创建的ws地址以及写一下拦截的url和自己把数据返还回去 4 | 5 | 如果没有以debug模式打开浏览器并指定端口需要运行注释掉的代码 6 | 7 | 仅作学习分享,严禁非法使用 8 | 9 | 注: 10 | > 我们需要把滑块的html保存到本地,并且要做一些调整,可以删除部分除开滑块的逻辑 11 | > 12 | > 比如一些无用的样式,无用的dom元素,保留需要触发的逻辑即可。 13 | > 14 | > 这样可以大幅度提高效率,如果对效率没有要求也可以全网页保存,记得把一些js文件路径和css文件 15 | > 路径补全即可 16 | > 17 | > 对于高并发的情况,这个方法有待测试,因为是和本地浏览器以websocket方式链接,所以会比使用 18 | > 
有头模式的反检测浏览器效率快一些,高并发情况可以自行进行测试 19 | > 20 | > 此方法适用于绝大部分人机校验,比还原算法节省99%工作量,大家可以根据自己的需求,自行选择 21 | > 绕过方式 22 | -------------------------------------------------------------------------------- /验证码篇/滑块篇/飞瓜登录验证码定制阿里系滑块/send.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from test import main 3 | phone = [] 4 | for _ in phone: 5 | headers = { 6 | 'Accept': '*/*', 7 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 8 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 9 | 'Origin': 'https://dy.feigua.cn', 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36', 11 | } 12 | token = main() 13 | 14 | data = { 15 | 'tel': _, 16 | 'sessionid': token['sessionid'][0], 17 | 'sig': token['sig'][0], 18 | 'token': token['token'][0], 19 | } 20 | 21 | response = requests.post('https://dy.feigua.cn/login/SendLoginMessageCode', headers=headers, data=data) 22 | print(response.text) 23 | -------------------------------------------------------------------------------- /验证码篇/滑块篇/飞瓜登录验证码定制阿里系滑块/test.py: -------------------------------------------------------------------------------- 1 | import random 2 | from urllib.parse import parse_qs 3 | 4 | from playwright.sync_api import Playwright, sync_playwright 5 | 6 | # 存放滑块的页面 7 | FILEPATH = '' 8 | 9 | # 拦截验证的路由,自己写一下url, 格式参照playwright官网 10 | INTERRUPT_ROUTE = '' 11 | 12 | # 指定谷歌浏览器路径,以debug模式打开,如果已经打开了debug,下面四行代码可以注释掉 13 | # chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"' 14 | # debugging_port = "--remote-debugging-port=9999" 15 | # 16 | # command = f"{chrome_path} {debugging_port}" 17 | # subprocess.Popen(command, shell=True) 18 | 19 | # 创建的ws链接 20 | WS_URL = 'http://localhost:your_port' 21 | 22 | 23 | def run(playwright: Playwright) -> dict: 24 | result: dict = {} 25 | 26 | # 拦截发送验证码api,把参数截获 27 | def intercept_xhr(route): 28 | data = parse_qs(route.request.post_data) 29 | route.abort() 30 | # 自行将data传出 31 | print(data) 32 | 33 | browser = playwright.chromium.connect_over_cdp(WS_URL) 34 | content = browser.contexts[0] 35 | 36 | page = content.new_page() 37 | page.route(INTERRUPT_ROUTE, intercept_xhr) 38 | page.goto(FILEPATH) 39 | # 进行点击,进入滑块状态 40 | page.get_by_role("link", name="注册 / 登录").click() 41 | page.get_by_role("link", name="手机登录").click() 42 | page.get_by_text("验证码登录").click() 43 | page.get_by_role("textbox", name="请输入绑定手机号码").click() 44 | page.get_by_role("textbox", name="请输入绑定手机号码").fill("手机号") 45 | page.get_by_role("link", name="获取验证码").click() 46 | # 有可能出现两种id 47 | try: 48 | btn = page.locator('#nc_2_n1z') 49 | btn_position = btn.bounding_box(timeout=10000) 50 | except: 51 | btn = page.locator('#nc_1_n1z') 52 | btn_position = btn.bounding_box() 53 | # 获取滑动位置 54 | new_x = btn_position['x'] + random.randint(390, 400) 55 | new_y = btn_position['y'] 56 | page.mouse.click(btn_position['x'], btn_position['y']) 57 | # 滑动 58 | page.mouse.down() 59 | page.mouse.move(new_x, new_y) 60 | page.mouse.up() 61 | # 稍等一下 62 | page.wait_for_timeout(2000) 63 | # 关闭所有 64 | page.close() 65 | content.close() 66 | browser.close() 67 | return result 68 | 69 | 70 | def main(): 71 | # 用于导出 72 | with sync_playwright() as playwright: 73 | a = run(playwright) 74 | return a 75 | 76 | 77 | if __name__ == '__main__': 78 | for _ in range(10): 79 | print(main()) 80 | --------------------------------------------------------------------------------
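
Both slider scripts above (226.py and test.py) leave the debug-mode launch commented out and expect `WS_URL` to point at an already-open DevTools port. The sketch below shows one way to start Chrome with the same path and port as the commented snippet and wait for the CDP endpoint before calling `main()`; the scratch `--user-data-dir` and the polling helper are assumptions added for illustration, not code from the repository.

```python
# Hedged helper (not part of the repository): launch Chrome in debug mode and wait
# for the DevTools endpoint, so connect_over_cdp('http://localhost:9999') succeeds.
import subprocess
import time

import requests

CHROME_PATH = r"C:\Program Files\Google\Chrome\Application\chrome.exe"  # same path as the commented snippet
DEBUG_PORT = 9999                                                       # same port as the commented snippet


def start_debug_chrome() -> None:
    # A dedicated --user-data-dir (assumed path) keeps the debugging flag from being
    # ignored when a regular Chrome instance is already running with the default profile.
    subprocess.Popen([
        CHROME_PATH,
        f"--remote-debugging-port={DEBUG_PORT}",
        r"--user-data-dir=C:\temp\cdp-profile",
    ])
    # Poll the standard CDP HTTP endpoint until the browser is ready to accept connections.
    for _ in range(20):
        try:
            requests.get(f"http://localhost:{DEBUG_PORT}/json/version", timeout=1)
            return
        except requests.RequestException:
            time.sleep(0.5)
    raise RuntimeError("Chrome debug endpoint did not come up; check CHROME_PATH and DEBUG_PORT.")


if __name__ == "__main__":
    start_debug_chrome()
    # With the endpoint up, WS_URL in 226.py / test.py becomes 'http://localhost:9999',
    # and main() can be imported and called by send.py exactly as the repository already does.
```
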