├── .bat ├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── .gitignore ├── .python-version ├── Images ├── Cookie.png ├── Icon.png ├── SQL.png ├── qrcode.png ├── 一级.png ├── 二级.png ├── 列表.png ├── 各种ID.png ├── 处理二维码.png ├── 微博主体处理流程.png ├── 数据流向.png ├── 详细.png └── 页面展示.png ├── README.md ├── WeiBoCrawler ├── README.md ├── __init__.py ├── config.toml ├── database │ ├── __init__.py │ ├── sql.py │ └── sql_record.py ├── pack │ ├── BaseDownloader.py │ ├── __init__.py │ ├── get_body_data.py │ ├── get_comment1_data.py │ ├── get_comment2_data.py │ └── get_list_data.py ├── parse │ ├── __init__.py │ ├── parse_list_html.py │ ├── process_body.py │ ├── process_comment.py │ └── process_list.py ├── request │ ├── __init__.py │ ├── get_body_request.py │ ├── get_comment_request.py │ ├── get_cookies.py │ ├── get_list_request.py │ ├── get_rum_request(unuse).py │ ├── request.toml │ └── util.py └── util │ ├── __init__.py │ ├── cookie.py │ ├── custom.py │ ├── database.py │ ├── decorator.py │ ├── log.py │ ├── path.py │ ├── process.py │ └── show_qrcode.py ├── pyproject.toml ├── requirements.txt ├── uv.lock └── web ├── main.py ├── util └── __init__.py └── web_pages ├── Cookie └── Cookie.py ├── 搜索 ├── 一级评论搜索.py ├── 二级评论搜索.py ├── 列表搜索.py └── 详细页搜索.py └── 查询 └── 查询.py /.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | .venv\Scripts\streamlit.exe run web/main.py 3 | pause -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.json 3 | 4 | 5 | demo.* 6 | test.* 7 | test_* 8 | 9 | # uv.lock 10 | # pyproject.toml 11 | # .python-version 12 | 13 | .vscode/ 14 | app.log 15 | 16 | 数据库.db 17 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /Images/Cookie.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/Cookie.png -------------------------------------------------------------------------------- /Images/Icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/Icon.png -------------------------------------------------------------------------------- /Images/SQL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/SQL.png -------------------------------------------------------------------------------- /Images/qrcode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/qrcode.png -------------------------------------------------------------------------------- /Images/一级.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/一级.png -------------------------------------------------------------------------------- /Images/二级.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/二级.png -------------------------------------------------------------------------------- /Images/列表.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/列表.png -------------------------------------------------------------------------------- /Images/各种ID.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/各种ID.png -------------------------------------------------------------------------------- /Images/处理二维码.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/处理二维码.png -------------------------------------------------------------------------------- /Images/微博主体处理流程.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/微博主体处理流程.png -------------------------------------------------------------------------------- /Images/数据流向.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/数据流向.png -------------------------------------------------------------------------------- /Images/详细.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/详细.png -------------------------------------------------------------------------------- /Images/页面展示.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/页面展示.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | WeiBoCrawler 3 |

WeiBoCrawler

4 |
5 | 6 | # 欢迎!如果好用点个 star 🌟 呗!🤗 7 | 8 | 😉😉😉 **本项目打算长期维护,欢迎大家 Pull requests 成为 Contributor** 😉😉😉 9 | 10 | 😘😘😘 **如果发现 bug, 可以通过提 [Issues](https://github.com/zhouyi207/WeiBoCrawler/issues) 或添加微信: woyaolz 沟通 !** 😘😘😘 11 | 12 | ### 😁该项目是什么? 13 | 14 | 该项目主要用于对微博进行数据采集,包括微博详细页内容、微博评论内容、微博转发量、微博点赞量,微博评论量等信息,方便做学术研究时采集数据。 15 | 16 | ### 😋为什么使用本项目? 17 | 18 | - **简单:** 快速上手,只需几行代码即可完成数据采集。 19 | - **高效:** 采用异步请求和异步存储的方式,大大提高数据采集效率。 20 | - **可视化:** 利用 streamlit 编写了一个可视化界面,方便用户进行数据采集和数据查询。 21 | - **数据库:** 将 tinydb 改为 SQL 数据库,可以连接自定义数据库。 22 | - **Cookies:** 不需要手动输入 cookies,扫码自动获取 cookies。 23 | 24 | ### 🥂更新修复 25 | - 2025.04.11 解决高级检索选择日期只能选择10年范围之内的日期问题。 26 | - 2025.03.31 解决高级检索时间问题,同时删除了检索出现微博推荐的 “可能感兴趣” 的无关数据。 27 | - 2025.03.02 web前端获取cookie使用线程进行优化,替换掉 PIL.Image 库将二维码展示在网页中。 28 | - 2025.02.23 添加一个错误报错提示,先获取 cookie 才能生成 config.toml 文件,否则会报错。 29 | 30 | ## 🚤快速上手 31 | 32 | ### 1. 下载本项目 33 | 34 | 在指定目录下使用 **git 命令克隆本项目** 或 **下载本项目的 zip 包然后解压**。 35 | 36 | ```bash 37 | git clone https://github.com/zhouyi207/WeiBoCrawler.git 38 | ``` 39 | 40 | ### 2. 安装依赖 41 | 42 | 在项目根目录下使用 **pip 命令安装依赖**,注意这里的 Python 版本是 3.10 版本。 43 | 44 | ```bash 45 | pip install -r requirements.txt 46 | ``` 47 | 48 | ### 3. 运行程序 49 | 50 | 在项目根目录下使用 **streamlit 命令运行程序**。 51 | 52 | ```bash 53 | streamlit run web/main.py 54 | ``` 55 | 56 | 57 | 58 |
59 | 60 |

成功运行🥳🥳🥳

61 |
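Besides the Streamlit interface, the collection functions in `WeiBoCrawler.pack` can also be called directly from Python. A minimal sketch — the topic keyword and table name below are placeholders, and it assumes cookies have already been obtained (for example by scanning the QR code on the Cookie page):

```python
from WeiBoCrawler.pack import get_list_data

# Search-result pages for a keyword/topic are crawled and stored in the
# database under the given table_name; the returned list contains the
# database ids of the newly inserted records.
res_ids = get_list_data("#示例话题#", table_name="示例话题")
print(len(res_ids))
```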
62 | 63 | ## 🎨 界面展示 64 | 65 | ### 1. 列表搜索 66 | 67 |
68 | 69 |

列表搜索

70 |
71 | 72 | 73 | ### 2. 详细页搜索 74 | 75 |
76 | 77 |

详细页搜索

78 |
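Programmatically, this page corresponds to `get_body_data` from `WeiBoCrawler/pack/get_body_data.py`. A hedged sketch — the detail-page id and table name below are placeholders:

```python
from WeiBoCrawler.pack import get_body_data

# Download one or more weibo detail pages by id and store them under table_name;
# returns the database ids of the inserted BodyRecord rows.
res_ids = get_body_data(["placeholder_weibo_id"], table_name="示例详细页")
```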
79 | 80 | ### 3. 一级评论搜索 81 | 82 | 83 |
84 | 85 |

一级评论搜索

86 |
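The underlying function for this page is `get_comment1_data` from `WeiBoCrawler/pack/get_comment1_data.py`; `uid` and `mid` identify the weibo whose first-level comments are fetched. A sketch with placeholder values:

```python
from WeiBoCrawler.pack import get_comment1_data

# uid/mid may also be lists of equal length; results are saved as Comment1Record rows.
res_ids = get_comment1_data(uid="1234567890", mid="5098765432101234", table_name="示例一级评论")
```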
87 | 88 | 89 | ### 4. 二级评论搜索 90 | 91 |
92 | 93 |

二级评论搜索

94 |
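Second-level comments are fetched with `get_comment2_data` from `WeiBoCrawler/pack/get_comment2_data.py`. Judging from the `f_mid`/`f_uid` fields it stores, the `uid`/`mid` passed in refer to the parent comment thread rather than the root weibo — the values below are placeholders:

```python
from WeiBoCrawler.pack import get_comment2_data

# Results are saved as Comment2Record rows under the given table_name.
res_ids = get_comment2_data(uid="1234567890", mid="5098765432101234", table_name="示例二级评论")
```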
95 | 96 | ### 5. SQL 数据库查询 97 | 98 |
99 | 100 |

SQL 数据库查询

101 |
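The same queries can be issued from code through the `db` object exposed by `WeiBoCrawler.database`; its `sql()` helper runs a raw statement against the SQLite file configured in `config.toml` and returns a list of dicts. The table and column names below come from `sql_record.py`:

```python
from WeiBoCrawler.database import db

rows = db.sql("SELECT id, mid, uid, search_for FROM BodyRecord LIMIT 10")
for row in rows:
    print(row["mid"], row["search_for"])
```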
102 | 103 | ## 🧑‍🎓项目相关 104 | 105 | ### 1. 主体处理 106 | 107 |
108 | 109 |
110 | 111 | ### 2. UID 和 MID 112 | 113 |
114 | 115 |
116 | 117 | ### 3. 数据流向 118 | 119 |
120 | 121 |
122 | 123 | 124 | 125 | ## 📱联系 126 | 127 |
128 | 129 |
130 | 131 | 132 | 133 | ## ⚠️⚠️⚠️ 注意事项 134 | 135 | 本项目仅用于学术研究,**请勿用于商业用途**。 136 | -------------------------------------------------------------------------------- /WeiBoCrawler/README.md: -------------------------------------------------------------------------------- 1 | - [x] 已完成 2 | - [ ] 未完成 3 | 4 | 2025.01.28 5 | - [ ] params flow 这个字段表示按热度还是按时间,1. 表示热度,2. 表示时间。在这里目前只有 comment_request 使用到了变化的字段,其他并没有用到,设置的是固定的。 6 | - [x] pack.get_commment1_data.py get_commment2_data.py 这两文件中进度条有问题,进度条 desp 和 进度条 total 需要修改一下。由于没有预先设置 totol,会导致默认为 100 7 | - [ ] 进度条不够美观,特别是 comment 请求 8 | - [x] pack 可以重构解耦一下,使用抽象基类 9 | - [x] 除了 get_commment1_data.py get_commment2_data.py 这两文件,异步都没怎么用,应该先创建 task 然后使用 asyncio.gather(*tasks) 注册 task 10 | - [x] 差距为已被 修改为 差距为 一倍 11 | 12 | 2025.01.29 13 | 14 | - [x] 解析数据库. 在 parse 目录下 制作 process_xxx_json(TinyDB.table) -> pd.DataFrame 函数, 在这里实现一下数据库去重的逻辑(TinyDB)好像并没有去重的逻辑 15 | - [x] 构建请求中的 headers 可以装在 client 中.(不可以,有的请求需要处理 headers) 16 | - [x] 给 list 的 request 结果 添加 微博id 参数,与 body 保持一致. 17 | - [x] 前端初步搭建:数据展示. 18 | - [x] 模块的路径导入最好改用相对文件本身路径而不是使用项目路径 19 | - [x] drop_table_duplicates 函数 暂时使用最简单的列表去重法, 后续可以考虑使用 hash 去重等方法优化.. 20 | 21 | 2025.1.30 22 | 23 | - [x] 如果要实现更好的数据库效果,可以根据 mid 合并而不是 list body comment 分别展示,必须要实现字段统一. 24 | - [ ] 由于是将所有请求的结果都保存在数据库,而展示的结果都是经过字段处理后的结果,需要给一个功能寻找指定数据的源数据. 25 | - [ ] 给 uitl 添加 __all__ = [] 26 | - [x] 在下载前后检测数据表的状态,将变化的状态保存下来,方便知道新下载到哪里. 27 | - [ ] get_body_data 中 数据表名为 id 改为需要 给定数据表名. 28 | - [ ] 在 BaseDownloader.py 文件中添加日志功能,观察输出. 29 | - [ ] 抽象类出现带参数的装饰器报错 30 | 31 | 向下面这样是不行的... 32 | 33 | 34 | ```python 35 | def retry_timeout_decorator_asyncio(retry_times: int = 3) -> Callable: 36 | def _retry_timeout_decorator_asyncio(func: Callable) -> Callable: 37 | """超时重试装饰器(异步) 38 | 39 | Args: 40 | retry_times (int): 重试次数. Defaults to 3. 41 | 42 | Returns: 43 | Callable: 装饰后的函数 44 | """ 45 | async def wrapper(*args, **kwargs): # 将 wrapper 改为异步函数 46 | attempts = 0 47 | while attempts < retry_times: 48 | try: 49 | return await func(*args, **kwargs) # 调用异步函数并使用 await 50 | except httpx.TimeoutException as e: 51 | attempts += 1 52 | if attempts < retry_times: 53 | logging.warning(f"请求超时,正在进行第 {attempts} 次重试...") 54 | else: 55 | logging.error(f"请求超时,重试次数已达到最大值,请检查网络连接或重试次数!错误原因{e}") 56 | return wrapper 57 | return _retry_timeout_decorator_asyncio 58 | ``` 59 | 60 | 61 | 2025.01.31 62 | 63 | - [ ] tinydb 这个玩意啊,5700条数据的时候插入一下要 1s,这是什么逆天的速度,我靠了.....想办法用其他数据库把,这玩意太影响速度了..... 64 | - [ ] database 解耦,方便使用定义的数据库. 65 | - [ ] 在使用 sqlalchemy 库操作数据库的时候,sessionmaker 中设置 expire_on_commit=False 可以避免在提交事务时自动刷新对象的状态,从而提高性能,但可能会出现脏读的现象,但是就我们的操作而言,单线程异步是不会出现脏读的情况的. 66 | 67 | 68 | 在设置 sessionmaker 中 expire_on_commit=True 的时候,在提交事务时自动刷新对象的状态,以异步为例子 69 | 70 | ```python 71 | async def async_add_records(self, records: list[ListRecord | BodyRecord | Comment1Record | Comment2Record ]) -> list[int]: 72 | """异步插入记录 73 | 74 | Args: 75 | records (list[ListRecord | BodyRecord | Comment1Record | Comment2Record ]): 记录列表 76 | 77 | Returns: 78 | list[int]: id列表 79 | """ 80 | async with self.async_session() as session: 81 | try: 82 | session.add_all(records) 83 | await session.commit() 84 | return [record.id for record in records] 85 | except Exception as e: 86 | await session.rollback() 87 | logging.error(f"插入记录时出现异常: {e}", exc_info=True) 88 | return [] 89 | ``` 90 | 91 | 如果 expire_on_commit=True, 那么在提交事务时,会自动刷新对象的状态,即重新查询数据库中的数据,以确保数据的一致性。但是这里的查询是同步的,而 session 是异步会话,会出现在异步会话中调用同步功能的操作,这是一个 bug. 正确的处理方式是,使用异步去刷新 records 的状态. 
92 | 93 | 94 | ```python 95 | async def async_add_records(self, records: list[ListRecord | BodyRecord | Comment1Record | Comment2Record ]) -> list[int]: 96 | """异步插入记录 97 | 98 | Args: 99 | records (list[ListRecord | BodyRecord | Comment1Record | Comment2Record ]): 记录列表 100 | 101 | Returns: 102 | list[int]: id列表 103 | """ 104 | async with self.async_session() as session: 105 | try: 106 | session.add_all(records) 107 | await session.commit() 108 | # 修改的地方 109 | ids = [] 110 | for record in records: 111 | await session.refresh(record) 112 | ids.append(record.id) 113 | return ids 114 | except Exception as e: 115 | await session.rollback() 116 | logging.error(f"插入记录时出现异常: {e}", exc_info=True) 117 | return [] 118 | ``` 119 | 120 | 这样就可以了. 121 | 122 | 123 | - [x] tinydb 5700 条数据后要1s一条, sqlite 150000 条数据后 0.02s 一条. 我宣布我不认识 tinydb.... 124 | - [ ] tmd... sqlalchemy 在设置 relationship 的时候, 如果从表有多个外键,主表调用 relationship 函数中 foreign_keys 没用啊,老是报错, 只能使用 primaryjoin 函数来操作... 好煞笔. 125 | - [ ] 我宣布 sqlalchemy 是个很傻鸟的库,妈的,定义那么多类型完全看不过来是干鸡毛,看你开源的份上作者我就不骂你了.... **peewee** 持续关注! -------------------------------------------------------------------------------- /WeiBoCrawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/WeiBoCrawler/__init__.py -------------------------------------------------------------------------------- /WeiBoCrawler/config.toml: -------------------------------------------------------------------------------- 1 | [database] 2 | path = "../数据库.db" 3 | 4 | [cookies] 5 | SCF = "" 6 | SUB = "" 7 | SUBP = "" 8 | ALF = "" 9 | SINAGLOBAL = "" 10 | _s_tentry = "" 11 | Apache = "" 12 | ULV = "" 13 | XSRF-TOKEN = "" 14 | PC_TOKEN = "" 15 | WBPSESS = "" 16 | ALC = "" 17 | X-CSRF-TOKEN = "" 18 | 19 | [cookies_info] 20 | update_time = "2025-02-20 20:02:45" 21 | -------------------------------------------------------------------------------- /WeiBoCrawler/database/__init__.py: -------------------------------------------------------------------------------- 1 | from .sql import DatabaseManager , BodyRecord, Comment1Record, Comment2Record, RecordFrom 2 | from ..util import database_config 3 | 4 | 5 | db_path = database_config.path 6 | 7 | db = DatabaseManager( 8 | sync_db_url=f'sqlite:///{db_path}', # 同步模式 9 | async_db_url=f'sqlite+aiosqlite:///{db_path}' # 异步模式 10 | ) 11 | 12 | __all__ = ["db", "BodyRecord", "Comment1Record", "Comment2Record", "RecordFrom"] -------------------------------------------------------------------------------- /WeiBoCrawler/database/sql.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import select, inspect, create_engine, text 2 | from sqlalchemy.orm import sessionmaker 3 | from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker 4 | from .sql_record import Base, BodyRecord, Comment1Record, Comment2Record, RecordFrom 5 | from ..util import logging 6 | from typing import Any 7 | 8 | 9 | class DatabaseManager: 10 | """数据库的增删改查 11 | 12 | """ 13 | def __init__(self, sync_db_url: str, async_db_url: str): 14 | """初始化数据库 15 | 16 | Args: 17 | sync_db_url (str): 同步的数据库连接字符串 18 | async_db_url (str): 异步的数据库连接字符串 19 | """ 20 | # 引擎 21 | self.sync_engine = create_engine(sync_db_url) 22 | self.async_engine = create_async_engine(async_db_url) 23 | 24 | # 会话工厂 25 | self.sync_session = sessionmaker(self.sync_engine, expire_on_commit=False) 26 | self.async_session = 
async_sessionmaker(self.async_engine, class_=AsyncSession, expire_on_commit=False) 27 | 28 | # 创建表 29 | self.sync_create_tables() 30 | 31 | def sync_create_tables(self): 32 | """同步创建表 33 | 34 | """ 35 | Base.metadata.create_all(self.sync_engine) 36 | 37 | async def async_create_tables(self): 38 | """异步创建表 39 | 40 | """ 41 | async with self.async_engine.begin() as conn: 42 | await conn.run_sync(Base.metadata.create_all) 43 | 44 | 45 | def sync_add_records(self, records: list[ BodyRecord | Comment1Record | Comment2Record ]) -> list[int]: 46 | """同步插入记录 47 | 48 | Args: 49 | records (list[ BodyRecord | Comment1Record | Comment2Record ]): 记录列表 50 | 51 | Returns: 52 | list[int]: id列表 53 | """ 54 | with self.sync_session() as session: 55 | try: 56 | session.add_all(records) 57 | session.commit() 58 | return [record.id for record in records] 59 | except Exception as e: 60 | session.rollback() 61 | logging.error(f"插入记录时出现异常: {e}", exc_info=True) 62 | return [] 63 | 64 | async def async_add_records(self, records: list[ BodyRecord | Comment1Record | Comment2Record ]) -> list[int]: 65 | """异步插入记录 66 | 67 | Args: 68 | records (list[ BodyRecord | Comment1Record | Comment2Record ]): 记录列表 69 | 70 | Returns: 71 | list[int]: id列表 72 | """ 73 | async with self.async_session() as session: 74 | try: 75 | session.add_all(records) 76 | await session.commit() 77 | return [record.id for record in records] 78 | except Exception as e: 79 | await session.rollback() 80 | logging.error(f"插入记录时出现异常: {e}", exc_info=True) 81 | return [] 82 | 83 | def sync_get_records_by_ids(self, model: BodyRecord | Comment1Record | Comment2Record , ids: list[int]) -> list[ BodyRecord | Comment1Record | Comment2Record ]: 84 | """同步查询记录 85 | 86 | Args: 87 | model ( BodyRecord | Comment1Record | Comment2Record ): 搜索类 88 | ids (list[int]): 搜索id列表 89 | 90 | Returns: 91 | list[ BodyRecord | Comment1Record | Comment2Record ]: 搜索列表 92 | """ 93 | with self.sync_session() as session: 94 | return session.query(model).filter(model.id.in_(ids)).all() 95 | 96 | async def async_get_records_by_ids(self, model: BodyRecord | Comment1Record | Comment2Record , ids: list[int]) -> list[ BodyRecord | Comment1Record | Comment2Record ]: 97 | """异步查询记录 98 | 99 | Args: 100 | model ( BodyRecord | Comment1Record | Comment2Record ): 搜索类 101 | ids (list[int]): 搜索id列表 102 | 103 | Returns: 104 | list[ BodyRecord | Comment1Record | Comment2Record ]: 搜索列表 105 | """ 106 | async with self.async_session() as session: 107 | stmt = select(model).where(model.id.in_(ids)) 108 | result = await session.execute(stmt) 109 | return result.scalars().all() 110 | 111 | def sync_update_record(self, model: BodyRecord | Comment1Record | Comment2Record , record_id: int, **kwargs) -> BodyRecord | Comment1Record | Comment2Record : 112 | """同步更新记录 113 | 114 | Args: 115 | model ( BodyRecord | Comment1Record | Comment2Record ): 更新类 116 | record_id (int): 更新id 117 | kwargs: 更新的字段和值 118 | 119 | Returns: 120 | BodyRecord | Comment1Record | Comment2Record : 更新类 121 | """ 122 | with self.sync_session() as session: 123 | record = session.get(model, record_id) 124 | if record: 125 | for key, value in kwargs.items(): 126 | setattr(record, key, value) 127 | try: 128 | session.commit() 129 | except Exception as e: 130 | session.rollback() 131 | logging.error(f"更新记录时出现异常: {e}", exc_info=True) 132 | return record 133 | 134 | async def async_update_record(self, model: BodyRecord | Comment1Record | Comment2Record , record_id: int, **kwargs) -> BodyRecord | Comment1Record | Comment2Record : 135 | """异步更新记录 136 
| 137 | Args: 138 | model ( BodyRecord | Comment1Record | Comment2Record ): 更新类 139 | record_id (int): 更新id 140 | kwargs: 更新的字段和值 141 | 142 | Returns: 143 | BodyRecord | Comment1Record | Comment2Record : 更新记录 144 | """ 145 | async with self.async_session() as session: 146 | record = await session.get(model, record_id) 147 | if record: 148 | for key, value in kwargs.items(): 149 | setattr(record, key, value) 150 | try: 151 | await session.commit() 152 | except Exception as e: 153 | await session.rollback() 154 | logging.error(f"更新记录时出现异常: {e}", exc_info=True) 155 | return record 156 | 157 | def sync_delete_record(self, model: BodyRecord | Comment1Record | Comment2Record , record_id: int) -> BodyRecord | Comment1Record | Comment2Record : 158 | """同步删除记录 159 | 160 | Args: 161 | model ( BodyRecord | Comment1Record | Comment2Record ): 删除类 162 | record_id (int): 删除id 163 | 164 | Returns: 165 | BodyRecord | Comment1Record | Comment2Record : 删除记录 166 | """ 167 | with self.sync_session() as session: 168 | record = session.get(model, record_id) 169 | if record: 170 | try: 171 | session.delete(record) 172 | session.commit() 173 | except Exception as e: 174 | session.rollback() 175 | logging.error(f"删除记录时出现异常: {e}", exc_info=True) 176 | return record 177 | 178 | async def async_delete_record(self, model: BodyRecord | Comment1Record | Comment2Record , record_id: int) -> BodyRecord | Comment1Record | Comment2Record : 179 | """异步删除记录 180 | 181 | Args: 182 | model ( BodyRecord | Comment1Record | Comment2Record ): 删除类 183 | record_id (int): 删除id 184 | """ 185 | async with self.async_session() as session: 186 | record = await session.get(model, record_id) 187 | if record: 188 | try: 189 | await session.delete(record) 190 | await session.commit() 191 | except Exception as e: 192 | await session.rollback() 193 | logging.error(f"删除记录时出现异常: {e}", exc_info=True) 194 | return record 195 | 196 | def sync_get_table_names(self) -> list[str]: 197 | """同步获取表名 198 | 199 | Returns: 200 | list[str]: 表名列表 201 | """ 202 | inspector = inspect(self.sync_engine) 203 | return inspector.get_table_names() 204 | 205 | async def async_get_table_names(self) -> list[str]: 206 | """异步获取表名 207 | 208 | Returns: 209 | list[str]: 表名列表 210 | """ 211 | inspector = inspect(self.sync_engine) 212 | return inspector.get_table_names() 213 | 214 | def sync_get_records(self, model: BodyRecord | Comment1Record | Comment2Record, limit: int = 100, offset: int = 0) -> list[BodyRecord | Comment1Record | Comment2Record]: 215 | """同步获取数据 limit 和 offset 216 | 217 | Args: 218 | model (BodyRecord | Comment1Record | Comment2Record): 数据类型 219 | limit (int, optional): 数据大小. Defaults to 100. 220 | offset (int, optional): 数据偏移. Defaults to 0. 221 | 222 | Returns: 223 | list[BodyRecord | Comment1Record | Comment2Record]: 数据列表 224 | """ 225 | with self.sync_session() as session: 226 | records = session.query(model).limit(limit).offset(offset).all() 227 | return records 228 | 229 | async def async_get_records(self, model: BodyRecord | Comment1Record | Comment2Record, limit: int = 100, offset: int = 0): 230 | """异步获取数据 limit 和 offset 231 | 232 | Args: 233 | model (BodyRecord | Comment1Record | Comment2Record): 数据类型 234 | limit (int, optional): 数据大小. Defaults to 100. 235 | offset (int, optional): 数据偏移. Defaults to 0. 
236 | 237 | Returns: 238 | list[BodyRecord | Comment1Record | Comment2Record]: 数据列表 239 | """ 240 | async with self.async_session() as session: 241 | records = await session.query(model).limit(limit).offset(offset).all() 242 | return records 243 | 244 | # 异步未实现 245 | def sync_get_distinct_category_names(self, ModelCol:Any) -> list[str]: 246 | """同步获取唯一分类名称 247 | 248 | Args: 249 | ModelCol (Any): Model 的 Col 例如 User.name 250 | 251 | Returns: 252 | list[str]: 名称列表 253 | """ 254 | with self.sync_session() as session: 255 | unique_names = session.query(ModelCol).distinct().all() 256 | return unique_names 257 | 258 | # 在这里直接写 SQL 吧,分类太多了.. 259 | 260 | def sql(self, sql_query:str): 261 | """在数据库中写sql 262 | 263 | Args: 264 | sql (str): sql语句 265 | 266 | return: list 267 | """ 268 | with self.sync_session() as session: 269 | result = session.execute(text(sql_query)) 270 | data_as_dicts_auto = [dict(zip(result.keys(), row)) for row in result] 271 | return data_as_dicts_auto 272 | 273 | __all__ = [BodyRecord, Comment1Record, Comment2Record, RecordFrom, DatabaseManager] -------------------------------------------------------------------------------- /WeiBoCrawler/database/sql_record.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from datetime import datetime 3 | from sqlalchemy import BigInteger, JSON, Text, ForeignKey, Enum 4 | from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship 5 | 6 | 7 | class RecordFrom(enum.Enum): 8 | """主要在 BodyRecord 中使用,表示数据来源 9 | 10 | """ 11 | Html = "html" 12 | Api = "api" 13 | 14 | 15 | class Base(DeclarativeBase): 16 | """初始化 registry 属性 17 | 18 | """ 19 | ... 20 | 21 | class AbstractBase(Base): 22 | __abstract__ = True 23 | id: Mapped[int] = mapped_column(primary_key=True) 24 | mid: Mapped[int] = mapped_column(BigInteger) 25 | uid: Mapped[int] = mapped_column(BigInteger) 26 | search_for: Mapped[str] = mapped_column(Text) 27 | create_time: Mapped[datetime] = mapped_column(default=lambda: datetime.now()) 28 | json_data: Mapped[dict] = mapped_column(JSON) 29 | 30 | 31 | class BodyComment1(Base): 32 | """定义 BodyRecord 与 Comment1Record 的关联表 33 | 34 | """ 35 | __tablename__ = 'body_comment1_association' 36 | id: Mapped[int] = mapped_column(primary_key=True) 37 | body_mid: Mapped[int] = mapped_column(BigInteger, ForeignKey('BodyRecord.mid')) 38 | body_uid: Mapped[int] = mapped_column(BigInteger, ForeignKey('BodyRecord.uid')) 39 | comment1_f_mid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment1Record.mid')) 40 | comment1_f_uid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment1Record.uid')) 41 | 42 | 43 | class BodyComment2(Base): 44 | """定义 BodyRecord 与 Comment2Record 的关联表 45 | 46 | """ 47 | __tablename__ = 'body_comment2_association' 48 | id: Mapped[int] = mapped_column(primary_key=True) 49 | body_uid: Mapped[int] = mapped_column(BigInteger, ForeignKey('BodyRecord.uid')) 50 | comment2_f_uid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment2Record.f_uid')) 51 | 52 | 53 | class Comment12(Base): 54 | """定义 Comment1Record 与 Comment2Record 的关联表 55 | 56 | """ 57 | __tablename__ = 'comment1_comment2_association' 58 | id: Mapped[int] = mapped_column(primary_key=True) 59 | comment1_mid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment1Record.mid')) 60 | comment2_f_mid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment2Record.f_mid')) 61 | 62 | 63 | class BodyRecord(AbstractBase): 64 | """存储 Body Record 的数据 65 | 66 | """ 67 | __tablename__ = 
'BodyRecord' 68 | record_from: Mapped[RecordFrom] = mapped_column(Enum(RecordFrom)) 69 | 70 | # 定义关系字段 71 | comment1_records: Mapped[list["Comment1Record"]] = relationship( 72 | lazy=True, 73 | secondary="body_comment1_association", 74 | back_populates='body_records', 75 | primaryjoin="and_(BodyRecord.mid == body_comment1_association.c.body_mid, BodyRecord.uid == body_comment1_association.c.body_uid)", 76 | secondaryjoin="and_(Comment1Record.f_mid == body_comment1_association.c.comment1_f_mid, Comment1Record.f_uid == body_comment1_association.c.comment1_f_uid)", 77 | # cascade="all, delete-orphan", # 这里的 cascade 选项表示当 BodyRecord 被删除时,相关联的 Comment1Record 和 Comment2Record 也会被删除 !!!多对多禁止使用 78 | ) 79 | comment2_records: Mapped[list["Comment2Record"]] = relationship( 80 | lazy=True, 81 | secondary="body_comment2_association", 82 | back_populates='body_records', 83 | primaryjoin="BodyRecord.uid == body_comment2_association.c.body_uid", 84 | secondaryjoin="Comment2Record.f_uid == body_comment2_association.c.comment2_f_uid", 85 | # cascade="all, delete-orphan", # 这里的 cascade 选项表示当 BodyRecord 被删除时,相关联的 Comment1Record 和 Comment2Record 也会被删除 !!!多对多禁止使用 86 | ) 87 | 88 | def __repr__(self): 89 | return f"BodyRecord(id={self.id}, mid={self.mid}, uid={self.uid}, search_for='{self.search_for}', record_from='{self.record_from}', create_time={self.create_time})" 90 | 91 | 92 | class Comment1Record(AbstractBase): 93 | """存储 Comment Record 的数据 94 | 95 | """ 96 | __tablename__ = 'Comment1Record' 97 | f_mid: Mapped[int] = mapped_column(BigInteger) 98 | f_uid: Mapped[int] = mapped_column(BigInteger) 99 | 100 | # 定义关系字段 101 | body_records: Mapped[list["BodyRecord"]] = relationship( 102 | secondary="body_comment1_association", 103 | back_populates='comment1_records', 104 | primaryjoin="and_(Comment1Record.f_mid == body_comment1_association.c.comment1_f_mid, Comment1Record.f_uid == body_comment1_association.c.comment1_f_uid)", 105 | secondaryjoin="and_(BodyRecord.mid == body_comment1_association.c.body_mid, BodyRecord.uid == body_comment1_association.c.body_uid)" 106 | ) 107 | comment2_records: Mapped[list["Comment2Record"]] = relationship( 108 | secondary="comment1_comment2_association", 109 | back_populates='comment1_records', 110 | primaryjoin="Comment1Record.mid == comment1_comment2_association.c.comment1_mid", 111 | secondaryjoin="Comment2Record.f_mid == comment1_comment2_association.c.comment2_f_mid" 112 | ) 113 | 114 | def __repr__(self): 115 | return f"Comment1Record(id={self.id}, mid={self.mid}, uid={self.uid}, f_mid={self.f_mid}, f_uid={self.f_uid}, search_for='{self.search_for}')" 116 | 117 | 118 | class Comment2Record(AbstractBase): 119 | """存储 Comment Record 的数据 120 | 121 | """ 122 | __tablename__ = 'Comment2Record' 123 | f_mid: Mapped[int] = mapped_column(BigInteger) 124 | f_uid: Mapped[int] = mapped_column(BigInteger) 125 | 126 | # 定义关系字段 127 | body_records: Mapped[list["BodyRecord"]] = relationship( 128 | secondary="body_comment2_association", 129 | back_populates='comment2_records', 130 | primaryjoin="Comment2Record.f_uid == body_comment2_association.c.comment2_f_uid", 131 | secondaryjoin="BodyRecord.uid == body_comment2_association.c.body_uid" 132 | ) 133 | comment1_records: Mapped[list["Comment1Record"]] = relationship( 134 | secondary="comment1_comment2_association", 135 | back_populates='comment2_records', 136 | primaryjoin="Comment2Record.f_mid == comment1_comment2_association.c.comment2_f_mid", 137 | secondaryjoin="Comment1Record.mid == comment1_comment2_association.c.comment1_mid" 138 | ) 
139 | 140 | def __repr__(self): 141 | return f"Comment2Record(id={self.id}, mid={self.mid}, uid={self.uid}, f_mid={self.f_mid}, f_uid={self.f_uid}, search_for='{self.search_for}')" 142 | -------------------------------------------------------------------------------- /WeiBoCrawler/pack/BaseDownloader.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from abc import ABC, abstractmethod 3 | from typing import Any 4 | 5 | import httpx 6 | from pydantic import BaseModel 7 | from ..database import db, BodyRecord, Comment1Record, Comment2Record, RecordFrom 8 | from ..util import CustomProgress, cookies_config, log_function_params, logging 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class CommentID(BaseModel): 14 | uid: str 15 | mid: str 16 | 17 | 18 | class BaseDownloader(ABC): 19 | def __init__(self, *, table_name: str, concurrency: int = 100): 20 | self.table_name = table_name 21 | self.semaphore = asyncio.Semaphore(concurrency) 22 | self.db = db 23 | self.res_ids = [] 24 | 25 | @abstractmethod 26 | def _get_request_description(self) -> str: 27 | """获取进度条描述 28 | 29 | Returns: 30 | str: 进度条描述 31 | """ 32 | ... 33 | 34 | @abstractmethod 35 | def _get_request_params(self) -> list: 36 | """获取请求参数列表 37 | 38 | Returns: 39 | list: 请求参数列表 40 | """ 41 | ... 42 | 43 | @abstractmethod 44 | def _process_response(self, response: httpx.Response, *, param: Any) -> None: 45 | """处理请求并存储数据 46 | 47 | Args: 48 | response (httpx.Response): 需要处理的请求 49 | param (Any): 请求参数 50 | """ 51 | ... 52 | 53 | @abstractmethod 54 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None: 55 | """处理请求并存储数据 56 | 57 | Args: 58 | response (httpx.Response): 需要处理的请求 59 | param (Any): 请求参数 60 | """ 61 | ... 62 | 63 | @abstractmethod 64 | async def _download_single_asyncio(self, *, param:Any, client:httpx.Response, progress:CustomProgress, overall_task:int): 65 | """下载单个请求(异步) 66 | 67 | Args: 68 | param (Any): 请求参数 69 | client (httpx.Response): 请求客户端 70 | progress (CustomProgress): 进度条 71 | overall_task (int): 进度条任务ID 72 | """ 73 | ... 74 | 75 | @abstractmethod 76 | def _download_single_sync(self, *, param: Any, client:httpx.Response, progress:CustomProgress, overall_task:int): 77 | """下载单个请求(同步) 78 | 79 | Args: 80 | param (Any): 请求参数 81 | client (httpx.Response): 请求客户端 82 | progress (CustomProgress): 进度条 83 | overall_task (int): 进度条任务ID 84 | """ 85 | ... 
86 | 87 | def _save_to_database(self, items: list[BodyRecord | Comment1Record | Comment2Record]) -> None: 88 | """保存数据到数据库 89 | 90 | Args: 91 | items (list[dict]): 数据列表 92 | """ 93 | res_ids = self.db.sync_add_records(items) 94 | self.res_ids.extend(res_ids) 95 | 96 | async def _save_to_database_asyncio(self, items: list[BodyRecord | Comment1Record | Comment2Record]) -> None: 97 | """保存数据到数据库(异步) 98 | 99 | Args: 100 | items (list[dict]): 数据列表 101 | """ 102 | res_ids = await self.db.async_add_records(items) 103 | self.res_ids.extend(res_ids) 104 | 105 | @log_function_params(logger=logger) 106 | def _check_response(self, response: httpx.Response) -> bool: 107 | """检查响应是否正常 108 | 109 | Args: 110 | response (httpx.Response): 接受到的响应 111 | 112 | Returns: 113 | bool: 有问题返回 False, 否则返回 True 114 | """ 115 | return response.status_code == httpx.codes.OK 116 | 117 | 118 | async def _download_asyncio(self): 119 | """异步下载数据 120 | 121 | """ 122 | with CustomProgress() as progress: 123 | overall_task = progress.add_task( 124 | description=self._get_request_description(), total=len(self._get_request_params()) 125 | ) 126 | async with httpx.AsyncClient(cookies=cookies_config.cookies) as client: 127 | tasks = [] 128 | for param in self._get_request_params(): 129 | async with self.semaphore: 130 | task = asyncio.create_task( 131 | self._download_single_asyncio( 132 | param=param, 133 | client=client, 134 | progress=progress, 135 | overall_task=overall_task, 136 | ) 137 | ) 138 | tasks.append(task) 139 | await asyncio.gather(*tasks) 140 | 141 | def _download_sync(self): 142 | """同步下载数据 143 | 144 | """ 145 | with CustomProgress() as progress: 146 | overall_task = progress.add_task( 147 | description=self._get_request_description(), total=len(self._get_request_params()) 148 | ) 149 | with httpx.Client(cookies=cookies_config.cookies) as client: 150 | for params in self._get_request_params(): 151 | self._download_single_sync(params, client, progress, overall_task) 152 | 153 | def download(self, asynchrony: bool = True) -> None: 154 | """整合异步下载和同步下载 155 | 156 | asynchrony = True 异步下载 157 | asynchrony = False 普通下载 158 | 159 | Args: 160 | asynchrony (bool, optional): 异步下载或者普通下载. Defaults to True. 
161 | """ 162 | if asynchrony: 163 | try: 164 | loop = asyncio.get_running_loop() 165 | loop.run_until_complete(self._download_asyncio()) 166 | except RuntimeError: 167 | asyncio.run(self._download_asyncio()) 168 | else: 169 | self._download_sync() 170 | 171 | 172 | __all__ = [BaseDownloader, BodyRecord, Comment1Record, Comment2Record, RecordFrom] -------------------------------------------------------------------------------- /WeiBoCrawler/pack/__init__.py: -------------------------------------------------------------------------------- 1 | from .get_list_data import get_list_data 2 | from .get_body_data import get_body_data 3 | from .get_comment1_data import get_comment1_data 4 | from .get_comment2_data import get_comment2_data 5 | 6 | 7 | __all__ = [ 8 | "get_list_data", 9 | "get_body_data", 10 | "get_comment1_data", 11 | "get_comment2_data", 12 | ] -------------------------------------------------------------------------------- /WeiBoCrawler/pack/get_body_data.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from typing import Any 3 | from ..util import CustomProgress, retry_timeout_decorator, retry_timeout_decorator_asyncio 4 | from ..parse import process_body_resp 5 | from .BaseDownloader import BaseDownloader, BodyRecord, RecordFrom 6 | from ..request import get_body_response, get_body_response_asyncio 7 | 8 | 9 | class Downloader(BaseDownloader): 10 | def __init__(self, id: list[str] | str, *, table_name: str, concurrency: int = 100): 11 | """下载 Body 页面数据, 并保存在数据库的 id 表中, 数据库位置在 database_config 中. 12 | 13 | Args: 14 | id (Union[List[str], str]): 微博详细页 id, 或者 id 列表. 15 | table_name (str): 存储的位置(数据表名) 16 | concurrency (int, optional): 异步最大并发. Defaults to 100. 17 | """ 18 | super().__init__(table_name=table_name, concurrency=concurrency) 19 | 20 | if isinstance(id, str): 21 | self.ids = [id] 22 | else: 23 | self.ids = id 24 | 25 | def _get_request_description(self) -> str: 26 | """获取进度条描述 27 | 28 | Returns: 29 | str: 进度条描述 30 | """ 31 | return "download..." 
32 | 33 | def _get_request_params(self) -> list: 34 | """获取请求参数列表 35 | 36 | Returns: 37 | list: 请求参数列表 38 | """ 39 | return self.ids 40 | 41 | def _process_items(self, items: list[dict]) -> list[BodyRecord]: 42 | """_summary_ 43 | 44 | Args: 45 | items (list[dict]): _description_ 46 | 47 | Returns: 48 | list[BodyRecord]: _description_ 49 | """ 50 | records = [] 51 | for item in items: 52 | mid = item.get("mid", None) 53 | uid = item.get("uid", None) 54 | record = BodyRecord( 55 | mid=mid, 56 | uid=uid, 57 | search_for=self.table_name, 58 | record_from=RecordFrom.Api, 59 | json_data = item 60 | ) 61 | records.append(record) 62 | return records 63 | 64 | def _process_response(self, response: httpx.Response, *, param: Any) -> None: 65 | """处理请求并存储数据 66 | 67 | Args: 68 | response (httpx.Response): 需要处理的请求 69 | param (Any): 请求参数 70 | """ 71 | items = process_body_resp(response) 72 | records = self._process_items(items) 73 | self._save_to_database(records) 74 | 75 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None: 76 | """处理请求并存储数据 77 | 78 | Args: 79 | response (httpx.Response): 需要处理的请求 80 | param (Any): 请求参数 81 | """ 82 | items = process_body_resp(response) 83 | records = self._process_items(items) 84 | await self._save_to_database_asyncio(records) 85 | 86 | @retry_timeout_decorator_asyncio 87 | async def _download_single_asyncio(self, *, param:Any, client:httpx.Response, progress:CustomProgress, overall_task:int): 88 | """下载单个请求(异步) 89 | 90 | Args: 91 | param (Any): 请求参数 92 | client (httpx.Response): 请求客户端 93 | progress (CustomProgress): 进度条 94 | overall_task (int): 进度条任务ID 95 | """ 96 | response = await get_body_response_asyncio( 97 | id=param, 98 | client=client) 99 | 100 | if self._check_response(response): 101 | await self._process_response_asyncio(response, param=param) 102 | 103 | progress.update(overall_task, advance=1, description=f"{param}") 104 | 105 | @retry_timeout_decorator 106 | def _download_single_sync(self, *, param: Any, client:httpx.Response, progress:CustomProgress, overall_task:int): 107 | """下载单个请求(同步) 108 | 109 | Args: 110 | param (Any): 请求参数 111 | client (httpx.Response): 请求客户端 112 | progress (CustomProgress): 进度条 113 | overall_task (int): 进度条任务ID 114 | """ 115 | response = get_body_response( 116 | id=param, 117 | client=client) 118 | if self._check_response(response): 119 | self._process_response(response, param=param) 120 | 121 | progress.update(overall_task, advance=1, description=f"{param}") 122 | 123 | 124 | 125 | def get_body_data(id: list[str] | str, *, table_name:str, asynchrony: bool = True) -> list: 126 | """获取 body 页面数据 127 | 128 | Args: 129 | id (Union[List[str], str]): 微博详细页 id, 或者 id 列表. 130 | table_name (str): 存储的位置(数据表名) 131 | asynchrony (bool, optional): _description_. Defaults to True. 
132 | 133 | Returns: 134 | list: 存储在数据库中的 id 列表 135 | """ 136 | downloader = Downloader(id = id, table_name=table_name) 137 | downloader.download(asynchrony=asynchrony) 138 | return downloader.res_ids 139 | -------------------------------------------------------------------------------- /WeiBoCrawler/pack/get_comment1_data.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from ..request import get_comments_l1_response, get_comments_l1_response_asyncio 3 | from ..parse import process_comment_resp 4 | from typing import List, Union, Any 5 | from ..util import CustomProgress, retry_timeout_decorator, retry_timeout_decorator_asyncio 6 | from .BaseDownloader import BaseDownloader, CommentID, Comment1Record 7 | 8 | 9 | class Downloader(BaseDownloader): 10 | def __init__(self, *, uid: Union[List[str], str], mid: Union[List[str], str], table_name: str, concurrency: int = 100, max_failed_times: int = 20) -> None: 11 | """根据 uid 和 mid 下载评论数据,并保存在数据库的 mid 表中, 数据库位置在 database_config 中 12 | 13 | Args: 14 | uid (Union[List[str], str]): 用户 ID 15 | mid (Union[List[str], str]): 信息 ID 16 | table_name (str): 存储的位置(数据表名) 17 | concurrency (int, optional): 最大异步并发. Defaults to 100. 18 | max_failed_times (int, optional): 最大失败次数. Defaults to 20. 19 | 20 | Raises: 21 | ValueError: uid and mid must be both str or list and the length of uid and mid must be equal. 22 | """ 23 | super().__init__(table_name=table_name, concurrency=concurrency) 24 | 25 | if isinstance(uid, str) and isinstance(mid, str): 26 | self.ids = [CommentID(uid=uid, mid=mid)] 27 | elif isinstance(uid, list) and isinstance(mid, list) and len(uid) == len(mid): 28 | self.ids = [CommentID(uid=u, mid=m) for u, m in zip(uid, mid)] 29 | else: 30 | raise ValueError("uid and mid must be both str or list and the length of uid and mid must be equal") 31 | 32 | self.max_failed_times = max_failed_times 33 | 34 | 35 | def _get_request_description(self) -> str: 36 | """获取进度条描述 37 | 38 | Returns: 39 | str: 进度条描述 40 | """ 41 | return "download..." 
42 | 43 | def _get_request_params(self) -> list: 44 | """获取请求参数列表 45 | 46 | Returns: 47 | list: 请求参数列表 48 | """ 49 | return self.ids 50 | 51 | def _process_items(self, items: list[dict]) -> list[Comment1Record]: 52 | """_summary_ 53 | 54 | Args: 55 | items (list[dict]): _description_ 56 | 57 | Returns: 58 | list[BodyRecord]: _description_ 59 | """ 60 | records = [] 61 | for item in items: 62 | f_mid = item.get("f_mid", None) 63 | f_uid = item.get("f_uid", None) 64 | mid = item.get("mid", None) 65 | uid = item.get("uid", None) 66 | record = Comment1Record( 67 | f_mid = f_mid, 68 | f_uid = f_uid, 69 | mid=mid, 70 | uid=uid, 71 | search_for=self.table_name, 72 | json_data = item 73 | ) 74 | records.append(record) 75 | return records 76 | 77 | def _process_response(self, response: httpx.Response, *, param: Any) -> None: 78 | """处理请求并存储数据 79 | 80 | Args: 81 | response (httpx.Response): 需要处理的请求 82 | table_name (str): 存储的位置(数据表名) 83 | """ 84 | resp_info, items = process_comment_resp(response) 85 | 86 | for item in items: 87 | item["f_mid"] = param.mid 88 | item["f_uid"] = param.uid 89 | 90 | records = self._process_items(items) 91 | self._save_to_database(records) 92 | return resp_info 93 | 94 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None: 95 | """处理请求并存储数据 96 | 97 | Args: 98 | response (httpx.Response): 需要处理的请求 99 | table_name (str): 存储的位置(数据表名) 100 | """ 101 | resp_info, items = process_comment_resp(response) 102 | 103 | for item in items: 104 | item["f_mid"] = param.mid 105 | item["f_uid"] = param.uid 106 | 107 | records = self._process_items(items) 108 | await self._save_to_database_asyncio(records) 109 | return resp_info 110 | 111 | @retry_timeout_decorator_asyncio 112 | async def _download_single_asyncio(self, *, param:Any, client:httpx.Response, progress:CustomProgress, overall_task:int): 113 | """下载单个请求(异步) 114 | 1. 在这里首先处理第一个评论,因为第一个评论是不需要 max_id 的,所以这里单独处理 115 | 2. 处理每一个评论响应的时候,通过 _process_response 方法获取到 resp_info 116 | 3. 其中 resp_info 包含 max_id, total_number, data_number. 其中 max_id 用于下一个请求, total_number 和 data_number 用于判断是否下载完成 117 | 4. comment 请求有其独有的特性, 在请求次数较多时, 会出现请求失败的情况, 一般来说 failed_times 的上限为 15, 这里取保守值 20. 
118 | 119 | Args: 120 | param (Any): 请求参数 121 | client (httpx.Response): 请求客户端 122 | progress (CustomProgress): 进度条 123 | overall_task (int): 进度条任务ID 124 | """ 125 | response = await get_comments_l1_response_asyncio(uid=param.uid, mid=param.mid, client=client) 126 | if self._check_response(response): 127 | resp_info = await self._process_response_asyncio(response, param=param) 128 | max_id = resp_info.max_id 129 | total_number = resp_info.total_number 130 | count_data_number = resp_info.data_number 131 | failed_times = 0 if resp_info.data_number != 0 else 1 132 | 133 | task = progress.add_task(completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}") 134 | 135 | while (failed_times < self.max_failed_times) and (count_data_number < total_number): 136 | response = await get_comments_l1_response_asyncio(uid=param.uid, mid=param.mid, client=client, max_id=max_id) 137 | if self._check_response(response): 138 | resp_info = await self._process_response_asyncio(response, param=param) 139 | max_id = resp_info.max_id 140 | count_data_number += resp_info.data_number 141 | failed_times = 0 if resp_info.data_number != 0 else failed_times + 1 142 | 143 | progress.update(task, completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}") 144 | 145 | else: 146 | failed_times += 1 147 | 148 | progress.remove_task(task) 149 | progress.update(overall_task, advance=1, description=f"{param.mid}") 150 | 151 | @retry_timeout_decorator 152 | def _download_single_sync(self, *, param: Any, client:httpx.Response, progress:CustomProgress, overall_task:int): 153 | """下载单个请求(同步) 154 | 1. 在这里首先处理第一个评论,因为第一个评论是不需要 max_id 的,所以这里单独处理 155 | 2. 处理每一个评论响应的时候,通过 _process_response 方法获取到 resp_info 156 | 3. 其中 resp_info 包含 max_id, total_number, data_number. 其中 max_id 用于下一个请求, total_number 和 data_number 用于判断是否下载完成 157 | 4. comment 请求有其独有的特性, 在请求次数较多时, 会出现请求失败的情况, 一般来说 failed_times 的上限为 15, 这里取保守值 20. 
158 | 159 | Args: 160 | param (Any): 请求参数 161 | client (httpx.Response): 请求客户端 162 | progress (CustomProgress): 进度条 163 | overall_task (int): 进度条任务ID 164 | """ 165 | response = get_comments_l1_response(uid=param.uid, mid=param.mid, client=client) 166 | if self._check_response(response): 167 | resp_info = self._process_response(response, param=param) 168 | max_id = resp_info.max_id 169 | total_number = resp_info.total_number 170 | count_data_number = resp_info.data_number 171 | failed_times = 0 if resp_info.data_number != 0 else 1 172 | 173 | task = progress.add_task(completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}") 174 | 175 | while (failed_times < self.max_failed_times) and (count_data_number < total_number): 176 | response = get_comments_l1_response(uid=param.uid, mid=param.mid, client=client, max_id=max_id) 177 | if self._check_response(response): 178 | resp_info = self._process_response(response, param=param) 179 | max_id = resp_info.max_id 180 | count_data_number += resp_info.data_number 181 | failed_times = 0 if resp_info.data_number != 0 else failed_times + 1 182 | 183 | progress.update(task, completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}") 184 | 185 | else: 186 | failed_times += 1 187 | 188 | progress.remove_task(task) 189 | progress.update(overall_task, advance=1, description=f"{param.mid}") 190 | 191 | 192 | def get_comment1_data(uid: Union[List[str], str], mid: Union[List[str], str], *, table_name:str, asynchrony: bool = True) -> list: 193 | """根据 uid 和 mid 下载评论数据,并保存在数据库的 mid 表中, 数据库位置在 database_config 中 194 | 195 | Args: 196 | uid (Union[List[str], str]): 用户 ID 197 | mid (Union[List[str], str]): 信息 ID 198 | table_name (str): 存储的位置(数据表名) 199 | concurrency (int, optional): 最大异步并发. Defaults to 100. 200 | 201 | Raises: 202 | ValueError: uid and mid must be both str or list and the length of uid and mid must be equal. 203 | 204 | Returns: 205 | list: 存储在数据库中的 id 列表 206 | """ 207 | downloader = Downloader(uid=uid, mid=mid, table_name=table_name) 208 | downloader.download(asynchrony=asynchrony) 209 | return downloader.res_ids -------------------------------------------------------------------------------- /WeiBoCrawler/pack/get_comment2_data.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from ..request import get_comments_l2_response, get_comments_l2_response_asyncio 3 | from ..parse import process_comment_resp 4 | from typing import List, Union, Any 5 | from ..util import CustomProgress, retry_timeout_decorator, retry_timeout_decorator_asyncio 6 | from .BaseDownloader import BaseDownloader, CommentID, Comment2Record 7 | 8 | 9 | 10 | class Downloader(BaseDownloader): 11 | def __init__(self, *, uid: Union[List[str], str], mid: Union[List[str], str], table_name: str, concurrency: int = 100, max_failed_times: int = 20) -> None: 12 | """根据 uid 和 mid 下载评论数据,并保存在数据库的 mid 表中, 数据库位置在 database_config 中 13 | 14 | Args: 15 | uid (Union[List[str], str]): 用户 ID 16 | mid (Union[List[str], str]): 信息 ID 17 | table_name (str): 存储的位置(数据表名) 18 | concurrency (int, optional): 最大异步并发. Defaults to 100. 19 | max_failed_times (int, optional): 最大失败次数. Defaults to 20. 20 | 21 | Raises: 22 | ValueError: uid and mid must be both str or list and the length of uid and mid must be equal. 
23 | """ 24 | super().__init__(table_name=table_name ,concurrency=concurrency) 25 | 26 | if isinstance(uid, str) and isinstance(mid, str): 27 | self.ids = [CommentID(uid=uid, mid=mid)] 28 | elif isinstance(uid, list) and isinstance(mid, list) and len(uid) == len(mid): 29 | self.ids = [CommentID(uid=u, mid=m) for u, m in zip(uid, mid)] 30 | else: 31 | raise ValueError("uid and mid must be both str or list and the length of uid and mid must be equal") 32 | 33 | self.max_failed_times = max_failed_times 34 | 35 | 36 | def _get_request_description(self) -> str: 37 | """获取进度条描述 38 | 39 | Returns: 40 | str: 进度条描述 41 | """ 42 | return "download..." 43 | 44 | def _get_request_params(self) -> list: 45 | """获取请求参数列表 46 | 47 | Returns: 48 | list: 请求参数列表 49 | """ 50 | return self.ids 51 | 52 | 53 | def _process_items(self, items: list[dict]) -> list[Comment2Record]: 54 | """_summary_ 55 | 56 | Args: 57 | items (list[dict]): _description_ 58 | 59 | Returns: 60 | list[BodyRecord]: _description_ 61 | """ 62 | records = [] 63 | for item in items: 64 | f_mid = item.get("f_mid", None) 65 | f_uid = item.get("f_uid", None) 66 | mid = item.get("mid", None) 67 | uid = item.get("uid", None) 68 | record = Comment2Record( 69 | f_mid = f_mid, 70 | f_uid = f_uid, 71 | mid=mid, 72 | uid=uid, 73 | search_for=self.table_name, 74 | json_data = item 75 | ) 76 | records.append(record) 77 | return records 78 | 79 | def _process_response(self, response: httpx.Response, *, param: Any) -> None: 80 | """处理请求并存储数据 81 | 82 | Args: 83 | response (httpx.Response): 需要处理的请求 84 | param (Any): 请求参数 85 | """ 86 | resp_info, items = process_comment_resp(response) 87 | for item in items: 88 | item["f_mid"] = param.mid 89 | item["f_uid"] = param.uid 90 | 91 | records = self._process_items(items) 92 | self._save_to_database(records) 93 | return resp_info 94 | 95 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None: 96 | """处理请求并存储数据 97 | 98 | Args: 99 | response (httpx.Response): 需要处理的请求 100 | table_name (str): 存储的位置(数据表名) 101 | """ 102 | resp_info, items = process_comment_resp(response) 103 | 104 | for item in items: 105 | item["f_mid"] = param.mid 106 | item["f_uid"] = param.uid 107 | 108 | records = self._process_items(items) 109 | await self._save_to_database_asyncio(records) 110 | return resp_info 111 | 112 | @retry_timeout_decorator_asyncio 113 | async def _download_single_asyncio(self, *, param:Any, client:httpx.Response, progress:CustomProgress, overall_task:int): 114 | """下载单个请求(异步) 115 | 1. 在这里首先处理第一个评论,因为第一个评论是不需要 max_id 的,所以这里单独处理 116 | 2. 处理每一个评论响应的时候,通过 _process_response 方法获取到 resp_info 117 | 3. 其中 resp_info 包含 max_id, total_number, data_number. 其中 max_id 用于下一个请求, total_number 和 data_number 用于判断是否下载完成 118 | 4. comment 请求有其独有的特性, 在请求次数较多时, 会出现请求失败的情况, 一般来说 failed_times 的上限为 15, 这里取保守值 20. 
119 | 120 | Args: 121 | param (Any): 请求参数 122 | client (httpx.Response): 请求客户端 123 | progress (CustomProgress): 进度条 124 | overall_task (int): 进度条任务ID 125 | """ 126 | response = await get_comments_l2_response_asyncio(uid=param.uid, mid=param.mid, client=client) 127 | if self._check_response(response): 128 | resp_info = await self._process_response_asyncio(response, param=param) 129 | max_id = resp_info.max_id 130 | total_number = resp_info.total_number 131 | count_data_number = resp_info.data_number 132 | failed_times = 0 if resp_info.data_number != 0 else 1 133 | 134 | task = progress.add_task(completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}") 135 | 136 | while (failed_times < self.max_failed_times) and (count_data_number < total_number): 137 | response = await get_comments_l2_response_asyncio(uid=param.uid, mid=param.mid, client=client, max_id=max_id) 138 | if self._check_response(response): 139 | resp_info = await self._process_response_asyncio(response, param=param) 140 | max_id = resp_info.max_id 141 | count_data_number += resp_info.data_number 142 | failed_times = 0 if resp_info.data_number != 0 else failed_times + 1 143 | 144 | progress.update(task, completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}") 145 | 146 | else: 147 | failed_times += 1 148 | 149 | progress.remove_task(task) 150 | progress.update(overall_task, advance=1, description=f"{param.mid}") 151 | 152 | @retry_timeout_decorator 153 | def _download_single_sync(self, *, param: Any, client:httpx.Response, progress:CustomProgress, overall_task:int): 154 | """下载单个请求(同步) 155 | 1. 在这里首先处理第一个评论,因为第一个评论是不需要 max_id 的,所以这里单独处理 156 | 2. 处理每一个评论响应的时候,通过 _process_response 方法获取到 resp_info 157 | 3. 其中 resp_info 包含 max_id, total_number, data_number. 其中 max_id 用于下一个请求, total_number 和 data_number 用于判断是否下载完成 158 | 4. comment 请求有其独有的特性, 在请求次数较多时, 会出现请求失败的情况, 一般来说 failed_times 的上限为 15, 这里取保守值 20. 
159 | 160 | Args: 161 | param (Any): 请求参数 162 | client (httpx.Response): 请求客户端 163 | progress (CustomProgress): 进度条 164 | overall_task (int): 进度条任务ID 165 | """ 166 | response = get_comments_l2_response(uid=param.uid, mid=param.mid, client=client) 167 | if self._check_response(response): 168 | resp_info = self._process_response(response, param=param) 169 | max_id = resp_info.max_id 170 | total_number = resp_info.total_number 171 | count_data_number = resp_info.data_number 172 | failed_times = 0 if resp_info.data_number != 0 else 1 173 | 174 | task = progress.add_task(completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}") 175 | 176 | while (failed_times < self.max_failed_times) and (count_data_number < total_number): 177 | response = get_comments_l2_response(uid=param.uid, mid=param.mid, client=client, max_id=max_id) 178 | if self._check_response(response): 179 | resp_info = self._process_response(response, param=param) 180 | max_id = resp_info.max_id 181 | count_data_number += resp_info.data_number 182 | failed_times = 0 if resp_info.data_number != 0 else failed_times + 1 183 | 184 | progress.update(task, completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}") 185 | 186 | else: 187 | failed_times += 1 188 | 189 | progress.remove_task(task) 190 | progress.update(overall_task, advance=1, description=f"{param.mid}") 191 | 192 | def get_comment2_data(uid: Union[List[str], str], mid: Union[List[str], str], *, table_name: str, asynchrony: bool = True) -> list: 193 | """根据 uid 和 mid 下载评论数据,并保存在数据库的 mid 表中, 数据库位置在 database_config 中 194 | 195 | Args: 196 | uid (Union[List[str], str]): 用户 ID 197 | mid (Union[List[str], str]): 信息 ID 198 | table_name (str): 存储的位置(数据表名) 199 | concurrency (int, optional): 最大异步并发. Defaults to 100. 200 | 201 | Raises: 202 | ValueError: uid and mid must be both str or list and the length of uid and mid must be equal. 203 | 204 | Returns: 205 | list: 存储在数据库中的 id 列表 206 | """ 207 | downloader = Downloader(uid=uid, mid=mid, table_name=table_name) 208 | downloader.download(asynchrony=asynchrony) 209 | return downloader.res_ids -------------------------------------------------------------------------------- /WeiBoCrawler/pack/get_list_data.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from datetime import datetime 3 | from typing import Literal, Optional, Any 4 | from ..util import CustomProgress, retry_timeout_decorator, retry_timeout_decorator_asyncio 5 | from ..request import get_list_response_asyncio, get_list_response 6 | from ..parse import parse_list_html 7 | from .BaseDownloader import BaseDownloader, BodyRecord, RecordFrom 8 | 9 | 10 | class Downloader(BaseDownloader): 11 | def __init__(self, search_for: str, *, table_name: str, kind : Literal["综合", "实时", "高级"] = "综合", 12 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end:Optional[datetime]=None, concurrency: int = 100): 13 | """下载 List 页面数据, 并保存在数据库的 search_for 表中, 数据库位置在 database_config 中. 14 | 15 | Args: 16 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 # 17 | table_name (str): 存储的位置(数据表名) 18 | kind (Literal[, optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合". 19 | advanced_kind (Literal[, optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合". 20 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to Optional[datetime]. 
21 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to Optional[datetime]. 22 | concurrency (int, optional): 异步最大并发. Defaults to 100. 23 | """ 24 | super().__init__(table_name=table_name, concurrency=concurrency) 25 | 26 | self.search_for = search_for 27 | self.kind = kind 28 | self.advanced_kind = advanced_kind 29 | self.time_start = time_start 30 | self.time_end = time_end 31 | 32 | 33 | def _get_request_description(self) -> str: 34 | """获取进度条描述 35 | 36 | Returns: 37 | str: 进度条描述 38 | """ 39 | return "download..." 40 | 41 | def _get_request_params(self) -> list: 42 | """获取请求参数列表 43 | 44 | Returns: 45 | list: 请求参数列表 46 | """ 47 | return list(range(1, 51)) 48 | 49 | def _process_items(self, items: list[dict]) -> list[BodyRecord]: 50 | """将解析列表页得到的 item 字典转换为 BodyRecord 记录 51 | 52 | Args: 53 | items (list[dict]): 解析列表页 html 得到的字典列表 54 | 55 | Returns: 56 | list[BodyRecord]: 待存入数据库的记录列表 57 | """ 58 | records = [] 59 | for item in items: 60 | mid = item.get("mid", None) 61 | uid = item.get("uid", None) 62 | record = BodyRecord( 63 | mid=mid, 64 | uid=uid, 65 | search_for=self.table_name, 66 | record_from=RecordFrom.Html, 67 | json_data = item 68 | ) 69 | records.append(record) 70 | return records 71 | 72 | def _process_response(self, response: httpx.Response, *, param: Any) -> None: 73 | """处理请求并存储数据 74 | 75 | Args: 76 | response (httpx.Response): 需要处理的响应 77 | param (Any): 请求参数 78 | """ 79 | items = parse_list_html(response.text) 80 | records = self._process_items(items) 81 | self._save_to_database(records) 82 | 83 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None: 84 | """处理请求并存储数据 85 | 86 | Args: 87 | response (httpx.Response): 需要处理的响应 88 | param (Any): 请求参数 89 | """ 90 | items = parse_list_html(response.text) 91 | records = self._process_items(items) 92 | await self._save_to_database_asyncio(records) 93 | 94 | @retry_timeout_decorator_asyncio 95 | async def _download_single_asyncio(self, *, param:Any, client:httpx.AsyncClient, progress:CustomProgress, overall_task:int): 96 | """下载单个请求(异步) 97 | 98 | Args: 99 | param (Any): 请求参数 100 | client (httpx.AsyncClient): 请求客户端 101 | progress (CustomProgress): 进度条 102 | overall_task (int): 进度条任务ID 103 | """ 104 | response = await get_list_response_asyncio( 105 | search_for=self.search_for, 106 | kind=self.kind, 107 | advanced_kind=self.advanced_kind, 108 | time_start=self.time_start, 109 | time_end=self.time_end, 110 | page_index=param, 111 | client=client) 112 | 113 | if self._check_response(response): 114 | await self._process_response_asyncio(response, param=param) 115 | 116 | progress.update(overall_task, advance=1, description=f"{param}...") 117 | 118 | @retry_timeout_decorator 119 | def _download_single_sync(self, *, param: Any, client:httpx.Client, progress:CustomProgress, overall_task:int): 120 | """下载单个请求(同步) 121 | 122 | Args: 123 | param (Any): 请求参数 124 | client (httpx.Client): 请求客户端 125 | progress (CustomProgress): 进度条 126 | overall_task (int): 进度条任务ID 127 | """ 128 | response = get_list_response( 129 | search_for=self.search_for, 130 | kind=self.kind, 131 | advanced_kind=self.advanced_kind, 132 | time_start=self.time_start, 133 | time_end=self.time_end, 134 | page_index=param, 135 | client=client) 136 | 137 | if self._check_response(response): 138 | self._process_response(response, param=param) 139 | 140 | progress.update(overall_task, advance=1, description=f"{param}") 141 | 142 | 143 | def get_list_data(search_for: str, *, table_name: str, asynchrony: bool = True, kind : Literal["综合", "实时", "高级"]
= "综合", 144 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end:Optional[datetime]=None) -> list: 145 | """获取 List 页面数据 146 | 147 | Args: 148 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #. 149 | table_name (str): 存储的位置(数据表名) 150 | asynchrony (bool, optional): _description_. Defaults to True. 151 | kind (Literal[, optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合". 152 | advanced_kind (Literal[, optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合". 153 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to None. 154 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None. 155 | 156 | Returns: 157 | list: 存储在数据库中的 id 列表 158 | """ 159 | downloader = Downloader(search_for=search_for, table_name=table_name, kind=kind, advanced_kind=advanced_kind, time_start=time_start, time_end=time_end) 160 | downloader.download(asynchrony=asynchrony) 161 | return downloader.res_ids 162 | -------------------------------------------------------------------------------- /WeiBoCrawler/parse/__init__.py: -------------------------------------------------------------------------------- 1 | from .process_list import process_list_documents 2 | from .process_comment import process_comment_documents, process_comment_resp 3 | from .process_body import process_body_documents, process_body_resp 4 | from .parse_list_html import parse_list_html 5 | 6 | __all__ = [ 7 | "process_list_documents", 8 | "process_comment_documents", 9 | "process_body_documents", 10 | 11 | "parse_list_html", 12 | 13 | "process_body_resp", 14 | "process_comment_resp" 15 | ] -------------------------------------------------------------------------------- /WeiBoCrawler/parse/parse_list_html.py: -------------------------------------------------------------------------------- 1 | import re 2 | import parsel 3 | from typing import Optional, List 4 | from ..util import custom_validate_call, process_time_str 5 | 6 | 7 | @custom_validate_call 8 | def get_mid(select: parsel.Selector) -> Optional[str]: 9 | """获取微博的mid 10 | 11 | Args: 12 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 13 | 14 | Returns: 15 | Optional[str]: 微博的mid 16 | """ 17 | mid = select.xpath("//div[@mid]/@mid").get() 18 | return mid 19 | 20 | 21 | @custom_validate_call 22 | def get_uid(select: parsel.Selector) -> Optional[str]: 23 | """获取微博的uid 24 | 25 | Args: 26 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 27 | 28 | Returns: 29 | Optional[str]: 微博的uid 30 | """ 31 | uid = select.xpath("//a[@nick-name]/@href").get() 32 | if uid is None: 33 | return None 34 | else: 35 | uid = re.search(r"/(\d+)/?", uid).group(1) 36 | return uid 37 | 38 | @custom_validate_call 39 | def get_mblogid(select: parsel.Selector) -> Optional[str]: 40 | """获取微博的mblogid 41 | 42 | Args: 43 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 44 | 45 | Returns: 46 | Optional[str]: 微博的mblogid 47 | """ 48 | mblogid = select.xpath('//div[@class="from"]/a[1]/@href').get() 49 | if mblogid is None: 50 | return None 51 | else: 52 | mblogid = re.search(r"/(\w+)\?", mblogid).group(1) 53 | return mblogid 54 | 55 | 56 | @custom_validate_call 57 | def get_personal_name(select: parsel.Selector) -> Optional[str]: 58 | """获取微博的个人名称 59 | 60 | Args: 61 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 62 | 63 | Returns: 64 | Optional[str]: 微博的个人名称 65 | """ 66 | personal_name = select.xpath("//a[@nick-name]/@nick-name").get() 67 | return personal_name 68 | 69 | @custom_validate_call 70 
| def get_personal_href(select: parsel.Selector) -> Optional[str]: 71 | """获取微博的个人主页 72 | 73 | Args: 74 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 75 | 76 | Returns: 77 | Optional[str]: 个人主页的 URL 78 | """ 79 | personal_href = select.xpath("//a[@nick-name]/@href").get() 80 | if personal_href is None: 81 | return None 82 | else: 83 | return "https:" + personal_href 84 | 85 | 86 | def get_weibo_href(select: parsel.Selector) -> Optional[str]: 87 | """获取微博的链接 88 | 89 | Args: 90 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 91 | 92 | Returns: 93 | Optional[str]: 微博的链接 94 | """ 95 | weibo_href = select.xpath('//div[@class="from"]/a[1]/@href').get() 96 | if weibo_href is None: 97 | return None 98 | else: 99 | return "https:" + weibo_href 100 | 101 | 102 | @custom_validate_call 103 | def get_publish_time(select: parsel.Selector) -> Optional[str]: 104 | """获取微博的发布时间 105 | 106 | Args: 107 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 108 | 109 | Returns: 110 | Optional[datetime]: 微博的发布时间 111 | """ 112 | publish_time_str = select.xpath('//div[@class="from"]/a[1]/text()').get() 113 | if publish_time_str is None: 114 | return publish_time_str 115 | else: 116 | publish_time = process_time_str(publish_time_str).strftime("%Y-%m-%d %H:%M:%S") 117 | return publish_time 118 | 119 | @custom_validate_call 120 | def get_content_from(select:parsel.Selector) -> Optional[str]: 121 | """获取微博的发送设备 122 | 123 | Args: 124 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 125 | 126 | Returns: 127 | Optional[str]: 微博的发送设备 128 | """ 129 | content_from = select.xpath('//div[@class="from"]/a[2]/text()').get() 130 | return content_from 131 | 132 | @custom_validate_call 133 | def get_content_all(select:parsel.Selector) -> Optional[str]: 134 | """获取微博的内容 135 | 136 | Args: 137 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 138 | 139 | Returns: 140 | Optional[str]: 微博的内容 141 | """ 142 | content_all = select.xpath('string(//p[@node-type="feed_list_content_full"])').get() 143 | content_all = re.sub(r"\n[ \t]+", "\n", content_all) 144 | content_all = re.sub(r"(? 
Optional[int]: 159 | """获取微博的转发数量 160 | 161 | Args: 162 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 163 | 164 | Returns: 165 | Optional[int]: 微博的转发数量 166 | """ 167 | retweet_num = select.xpath('string(//div[@class="card-act"]/ul[1]/li[1])').get() 168 | if retweet_num: 169 | retweet_num = re.findall(r"\d+", retweet_num) 170 | return int(retweet_num[0]) if retweet_num else 0 171 | else: 172 | return None 173 | 174 | 175 | @custom_validate_call 176 | def get_comment_num(select:parsel.Selector) -> Optional[int]: 177 | """获取微博的评论数量 178 | 179 | Args: 180 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 181 | 182 | Returns: 183 | Optional[int]: 微博的评论数量 184 | """ 185 | comment_num = select.xpath('string(//div[@class="card-act"]/ul[1]/li[2])').get() 186 | if comment_num: 187 | comment_num = re.findall(r"\d+", comment_num) 188 | return int(comment_num[0]) if comment_num else 0 189 | else: 190 | return None 191 | 192 | @custom_validate_call 193 | def get_star_num(select: parsel.Selector) -> Optional[int]: 194 | """获取微博的点赞数量 195 | 196 | Args: 197 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象 198 | 199 | Returns: 200 | Optional[int]: 微博的点赞数量 201 | """ 202 | star_num = select.xpath('string(//div[@class="card-act"]/ul[1]/li[3])').get() 203 | if star_num: 204 | star_num = re.findall(r"\d+", star_num) 205 | return int(star_num[0]) if star_num else 0 206 | else: 207 | return None 208 | 209 | 210 | def parse_list_html(html: str) -> List[dict]: 211 | """解析微博列表主体的html 212 | 213 | Args: 214 | html (str): 爬虫获取到的 html 文本 215 | 216 | Returns: 217 | List[dict]: 整理后的 List[dict] 218 | """ 219 | select = parsel.Selector(html) 220 | check_div_mpage = select.css("div.m-page").get() 221 | if check_div_mpage is None: 222 | return [] 223 | else: 224 | div_list = select.xpath('//*[@id="pl_feedlist_index"]//div[@action-type="feed_list_item"]').getall() 225 | lst = [] 226 | for div_string in div_list: 227 | select = parsel.Selector(div_string) 228 | item = { 229 | "mid": get_mid(select), 230 | "uid": get_uid(select), 231 | "mblogid": get_mblogid(select), 232 | "personal_name": get_personal_name(select), 233 | "personal_href": get_personal_href(select), 234 | "weibo_href": get_weibo_href(select), 235 | "publish_time": get_publish_time(select), 236 | "content_from": get_content_from(select), 237 | "content_all": get_content_all(select), 238 | "retweet_num": get_retweet_num(select), 239 | "comment_num": get_comment_num(select), 240 | "star_num": get_star_num(select), 241 | } 242 | lst.append(item) 243 | return lst -------------------------------------------------------------------------------- /WeiBoCrawler/parse/process_body.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from ..util import process_base_document, process_base_documents 3 | 4 | def process_body_resp(resp): 5 | """处理详细页数据 6 | 7 | 这里一般都会收到正常的响应,所以只需要处理数据即可. 8 | Args: 9 | resp (httpx.Response): 接受到的响应. 10 | 11 | Returns: 12 | list[dict]: 响应的数据, 这里使用 list 包装一下(对齐其他的process请求). 13 | """ 14 | data = resp.json() 15 | transform_dict = { 16 | "mid": "mid", 17 | "uid": ["user", "idstr"], 18 | } 19 | data.update(process_base_document(data, transform_dict)) 20 | return [data] 21 | 22 | 23 | def process_body_documents(documents: list[dict]) -> pd.DataFrame: 24 | """将 documents 处理成 dataframe 的形式 25 | 26 | transform_dict = { 27 | "转发数量": "retweet_num", 28 | "评论数量": "comment_num", 29 | "点赞数量": "star_num", 30 | ... 
31 | } 32 | 33 | Args: 34 | documents (list[dict]): 文档列表 35 | transform_dict (dict): 转换字典, key 是转化后的字段, value 是原始字段 36 | 37 | Returns: 38 | pd.DataFrame: (去重)处理后得到的表格 39 | """ 40 | transform_dict = { 41 | "mid": "mid", 42 | "uid": ["user", "idstr"], 43 | "mblogid": "mblogid", 44 | "个人昵称": ["user", "screen_name"], 45 | 46 | "用户性别": ["longText", "user", "gender"], 47 | 48 | "用户定位": ["longText","user", "location"], 49 | "用户粉丝": ["longText","user", "followers_count"], 50 | "用户累计评论数": ["user", "status_total_counter", "comment_cnt"], 51 | "用户累计转发数": ["user", "status_total_counter", "repost_cnt"], 52 | "用户累计点赞数": ["user", "status_total_counter", "like_cnt"], 53 | "用户累计评转赞": ["user", "status_total_counter", "total_cnt"], 54 | "发布时间": "created_at", 55 | "原生内容": "text", 56 | "展示内容": "text_raw", 57 | 58 | "转发数量": "reposts_count", 59 | "评论数量": "comments_count", 60 | "点赞数量": "attitudes_count", 61 | } 62 | df = process_base_documents(documents, transform_dict) 63 | return df -------------------------------------------------------------------------------- /WeiBoCrawler/parse/process_comment.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import httpx 3 | from pydantic import BaseModel 4 | import pandas as pd 5 | from ..util import process_base_documents, process_base_document 6 | 7 | class CommmentResponseInfo(BaseModel): 8 | max_id: str 9 | total_number: int 10 | data_number: int 11 | 12 | 13 | 14 | def process_comment_resp(resp: httpx.Response) -> Tuple[CommmentResponseInfo, list]: 15 | """处理评论数据 16 | 17 | 这里有三种方式判断 resp 是否正常: 18 | 1. 正常响应头中会有 content-encoding:gzip, 而不正常的响应头中相应位置为 content-length: 117(或者其他) 19 | 2. 正常响应中会有 filter_group 字段, 不正常响应中没有该字段, 20 | 3. 无论正常还是非正常响应中都有 data 字段, 正常响应 data 字段内容为 [dict], 非正常响应 data 字段内容为 [] 21 | 22 | 目前使用第三种方法. 23 | 24 | Args: 25 | resp (httpx.Response): 接受到的响应. 26 | 27 | Returns: 28 | Tuple[dict, list]: 前面是 请求的信息(后面要用到), 后面是数据 29 | """ 30 | data = resp.json() 31 | max_id = data.get("max_id", "") 32 | total_number = data.get("total_number", 0) 33 | data_number = len(data.get("data", [])) 34 | 35 | data_list = data["data"] 36 | 37 | transform_dict = { 38 | "mid": "mid", 39 | "uid": ["user", "idstr"], 40 | } 41 | 42 | [data.update(process_base_document(data, transform_dict)) for data in data_list] 43 | 44 | resp_info = CommmentResponseInfo(max_id=str(max_id), total_number=int(total_number), data_number=data_number) 45 | return resp_info, data_list 46 | 47 | 48 | 49 | 50 | 51 | def process_comment_documents(documents: list[dict]) -> pd.DataFrame: 52 | """将表处理成 dataframe 的形式 53 | 54 | transform_dict = { 55 | "转发数量": "retweet_num", 56 | "评论数量": "comment_num", 57 | "点赞数量": "star_num", 58 | ... 
59 | } 60 | 61 | Args: 62 | table (Table): 需要处理的表 63 | transform_dict (dict): 转换字典, key 是转化后的字段, value 是原始字段 64 | 65 | Returns: 66 | pd.DataFrame: (去重)处理后得到的表格 67 | """ 68 | transform_dict = { 69 | "f_mid": "f_mid", 70 | "f_uid": "f_uid", 71 | "mid": "mid", 72 | "uid": ["user", "id"], 73 | "个人昵称": ["user", "screen_name"], 74 | "用户性别": ["user", "gender"], 75 | "用户定位": ["user", "location"], 76 | "用户粉丝": ["user", "followers_count"], 77 | "用户累计评论数": ["user", "status_total_counter", "comment_cnt"], 78 | "用户累计转发数": ["user", "status_total_counter", "repost_cnt"], 79 | "用户累计点赞数": ["user", "status_total_counter", "like_cnt"], 80 | "用户累计评转赞": ["user", "status_total_counter", "total_cnt"], 81 | "发布时间": "created_at", 82 | "原生内容": "text", 83 | "展示内容": "text_raw", 84 | "评论数量": "total_number", 85 | "点赞数量": "like_counts", 86 | } 87 | df = process_base_documents(documents, transform_dict) 88 | return df -------------------------------------------------------------------------------- /WeiBoCrawler/parse/process_list.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from ..util import process_base_documents 3 | 4 | def process_list_documents(documents: list[dict]) -> pd.DataFrame: 5 | """将 documents 处理成 dataframe 的形式 6 | 7 | transform_dict = { 8 | "转发数量": "retweet_num", 9 | "评论数量": "comment_num", 10 | "点赞数量": "star_num", 11 | ... 12 | } 13 | 14 | Args: 15 | documents (list[dict]): 文档列表 16 | transform_dict (dict): 转换字典, key 是转化后的字段, value 是原始字段 17 | 18 | Returns: 19 | pd.DataFrame: (去重)处理后得到的表格 20 | """ 21 | transform_dict = { 22 | "mid": "mid", 23 | "uid": "uid", 24 | "mblogid": "mblogid", 25 | "个人昵称": "personal_name", 26 | "个人主页": "personal_href", 27 | "微博链接": "weibo_href", 28 | "发布时间": "publish_time", 29 | "内容来自": "content_from", 30 | "全部内容": "content_all", 31 | "转发数量": "retweet_num", 32 | "评论数量": "comment_num", 33 | "点赞数量": "star_num", 34 | } 35 | df = process_base_documents(documents, transform_dict) 36 | return df -------------------------------------------------------------------------------- /WeiBoCrawler/request/__init__.py: -------------------------------------------------------------------------------- 1 | from .get_list_request import get_list_response, get_list_response_asyncio 2 | from .get_body_request import get_body_response, get_body_response_asyncio 3 | from .get_comment_request import get_comments_l1_response, get_comments_l2_response, get_comments_l1_response_asyncio, get_comments_l2_response_asyncio 4 | from .get_cookies import get_qr_Info, get_qr_status 5 | 6 | __all__ = [ 7 | "get_list_response", 8 | "get_body_response", 9 | "get_comments_l1_response", 10 | "get_comments_l2_response", 11 | 12 | "get_list_response_asyncio", 13 | "get_body_response_asyncio", 14 | "get_comments_l1_response_asyncio", 15 | "get_comments_l2_response_asyncio", 16 | 17 | "get_qr_Info", 18 | "get_qr_status", 19 | ] -------------------------------------------------------------------------------- /WeiBoCrawler/request/get_body_request.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from .util import request_headers 3 | 4 | 5 | def build_body_params(id: str) -> tuple: 6 | """构建微博详细页参数 7 | 微博详细页id位置(https://weibo.com/{userid}/{id}?{params}): 8 | 1. 找到需要爬取的微博内容页, 比如: 9 | https://weibo.com/1644114654/OiZre8dir?refer_flag=1001030103_ -> id = OiZre8dir 10 | 11 | Args: 12 | id (str): 微博详细页id. 13 | 14 | Returns: 15 | tuple: (url, params, headers). 
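Example (illustrative sketch, not part of the original docstring; assumes ``cookies`` is a dict of valid logged-in Weibo cookies):

        url, params, headers = build_body_params("OiZre8dir")
        with httpx.Client(cookies=cookies) as client:
            response = client.get(url, params=params, headers=headers)
            body = response.json()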
16 | """ 17 | headers = request_headers.body_headers 18 | url = "https://weibo.com/ajax/statuses/show" 19 | params = { 20 | "id": f"{id}", 21 | "locale": "zh-CN", 22 | "isGetLongText": "true" 23 | } 24 | return url, params, headers 25 | 26 | 27 | def get_body_response(id: str, *, client: httpx.Client) -> httpx.Response: 28 | """获取微博详细页的请求结果 29 | 微博详细页id位置(https://weibo.com/{userid}/{id}?{params}): 30 | 1. 找到需要爬取的微博内容页, 比如: 31 | https://weibo.com/1644114654/OiZre8dir?refer_flag=1001030103_ -> id = OiZre8dir 32 | 33 | Args: 34 | id (str): 微博详细页id. 35 | client (httpx.Client): 客户端. 36 | 37 | Returns: 38 | httpx.Response: 返回的请求结果. 39 | """ 40 | url, params, headers = build_body_params(id) 41 | response = client.get(url, params=params, headers=headers) 42 | return response 43 | 44 | 45 | async def get_body_response_asyncio(id:str, *, client: httpx.AsyncClient) -> httpx.Response: 46 | """获取微博详细页的请求结果(异步) 47 | 微博详细页id位置(https://weibo.com/{userid}/{id}?{params}): 48 | 1. 找到需要爬取的微博内容页, 比如: 49 | https://weibo.com/1644114654/OiZre8dir?refer_flag=1001030103_ -> id = OiZre8dir 50 | 51 | Args: 52 | id (str): 微博详细页id. 53 | client (httpx.AsyncClient): 异步客户端. 54 | 55 | Returns: 56 | httpx.Response: 返回的请求结果. 57 | """ 58 | url, params, headers = build_body_params(id) 59 | response = await client.get(url, params=params, headers=headers) 60 | return response -------------------------------------------------------------------------------- /WeiBoCrawler/request/get_comment_request.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from .util import request_headers 3 | from typing import Optional 4 | 5 | def build_comments_l1_params(uid: str, mid : str, *, max_id: Optional[str]=None) -> tuple: 6 | """构建微博主体一级评论的参数 7 | 8 | Args: 9 | uid (str): 微博的uid 10 | mid (str): 微博的mid 11 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None. 12 | 13 | Returns: 14 | tuple: (url, params, headers). 15 | """ 16 | url = "https://weibo.com/ajax/statuses/buildComments" 17 | headers = request_headers.comment1_buildComments_headers 18 | 19 | params = { 20 | "is_reload": "1", 21 | "id": f"{mid}", 22 | "is_show_bulletin": "2", 23 | "is_mix": "0", 24 | "count": "20", 25 | "uid": f"{uid}", 26 | "fetch_level": "0", 27 | "locale": "zh-CN", 28 | } 29 | if max_id is not None: 30 | params["flow"] = "0" 31 | params["max_id"] = max_id 32 | 33 | return url, params, headers 34 | 35 | 36 | def build_comments_l2_params(uid: str, mid : str, *, max_id: Optional[str]=None) -> tuple: 37 | """构建微博主体二级评论的参数 38 | 39 | Args: 40 | uid (str): 微博的uid 41 | mid (str): 微博的mid 42 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None. 43 | 44 | Returns: 45 | tuple: (url, params, headers). 
46 | """ 47 | url = "https://weibo.com/ajax/statuses/buildComments" 48 | headers = request_headers.comment2_buildComments_headers 49 | 50 | params = { 51 | "flow": "0", # 0 表示按热度, 1 表示按时间 52 | "is_reload": "1", 53 | "id": f"{mid}", 54 | "is_show_bulletin": "2", 55 | "is_mix": "1", 56 | "fetch_level": "1", 57 | "count": "20", 58 | "uid": f"{uid}", 59 | "locale": "zh-CN" 60 | } 61 | 62 | if max_id is not None: 63 | params["max_id"] = max_id 64 | else: 65 | params["max_id"] = "0" 66 | 67 | return url, params, headers 68 | 69 | 70 | 71 | def get_comments_l1_response(uid: str, mid : str, *, client: httpx.Client, max_id: Optional[str]=None) -> httpx.Response: 72 | """获取微博主体的一级评论 73 | 74 | Args: 75 | uid (str): 微博的uid 76 | mid (str): 微博的mid 77 | client (httpx.Client): 需要的client 78 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None. 79 | 80 | Returns: 81 | httpx.Response: 评论的响应 82 | """ 83 | url, params, headers = build_comments_l1_params(uid, mid, max_id=max_id) 84 | response = client.get(url, params=params, headers=headers) 85 | return response 86 | 87 | 88 | async def get_comments_l1_response_asyncio(uid: str, mid : str, *, client: httpx.AsyncClient, max_id: Optional[str]=None) -> httpx.Response: 89 | """获取微博主体的一级评论(异步) 90 | 91 | Args: 92 | uid (str): 微博的uid 93 | mid (str): 微博的mid 94 | client (httpx.AsyncClient): 需要的client 95 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None. 96 | 97 | Returns: 98 | httpx.Response: 评论的响应 99 | """ 100 | url, params, headers = build_comments_l1_params(uid, mid, max_id=max_id) 101 | response = await client.get(url, params=params, headers=headers) 102 | return response 103 | 104 | 105 | def get_comments_l2_response(uid: str, mid : str, *, client: httpx.Client, max_id: Optional[str]=None): 106 | """获取微博主体的二级评论 107 | 108 | Args: 109 | uid (str): 微博的uid 110 | mid (str): 微博的mid 111 | client (httpx.Client): 需要的client 112 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None. 113 | 114 | Returns: 115 | httpx.Response: 评论的响应 116 | """ 117 | url, params, headers = build_comments_l2_params(uid, mid, max_id=max_id) 118 | response = client.get(url, params=params, headers=headers) 119 | return response 120 | 121 | 122 | async def get_comments_l2_response_asyncio(uid: str, mid : str, *, client: httpx.AsyncClient, max_id: Optional[str]=None): 123 | """获取微博主体的二级评论(异步) 124 | 125 | Args: 126 | uid (str): 微博的uid 127 | mid (str): 微博的mid 128 | client (httpx.AsyncClient): 需要的client 129 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None. 
130 | 131 | Returns: 132 | httpx.Response: 评论的响应 133 | """ 134 | url, params, headers = build_comments_l2_params(uid, mid, max_id=max_id) 135 | response = await client.get(url, params=params, headers=headers) 136 | return response -------------------------------------------------------------------------------- /WeiBoCrawler/request/get_cookies.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from .util import request_headers 3 | from PIL import Image 4 | from io import BytesIO 5 | import time 6 | from ..util import logging 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | def get_login_signin_response(client:httpx.Client) -> httpx.Response: 12 | """主要是获取 cookies 中的 X-CSRF-TOKEN 字段 13 | 14 | Args: 15 | client (httpx.Client): 会话客户端 16 | 17 | Returns: 18 | httpx.Response: 目的是获取响应的 url 19 | """ 20 | headers = request_headers.login_signin_headers 21 | 22 | url = "https://passport.weibo.com/sso/signin" 23 | params = { 24 | "entry": "miniblog", 25 | "source": "miniblog", 26 | "disp": "popup", 27 | "url": "https://weibo.com/newlogin?tabtype=weibo&gid=102803&openLoginLayer=0&url=https%3A%2F%2Fweibo.com%2F", 28 | "from": "weibopro" 29 | } 30 | 31 | response = client.get(url, params=params, headers=headers) 32 | response.raise_for_status() 33 | return response 34 | 35 | 36 | def get_login_qrcode_response(client:httpx.Client, login_signin_url:str) -> httpx.Response: 37 | """主要是获取二维码的 id 以及 二维码的 url 路径 38 | 39 | Args: 40 | client (httpx.Client): 会话客户端 41 | login_signin_url (str): signin 请求的url 主要是需要设置 referer 字段 42 | 43 | Returns: 44 | httpx.Response: 主要是获取 qrid 字段 和 二维码的 url 45 | """ 46 | headers = request_headers.login_qrcode_headers 47 | headers["referer"] = login_signin_url 48 | headers["x-csrf-token"] = client.cookies.get("X-CSRF-TOKEN") 49 | 50 | url = "https://passport.weibo.com/sso/v2/qrcode/image" 51 | params = { 52 | "entry": "miniblog", 53 | "size": "180" 54 | } 55 | response = client.get(url, params=params, headers=headers) 56 | response.raise_for_status() 57 | return response 58 | 59 | 60 | def get_login_check_response(client:httpx.Client, login_signin_url:str, qrid:str) -> httpx.Response: 61 | """检查二维码状态:未使用,已扫描未确认,已确认,已过期 62 | 63 | Args: 64 | client (httpx.Client): 会话客户端 65 | login_signin_url (str): signin 请求的url 主要是需要设置 referer 字段 66 | qrid (str): 二维码的 id 67 | 68 | Returns: 69 | httpx.Response: 检查二维码状态 70 | """ 71 | headers = request_headers.login_final_headers 72 | headers["referer"] = login_signin_url 73 | headers["x-csrf-token"] = client.cookies["X-CSRF-TOKEN"] 74 | 75 | url = "https://passport.weibo.com/sso/v2/qrcode/check" 76 | params = { 77 | "entry": "miniblog", 78 | "source": "miniblog", 79 | "url": "https://weibo.com/newlogin?tabtype=weibo&gid=102803&openLoginLayer=0&url=https%3A%2F%2Fweibo.com%2F", 80 | "qrid": qrid, 81 | "disp": "popup" 82 | } 83 | response = client.get(url, headers=headers, params=params) 84 | response.raise_for_status() 85 | return response 86 | 87 | 88 | 89 | def get_login_final_response(client:httpx.Client, login_url:str) -> httpx.Response: 90 | """最终的登录请求 91 | 92 | Args: 93 | client (httpx.Client): 会话客户端 94 | login_url (str): 最终的登入 url 95 | 96 | 1. 在这里由于是重定向请求,所有在 client 中最好设置 follow_redirects=True. 97 | 2. 最终的 response 不知道为啥一直是 403 请求,但是 cookies 是成功获取得到了的. 
98 | 99 | Returns: 100 | httpx.Response: 没啥用 101 | """ 102 | response = client.get(login_url) 103 | # response.raise_for_status() 104 | return response 105 | 106 | 107 | def download_image(url:str, show:bool=False): 108 | """下载并打开图片用来扫描 109 | 110 | Args: 111 | url (str): 二维码图片地址 112 | show (bool, optional): 是否显示图片. Defaults to False. 113 | """ 114 | try: 115 | response = httpx.get(url) 116 | response.raise_for_status() 117 | image_content = BytesIO(response.content) 118 | image = Image.open(image_content) 119 | 120 | if show: 121 | image.show() 122 | 123 | return image 124 | 125 | except httpx.RequestError as e: 126 | print(f"请求发生错误: {e}") 127 | except Exception as e: 128 | print(f"发生其他错误: {e}") 129 | 130 | 131 | def get_qr_status(client:httpx.Client, login_signin_url:str, qrid:str) -> dict | None: 132 | """获取二维码的状态 133 | 134 | Args: 135 | client (httpx.Client): 会话客户端 136 | login_signin_url (str): 登入验证 url 137 | qrid (str): qr 的 id 138 | 139 | Returns: 140 | dict | None: 返回 cookies 或者 None 141 | """ 142 | while True: 143 | login_check_response = get_login_check_response(client, login_signin_url=login_signin_url, qrid=qrid) 144 | login_check_response.encoding = "utf-8" 145 | login_check_json_data = login_check_response.json() 146 | 147 | retcode = login_check_json_data.get("retcode") 148 | if retcode in [20000000, 50114001, 50114002]: 149 | if login_check_json_data.get("retcode") == 20000000: 150 | login_url = login_check_json_data.get("data").get("url") 151 | # 这里的 response 是一个重定向的响应, 其最终结果状态是 403 但是好像在重定向的过程中会设置一些 cookie 信息 152 | get_login_final_response(client, login_url=login_url) 153 | return dict(client.cookies) 154 | else: 155 | logging.info(f"二维码状态码: {login_check_json_data.get('retcode')}, 状态信息: {login_check_json_data.get('msg')}") 156 | else: 157 | return None 158 | 159 | time.sleep(1) 160 | 161 | 162 | 163 | def get_qr_Info() -> list[Image.Image, httpx.Client, str, str]: 164 | """最终获取 cookies 的函数 165 | 166 | Returns: 167 | list[Image.Image, httpx.Client, str, str]: 返回图片,会话客户端,登入验证 url,qr 的 id 168 | """ 169 | client = httpx.Client(follow_redirects=True) 170 | 171 | login_signin_response = get_login_signin_response(client) 172 | login_signin_url = str(login_signin_response.url) 173 | 174 | login_qrcode_response = get_login_qrcode_response(client, login_signin_url=login_signin_url) 175 | qrcode_json_data = login_qrcode_response.json().get("data") 176 | 177 | qrid = qrcode_json_data.get("qrid") 178 | image_path = qrcode_json_data.get("image") 179 | image = download_image(image_path) 180 | return image, client, login_signin_url, qrid -------------------------------------------------------------------------------- /WeiBoCrawler/request/get_list_request.py: -------------------------------------------------------------------------------- 1 | import httpx 2 | from copy import deepcopy 3 | from typing import Literal, Optional 4 | from datetime import datetime 5 | from .util import request_headers 6 | 7 | 8 | def build_list_params(search_for: str, page_index: int, *, kind : Literal["综合", "实时", "高级"] = "综合", 9 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end: Optional[datetime]=None) -> tuple: 10 | """构建列表页参数 11 | 12 | Args: 13 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #. 14 | page_index (int): 页码. 15 | kind (Literal[, optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合". 16 | advanced_kind (Literal[, optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合". 17 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. 
Defaults to None. 18 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None. 19 | 20 | Returns: 21 | httpx.Response: 返回列表页响应 22 | """ 23 | url_with_params_dic = { 24 | "综合":{ 25 | "url" : "https://s.weibo.com/weibo", 26 | "params": {"q": search_for, "Refer": "weibo_weibo", "page": page_index}, 27 | }, 28 | "实时":{ 29 | "url" : "https://s.weibo.com/realtime", 30 | "params": {"q": search_for, "rd": "realtime", "tw": "realtime", "Refer": "weibo_realtime", "page": page_index}, 31 | }, 32 | "高级":{ 33 | "url" : "https://s.weibo.com/weibo", 34 | "params": {"q": search_for, "suball": "1", "Refer": "g", "page": page_index}, 35 | }, 36 | } 37 | 38 | url_with_params = url_with_params_dic[kind] 39 | if kind == "高级": 40 | if advanced_kind == "综合": 41 | url_with_params["params"]["typeall"] = "1" 42 | if advanced_kind == "热度": 43 | url_with_params["params"]["xsort"] = "hot" 44 | if advanced_kind == "原创": 45 | url_with_params["params"]["scope"] = "ori" 46 | 47 | time_start = time_start.strftime("%Y-%m-%d-%H") if time_start else "" 48 | time_end = time_end.strftime("%Y-%m-%d-%H") if time_end else "" 49 | 50 | url_with_params["params"]["timescope"] = f"custom:{time_start}:{time_end}" 51 | 52 | headers = request_headers.body_headers 53 | 54 | if url_with_params["params"]["page"] > 1: 55 | referer_url_with_params = deepcopy(url_with_params) 56 | referer_url_with_params["params"]["page"] = url_with_params["params"]["page"] - 1 57 | headers["referer"] = str(httpx.URL(url_with_params["url"], params=referer_url_with_params["params"])) 58 | 59 | url = httpx.URL(url=url_with_params["url"], params=url_with_params["params"]) 60 | return url, headers 61 | 62 | 63 | def get_list_response(search_for: str, page_index: int, *, client: httpx.Client, kind : Literal["综合", "实时", "高级"] = "综合", 64 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end: Optional[datetime]=None) -> httpx.Response: 65 | """获取列表页响应 66 | 67 | Args: 68 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #. 69 | page_index (int): 页码. 70 | client (httpx.Client): 客户端. 71 | kind (Literal[, optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合". 72 | advanced_kind (Literal[, optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合". 73 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to None. 74 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None. 75 | 76 | Returns: 77 | httpx.Response: 返回列表页响应 78 | """ 79 | url, headers = build_list_params(search_for, page_index, kind=kind, advanced_kind=advanced_kind, time_start=time_start, time_end=time_end) 80 | response = client.get(url, headers=headers) 81 | return response 82 | 83 | 84 | async def get_list_response_asyncio(search_for: str, page_index: int, *, client: httpx.AsyncClient, kind : Literal["综合", "实时", "高级"] = "综合", 85 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end: Optional[datetime] = None) -> httpx.Response: 86 | """获取列表页响应(异步) 87 | 88 | Args: 89 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #. 90 | page_index (int): 页码. 91 | client (httpx.AsyncClient): 异步客户端. 92 | kind (Literal[, optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合". 93 | advanced_kind (Literal[, optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合". 94 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to None. 95 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None. 
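Example (illustrative only, not part of the original docstring; assumes ``client`` is an httpx.AsyncClient that already carries valid Weibo cookies):

        response = await get_list_response_asyncio(
            "#某话题#", 1, client=client, kind="高级", advanced_kind="原创",
            time_start=datetime(2025, 1, 1), time_end=datetime(2025, 1, 2))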
96 | 97 | Returns: 98 | httpx.Response: 返回列表页响应 99 | """ 100 | url, headers = build_list_params(search_for, page_index, kind=kind, advanced_kind=advanced_kind, time_start=time_start, time_end=time_end) 101 | response = await client.get(url, headers=headers) 102 | return response 103 | -------------------------------------------------------------------------------- /WeiBoCrawler/request/get_rum_request(unuse).py: -------------------------------------------------------------------------------- 1 | # rum 不需要构建 2 | 3 | import httpx 4 | import json 5 | 6 | def get_rum_level_one_response(buildComments_url): 7 | 8 | headers = { 9 | "accept": "*/*", 10 | "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", 11 | "content-type": "multipart/form-data; boundary=----WebKitFormBoundaryvnSjtxxxjv6x1pFT", 12 | "origin": "https://weibo.com", 13 | "priority": "u=1, i", 14 | "referer": "https://weibo.com/2803301701/PblVL5Bg5", 15 | "sec-ch-ua": "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"", 16 | "sec-ch-ua-mobile": "?0", 17 | "sec-ch-ua-platform": "\"Windows\"", 18 | "sec-fetch-dest": "empty", 19 | "sec-fetch-mode": "cors", 20 | "sec-fetch-site": "same-origin", 21 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0", 22 | "x-xsrf-token": "seBDSEeh70cZTEWGWWkFmxxG" 23 | } 24 | 25 | cookies = { 26 | "SCF": "AnQhEA08TUG9ln2r7R0-cHMvj3KTSZb-85kfIcXTHqooYhjTcn-UkaGS5792LpSqqbJApBlXrIheowZ1k4aYR1Q.", 27 | "SUB": "_2A25Kkj8dDeRhGeFJ4lIT9CzNyj6IHXVp7j7VrDV8PUNbmtAYLVT5kW9NfsmQ4UzJuUOhUQbYBkUvv3HADVVzl9Ig", 28 | "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5Oj.LmOvr7_7fS8d6lYxiZ5JpX5KzhUgL.FoMN1K5EShzpeKz2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNS0.7eoBEeK2E", 29 | "ALF": "02_1740495949", 30 | "SINAGLOBAL": "970667482772.5692.1737903974414", 31 | "ULV": "1737903974460:1:1:1:970667482772.5692.1737903974414:", 32 | "XSRF-TOKEN": "seBDSEeh70cZTEWGWWkFmxxG", 33 | "WBPSESS": "2bPq4LTfaY-EnTnt8h5hWX9KGoz50scMNqd4lpDCT8IiCLnpv2C9Z_Kk8JVbYkIyBQ0eFNYccRFpnV_A6ntYbwjqG_PAbMAldrAdPPf_XvQiQHrkm_9GFJunwjaIeUwiupJQv3fNpU5K1Xq-CCdaFg==" 34 | } 35 | 36 | entry = { 37 | "name": "https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5127059131334865&is_show_bulletin=2&is_mix=0&max_id=139293859600042&count=20&uid=2803301701&fetch_level=0&locale=zh-CN", 38 | "entryType": "resource", 39 | "startTime": "327212.7000000002", 40 | "duration": "493.20000000018626", 41 | "initiatorType": "xmlhttprequest", 42 | "deliveryType": "", 43 | "nextHopProtocol": "h2", 44 | "renderBlockingStatus": "non-blocking", 45 | "workerStart": 0, 46 | "redirectStart": 0, 47 | "redirectEnd": 0, 48 | "fetchStart": "327212.7000000002", 49 | "domainLookupStart": "327212.7000000002", 50 | "domainLookupEnd": "327212.7000000002", 51 | "connectStart": "327212.7000000002", 52 | "secureConnectionStart": "327212.7000000002", 53 | "connectEnd": "327212.7000000002", 54 | "requestStart": "327226.7000000002", 55 | "responseStart": "327702.6000000001", 56 | "firstInterimResponseStart": 0, 57 | "responseEnd": "327705.9000000004", 58 | "transferSize": 11971, 59 | "encodedBodySize": 11671, 60 | "decodedBodySize": 72237, 61 | "responseStatus": 200, 62 | "serverTiming": [], 63 | "dns": 0, 64 | "tcp": 0, 65 | "ttfb": "475.89999999990687", 66 | "pathname": "https://weibo.com/ajax/statuses/buildComments", 67 | "speed": 0 68 | } 69 | 70 | files = { 71 | "entry": (None, json.dumps(entry)), 72 | "request_id": (None, ""), 73 | } 74 | 75 | url = 
"https://weibo.com/ajax/log/rum" 76 | response = httpx.post(url, headers=headers, cookies=cookies, files=files) 77 | 78 | print(response.headers) 79 | 80 | url = "https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=139293862124853&is_show_bulletin=2&is_mix=0&max_id=139568722411765&count=20&uid=2803301701&fetch_level=0&locale=zh-CN" 81 | get_rum_level_one_response(url) -------------------------------------------------------------------------------- /WeiBoCrawler/request/request.toml: -------------------------------------------------------------------------------- 1 | # 需要配置 referer 2 | [list_headers] 3 | accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" 4 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 5 | priority = "u=0, i" 6 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 7 | sec-ch-ua-mobile = "?0" 8 | sec-ch-ua-platform = "\"Windows\"" 9 | sec-fetch-dest = "document" 10 | sec-fetch-mode = "navigate" 11 | sec-fetch-site = "none" 12 | sec-fetch-user = "?1" 13 | upgrade-insecure-requests = "1" 14 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 15 | 16 | # 需要配置 referer 17 | [body_headers] 18 | accept = "application/json, text/plain, */*" 19 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 20 | client-version = "v2.47.25" 21 | priority = "u=1, i" 22 | referer = "https://weibo.com/1644114654/OiZre8dir?refer_flag=1001030103_" 23 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 24 | sec-ch-ua-mobile = "?0" 25 | sec-ch-ua-platform = "\"Windows\"" 26 | sec-fetch-dest = "empty" 27 | sec-fetch-mode = "cors" 28 | sec-fetch-site = "same-origin" 29 | server-version = "v2025.01.23.1" 30 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 31 | x-requested-with = "XMLHttpRequest" 32 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG" 33 | 34 | 35 | # 需要配置 referer x-xsrf-token 36 | [comment1_buildComments_headers] 37 | accept = "application/json, text/plain, */*" 38 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 39 | client-version = "v2.47.25" 40 | priority = "u=1, i" 41 | referer = "https://weibo.com/1644114654/OiZre8dir" 42 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 43 | sec-ch-ua-mobile = "?0" 44 | sec-ch-ua-platform = "\"Windows\"" 45 | sec-fetch-dest = "empty" 46 | sec-fetch-mode = "cors" 47 | sec-fetch-site = "same-origin" 48 | server-version = "v2025.01.23.1" 49 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 50 | x-requested-with = "XMLHttpRequest" 51 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG" 52 | 53 | # 需要配置 referer 54 | [comment2_buildComments_headers] 55 | accept = "application/json, text/plain, */*" 56 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 57 | client-version = "v2.47.25" 58 | priority = "u=1, i" 59 | referer = "https://weibo.com/1644114654/OiZre8dir" 60 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 61 | sec-ch-ua-mobile = "?0" 62 | sec-ch-ua-platform = "\"Windows\"" 63 | sec-fetch-dest = "empty" 64 | sec-fetch-mode = "cors" 65 
| sec-fetch-site = "same-origin" 66 | server-version = "v2025.01.23.1" 67 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 68 | x-requested-with = "XMLHttpRequest" 69 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG" 70 | 71 | 72 | # 需要配置 referer x-xsrf-token 73 | [comment1_rum_headers] 74 | accept = "*/*" 75 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 76 | content-type = "multipart/form-data; boundary=----WebKitFormBoundaryP8RPAfGDm1mdduKE" 77 | origin = "https://weibo.com" 78 | priority = "u=1, i" 79 | referer = "https://weibo.com/1644114654/OiZre8dir" 80 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 81 | sec-ch-ua-mobile = "?0" 82 | sec-ch-ua-platform = "\"Windows\"" 83 | sec-fetch-dest = "empty" 84 | sec-fetch-mode = "cors" 85 | sec-fetch-site = "same-origin" 86 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 87 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG" 88 | 89 | 90 | # 需要配置 referer x-xsrf-token 91 | [comment2_rum_headers] 92 | accept = "*/*" 93 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 94 | content-type = "multipart/form-data; boundary=----WebKitFormBoundary0CRQdyFBn3rj8Xh2" 95 | origin = "https://weibo.com" 96 | priority = "u=1, i" 97 | referer = "https://weibo.com/1644114654/OiZre8dir" 98 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 99 | sec-ch-ua-mobile = "?0" 100 | sec-ch-ua-platform = "\"Windows\"" 101 | sec-fetch-dest = "empty" 102 | sec-fetch-mode = "cors" 103 | sec-fetch-site = "same-origin" 104 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 105 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG" 106 | 107 | 108 | # 需要配置 referer x-xsrf-token 109 | [login_signin_headers] 110 | accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7" 111 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 112 | cache-control = "max-age=0" 113 | priority = "u=0, i" 114 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 115 | sec-ch-ua-mobile = "?0" 116 | sec-ch-ua-platform = "\"Windows\"" 117 | sec-fetch-dest = "document" 118 | sec-fetch-mode = "navigate" 119 | sec-fetch-site = "same-origin" 120 | sec-fetch-user = "?1" 121 | upgrade-insecure-requests = "1" 122 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 123 | 124 | 125 | # 需要配置 referer x-xsrf-token 126 | [login_qrcode_headers] 127 | accept = "application/json, text/plain, */*" 128 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 129 | priority = "u=1, i" 130 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 131 | sec-ch-ua-mobile = "?0" 132 | sec-ch-ua-platform = "\"Windows\"" 133 | sec-fetch-dest = "empty" 134 | sec-fetch-mode = "cors" 135 | sec-fetch-site = "same-origin" 136 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 137 | x-requested-with = "XMLHttpRequest" 138 | 139 | 140 | # 需要配置 referer x-xsrf-token 141 | 
[login_final_headers] 142 | accept = "application/json, text/plain, */*" 143 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6" 144 | priority = "u=1, i" 145 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"" 146 | sec-ch-ua-mobile = "?0" 147 | sec-ch-ua-platform = "\"Windows\"" 148 | sec-fetch-dest = "empty" 149 | sec-fetch-mode = "cors" 150 | sec-fetch-site = "same-origin" 151 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0" 152 | x-requested-with = "XMLHttpRequest" -------------------------------------------------------------------------------- /WeiBoCrawler/request/util.py: -------------------------------------------------------------------------------- 1 | import toml 2 | from pathlib import Path 3 | from ..util import RequestHeaders 4 | 5 | module_path = Path(__file__).parent 6 | 7 | request_headers = RequestHeaders.model_validate(toml.load(module_path / "./request.toml")) 8 | 9 | __all__ = [ request_headers ] -------------------------------------------------------------------------------- /WeiBoCrawler/util/__init__.py: -------------------------------------------------------------------------------- 1 | from .path import config_path 2 | from .log import logging 3 | from .database import database_config 4 | from .cookie import cookies_config 5 | from .decorator import log_function_params, retry_timeout_decorator, retry_timeout_decorator_asyncio, custom_validate_call 6 | from .custom import CustomProgress, RequestHeaders 7 | from .process import process_time_str, process_base_document, process_base_documents 8 | 9 | __all__ = [ 10 | "logging", 11 | 12 | "config_path", 13 | 14 | "database_config", 15 | "cookies_config", 16 | 17 | "log_function_params", 18 | "retry_timeout_decorator", 19 | "retry_timeout_decorator_asyncio", 20 | "custom_validate_call", 21 | 22 | "CustomProgress", 23 | "RequestHeaders", 24 | 25 | "process_time_str", 26 | "process_base_document", 27 | "process_base_documents", 28 | ] -------------------------------------------------------------------------------- /WeiBoCrawler/util/cookie.py: -------------------------------------------------------------------------------- 1 | import toml 2 | from pydantic import BaseModel 3 | from .path import config_path 4 | 5 | class CookiesConfig(BaseModel): 6 | """这个类主要用来保存 Cookies 7 | 8 | Attributes: 9 | cookies (dict): 微博的cookies 10 | cookies_info (datetime): 更新时间 11 | """ 12 | cookies: dict 13 | cookies_info: dict 14 | 15 | cookies_config = CookiesConfig.model_validate(toml.load(config_path)) 16 | -------------------------------------------------------------------------------- /WeiBoCrawler/util/custom.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from rich.progress import ( 3 | BarColumn, 4 | MofNCompleteColumn, 5 | Progress, 6 | TextColumn, 7 | TimeElapsedColumn, 8 | ) 9 | 10 | 11 | 12 | class CustomProgress: 13 | """自定义进度条 14 | 15 | Attributes: 16 | progress (Progress): 进度条 17 | """ 18 | def __init__(self): 19 | self.progress = Progress( 20 | BarColumn(), 21 | MofNCompleteColumn(), 22 | TimeElapsedColumn(), 23 | TextColumn("[progress.description]{task.description}", justify="left"), 24 | ) 25 | 26 | def __enter__(self): 27 | self.progress.start() 28 | return self.progress 29 | 30 | def __exit__(self, exc_type, exc_val, exc_tb): 31 | self.progress.stop() 32 | 33 | 34 | 35 | class 
RequestHeaders(BaseModel): 36 | """这个类主要用来保存一些请求参数的东西 37 | 38 | Attributes: 39 | body_headers (dict): 微博主页的请求头 40 | comment1_buildComments_headers (dict): 评论区buildComments的请求头 41 | comment1_rum_headers (dict): 评论区rum的请求头 42 | .... 43 | """ 44 | list_headers: dict 45 | body_headers: dict 46 | comment1_buildComments_headers: dict 47 | comment1_rum_headers: dict 48 | comment2_buildComments_headers: dict 49 | comment2_rum_headers: dict 50 | login_signin_headers:dict 51 | login_qrcode_headers:dict 52 | login_final_headers:dict -------------------------------------------------------------------------------- /WeiBoCrawler/util/database.py: -------------------------------------------------------------------------------- 1 | import toml 2 | from .path import module_path, config_path, Path 3 | from pydantic import BaseModel, field_validator 4 | 5 | 6 | class DatabaseConfig(BaseModel): 7 | path: str 8 | 9 | @field_validator('path') 10 | def modify_module_path(cls, value): 11 | if Path(value).is_absolute(): 12 | return str(value) 13 | else: 14 | return str(module_path / value) 15 | 16 | 17 | database_config = DatabaseConfig.model_validate(toml.load(config_path)["database"]) 18 | -------------------------------------------------------------------------------- /WeiBoCrawler/util/decorator.py: -------------------------------------------------------------------------------- 1 | from .log import logging 2 | from typing import Callable 3 | import httpx 4 | from pydantic import validate_call 5 | 6 | def custom_validate_call(func: Callable) -> Callable: 7 | return validate_call(func, config={"arbitrary_types_allowed": True}, validate_return=True) 8 | 9 | def log_function_params(logger: logging.Logger=logging): 10 | """记录函数的参数和返回值 11 | 12 | Args: 13 | func (Callable): 需要装饰的函数 14 | 15 | Returns: 16 | Callable: 装饰后的函数 17 | """ 18 | def log_function_params_(func:Callable) -> Callable: 19 | def wrapper(*args, **kwargs): 20 | # 记录函数名和参数 21 | args_repr = [repr(a) for a in args] 22 | kwargs_repr = [f"{k}={v!r}" for k, v in kwargs.items()] 23 | signature = ", ".join(args_repr + kwargs_repr) 24 | logger.info(f"Calling Function {func.__name__}({signature})") 25 | 26 | # 调用原函数 27 | result = func(*args, **kwargs) 28 | 29 | # 记录返回值 30 | logger.info(f"Function {func.__name__} returned {result!r}") 31 | return result 32 | return wrapper 33 | return log_function_params_ 34 | 35 | 36 | def retry_timeout_decorator(func: Callable) -> Callable: 37 | """超时重试装饰器 38 | 39 | Args: 40 | retry_times (int): 重试次数. Defaults to 3. 41 | 42 | Returns: 43 | Callable: 装饰后的函数 44 | """ 45 | retry_times = 3 46 | def wrapper(*args, **kwargs): 47 | attempts = 0 48 | while attempts < retry_times: 49 | try: 50 | return func(*args, **kwargs) 51 | except httpx.TimeoutException as e: 52 | attempts += 1 53 | if attempts < retry_times: 54 | logging.warning(f"请求超时,正在进行第 {attempts} 次重试...") 55 | else: 56 | logging.error(f"请求超时,重试次数已达到最大值,请检查网络连接或重试次数!错误原因{e}") 57 | return wrapper 58 | 59 | 60 | def retry_timeout_decorator_asyncio(func: Callable) -> Callable: 61 | """超时重试装饰器(异步) 62 | 63 | Args: 64 | retry_times (int): 重试次数. Defaults to 3. 
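Example (illustrative usage added for clarity, not part of the original source):

        @retry_timeout_decorator_asyncio
        async def fetch_page(client: httpx.AsyncClient, url: str) -> httpx.Response:
            return await client.get(url, timeout=5)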
65 | 66 | Returns: 67 | Callable: 装饰后的函数 68 | """ 69 | retry_times = 3 70 | async def wrapper(*args, **kwargs): # 将 wrapper 改为异步函数 71 | attempts = 0 72 | while attempts < retry_times: 73 | try: 74 | return await func(*args, **kwargs) # 调用异步函数并使用 await 75 | except httpx.TimeoutException as e: 76 | attempts += 1 77 | if attempts < retry_times: 78 | logging.warning(f"请求超时,正在进行第 {attempts} 次重试...") 79 | else: 80 | logging.error(f"请求超时,重试次数已达到最大值,请检查网络连接或重试次数!错误原因{e}") 81 | return wrapper -------------------------------------------------------------------------------- /WeiBoCrawler/util/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .path import module_path 3 | 4 | 5 | # 配置日志 6 | logging.basicConfig( 7 | filename=module_path / "./app.log", 8 | level=logging.INFO, 9 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 10 | encoding="utf-8", 11 | ) -------------------------------------------------------------------------------- /WeiBoCrawler/util/path.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | module_path = Path(__file__).parent.parent 5 | 6 | config_path = module_path / "./config.toml" -------------------------------------------------------------------------------- /WeiBoCrawler/util/process.py: -------------------------------------------------------------------------------- 1 | import re 2 | from datetime import datetime, timedelta 3 | import pandas as pd 4 | 5 | 6 | def process_time_str(time_str:str) -> datetime: 7 | """这段代码是用来解析微博的时间字段的 8 | 1. 处理 年、月、日、时、分 9 | 2. 处理 分钟前,小时前,这里不处理秒前 10 | 11 | Args: 12 | time_str (str): 微博时间字段 13 | 14 | Returns: 15 | datatime: 返回时间字段 16 | """ 17 | datetime_now = datetime.now() 18 | 19 | if "年" in time_str: 20 | year = re.search(r"(\d{4})年", time_str).group(1) 21 | else: 22 | year = datetime_now.year 23 | if "月" in time_str: 24 | month = re.search(r"(\d{1,2})月", time_str).group(1) 25 | else: 26 | month = datetime_now.month 27 | if "日" in time_str: 28 | day = re.search(r"(\d{1,2})日", time_str).group(1) 29 | else: 30 | day = datetime_now.day 31 | if ":" in time_str: 32 | hour = re.search(r"(\d{1,2}):", time_str).group(1) 33 | minute = re.search(r":(\d{1,2})", time_str).group(1) 34 | else: 35 | hour = datetime_now.hour 36 | minute = datetime_now.minute 37 | 38 | datetime_now = datetime(int(year), int(month), int(day), int(hour), int(minute)) 39 | 40 | if "分钟前" in time_str: 41 | minute_before = re.search(r"(\d+)分钟前", time_str).group(1) 42 | datetime_now = datetime_now - timedelta(minutes=int(minute_before)) 43 | if "小时前" in time_str: 44 | hour_before = re.search(r"(\d+)小时前", time_str).group(1) 45 | datetime_now = datetime_now - timedelta(hours=int(hour_before)) 46 | 47 | return datetime_now 48 | 49 | 50 | 51 | def drop_documents_duplicates(documents: list[dict]) -> None: 52 | """dict 列表去重 53 | 这里暂时使用最简单的列表去重法, 后续可以考虑使用 hash 去重等方法优化.. 54 | 55 | Args: 56 | list[dict]: 去重后的表 57 | """ 58 | unique_document = [] 59 | for document in documents: 60 | if document not in unique_document: 61 | unique_document.append(document) 62 | 63 | return unique_document 64 | 65 | 66 | def process_base_document(document: dict, transform_dict: dict) -> dict: 67 | """将 document 处理成字典的形式 68 | 69 | transform_dict = { 70 | "转发数量": "retweet_num", 71 | "评论数量": "comment_num", 72 | "点赞数量": "star_num 73 | ... 
74 | } 75 | 76 | Args: 77 | document (dict): 文档 78 | transform_dict (dict): 转换字典, key 是转化后的字段, value 是原始字段 79 | 80 | Returns: 81 | dict: 处理后的字典 82 | """ 83 | item = {} 84 | 85 | for key, value in transform_dict.items(): 86 | if isinstance(value, str): 87 | final_value = document.get(value, None) 88 | 89 | elif isinstance(value, list): 90 | final_value = document 91 | for v in value: 92 | if final_value is None: 93 | break 94 | final_value = final_value.get(v, None) 95 | 96 | item[key] = final_value 97 | return item 98 | 99 | 100 | def process_base_documents(documents: list[dict], transform_dict: dict) -> pd.DataFrame: 101 | """将 documents 处理成 dataframe 的形式 102 | 103 | transform_dict = { 104 | "转发数量": "retweet_num", 105 | "评论数量": "comment_num", 106 | "点赞数量": "star_num", 107 | ... 108 | } 109 | 110 | Args: 111 | documents (list[dict]): 文档列表 112 | transform_dict (dict): 转换字典, key 是转化后的字段, value 是原始字段 113 | 114 | Returns: 115 | pd.DataFrame: (去重)处理后得到的表格 116 | """ 117 | items = [process_base_document(document, transform_dict) for document in documents] 118 | df = pd.DataFrame(items) 119 | df.drop_duplicates(inplace=True) 120 | return df -------------------------------------------------------------------------------- /WeiBoCrawler/util/show_qrcode.py: -------------------------------------------------------------------------------- 1 | from pyzbar.pyzbar import decode 2 | from PIL import Image 3 | import qrcode 4 | 5 | def show_qrcode(img_path:str): 6 | """在控制台显示二维码 7 | 8 | Args: 9 | img_path (str): 二维码路径 10 | """ 11 | img = Image.open('gen.png') 12 | decoded_data = decode(img) 13 | data = decoded_data[0].data.decode('utf-8') 14 | qr = qrcode.QRCode() 15 | qr.add_data(data) 16 | qr.make() 17 | qr.print_ascii() -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "weibocrawler" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.10" 7 | dependencies = [ 8 | "aiosqlite>=0.20.0", 9 | "httpx>=0.28.1", 10 | "pandas>=2.0.3", 11 | "parsel>=1.9.1", 12 | "pydantic>=2.10.6", 13 | "pyzbar>=0.1.9", 14 | "qrcode>=8.0", 15 | "sqlalchemy>=2.0.37", 16 | "streamlit>=1.41.1", 17 | "toml>=0.10.2", 18 | ] 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # 这个文件是使用下列命名自动生成的: 3 | # uv pip compile pyproject.toml -o requirements.txt 4 | aiosqlite==0.20.0 5 | # via weibocrawler (pyproject.toml) 6 | altair==5.5.0 7 | # via streamlit 8 | annotated-types==0.7.0 9 | # via pydantic 10 | anyio==4.8.0 11 | # via httpx 12 | attrs==25.1.0 13 | # via 14 | # jsonschema 15 | # referencing 16 | blinker==1.9.0 17 | # via streamlit 18 | cachetools==5.5.1 19 | # via streamlit 20 | certifi==2024.12.14 21 | # via 22 | # httpcore 23 | # httpx 24 | # requests 25 | charset-normalizer==3.4.1 26 | # via requests 27 | click==8.1.8 28 | # via streamlit 29 | colorama==0.4.6 30 | # via 31 | # click 32 | # qrcode 33 | cssselect==1.2.0 34 | # via parsel 35 | exceptiongroup==1.2.2 36 | # via anyio 37 | gitdb==4.0.12 38 | # via gitpython 39 | gitpython==3.1.44 40 | # via streamlit 41 | greenlet==3.1.1 42 | # via sqlalchemy 43 | h11==0.14.0 44 | # via httpcore 45 | httpcore==1.0.7 46 | # via httpx 47 | httpx==0.28.1 48 | # via 
49 | idna==3.10
50 |     # via
51 |     #   anyio
52 |     #   httpx
53 |     #   requests
54 | jinja2==3.1.5
55 |     # via
56 |     #   altair
57 |     #   pydeck
58 | jmespath==1.0.1
59 |     # via parsel
60 | jsonschema==4.23.0
61 |     # via altair
62 | jsonschema-specifications==2024.10.1
63 |     # via jsonschema
64 | lxml==5.3.0
65 |     # via parsel
66 | markdown-it-py==3.0.0
67 |     # via rich
68 | markupsafe==3.0.2
69 |     # via jinja2
70 | mdurl==0.1.2
71 |     # via markdown-it-py
72 | narwhals==1.24.1
73 |     # via altair
74 | numpy==2.2.2
75 |     # via
76 |     #   pandas
77 |     #   pydeck
78 |     #   streamlit
79 | packaging==24.2
80 |     # via
81 |     #   altair
82 |     #   parsel
83 |     #   streamlit
84 | pandas==2.2.3
85 |     # via
86 |     #   weibocrawler (pyproject.toml)
87 |     #   streamlit
88 | parsel==1.10.0
89 |     # via weibocrawler (pyproject.toml)
90 | pillow==11.1.0
91 |     # via streamlit
92 | protobuf==5.29.3
93 |     # via streamlit
94 | pyarrow==19.0.0
95 |     # via streamlit
96 | pydantic==2.10.6
97 |     # via weibocrawler (pyproject.toml)
98 | pydantic-core==2.27.2
99 |     # via pydantic
100 | pydeck==0.9.1
101 |     # via streamlit
102 | pygments==2.19.1
103 |     # via rich
104 | python-dateutil==2.9.0.post0
105 |     # via pandas
106 | pytz==2024.2
107 |     # via pandas
108 | pyzbar==0.1.9
109 |     # via weibocrawler (pyproject.toml)
110 | qrcode==8.0
111 |     # via weibocrawler (pyproject.toml)
112 | referencing==0.36.2
113 |     # via
114 |     #   jsonschema
115 |     #   jsonschema-specifications
116 | requests==2.32.3
117 |     # via streamlit
118 | rich==13.9.4
119 |     # via streamlit
120 | rpds-py==0.22.3
121 |     # via
122 |     #   jsonschema
123 |     #   referencing
124 | six==1.17.0
125 |     # via python-dateutil
126 | smmap==5.0.2
127 |     # via gitdb
128 | sniffio==1.3.1
129 |     # via anyio
130 | sqlalchemy==2.0.37
131 |     # via weibocrawler (pyproject.toml)
132 | streamlit==1.41.1
133 |     # via weibocrawler (pyproject.toml)
134 | tenacity==9.0.0
135 |     # via streamlit
136 | toml==0.10.2
137 |     # via
138 |     #   weibocrawler (pyproject.toml)
139 |     #   streamlit
140 | tornado==6.4.2
141 |     # via streamlit
142 | typing-extensions==4.12.2
143 |     # via
144 |     #   aiosqlite
145 |     #   altair
146 |     #   anyio
147 |     #   pydantic
148 |     #   pydantic-core
149 |     #   referencing
150 |     #   rich
151 |     #   sqlalchemy
152 |     #   streamlit
153 | tzdata==2025.1
154 |     # via pandas
155 | urllib3==2.3.0
156 |     # via requests
157 | w3lib==2.2.1
158 |     # via parsel
159 | watchdog==6.0.0
160 |     # via streamlit
161 | 
--------------------------------------------------------------------------------
/web/main.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | 
3 | # With an absolute path the pages only load on the first run; editing a page afterwards raises an error.
4 | # A relative path is resolved not against the project, but against the directory in which `streamlit run main.py` is executed.
5 | 
6 | 
7 | st.set_page_config(
8 |     page_title="微博爬虫数据分析",
9 |     page_icon="💻",
10 |     layout="wide",
11 |     initial_sidebar_state="expanded",
12 | )
13 | 
14 | 
15 | pg = st.navigation({
16 |     "Cookie": [
17 |         st.Page("./web_pages/Cookie/Cookie.py", title="Cookie", icon=":material/add_circle:")
18 |     ],
19 |     "下载": [
20 |         st.Page("./web_pages/搜索/列表搜索.py", title="列表搜索", icon=":material/add_circle:"),
21 |         st.Page("./web_pages/搜索/详细页搜索.py", title="详细页搜索", icon=":material/add_circle:"),
22 |         st.Page("./web_pages/搜索/一级评论搜索.py", title="一级评论搜索", icon=":material/add_circle:"),
23 |         st.Page("./web_pages/搜索/二级评论搜索.py", title="二级评论搜索", icon=":material/add_circle:"),
24 |     ],
25 |     "查询": [
26 |         st.Page("./web_pages/查询/查询.py", title="SQL语句查询", icon=":material/add_circle:")
27 |     ],
28 | })
29 | 
30 | pg.run()
--------------------------------------------------------------------------------
/web/util/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append(".")  # make the WeiBoCrawler package importable when the app is launched from the repository root
3 | 
4 | from WeiBoCrawler.database import db, BodyRecord, Comment1Record, Comment2Record
5 | from WeiBoCrawler.pack import get_list_data, get_body_data, get_comment1_data, get_comment2_data
6 | from WeiBoCrawler.parse import process_list_documents, process_comment_documents, process_body_documents
7 | from WeiBoCrawler.request import get_qr_Info, get_qr_status
8 | from WeiBoCrawler.util import config_path, cookies_config
9 | 
10 | 
11 | __all__ = [
12 |     "config_path",
13 |     "cookies_config",
14 | 
15 |     "get_qr_Info",
16 |     "get_qr_status",
17 | 
18 |     "get_list_data",
19 |     "get_body_data",
20 |     "get_comment1_data",
21 |     "get_comment2_data",
22 | 
23 |     "db",
24 |     "BodyRecord",
25 |     "Comment1Record",
26 |     "Comment2Record",
27 | 
28 |     "process_body_documents",
29 |     "process_list_documents",
30 |     "process_comment_documents",
31 | ]
--------------------------------------------------------------------------------
/web/web_pages/Cookie/Cookie.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import toml
3 | from util import cookies_config, config_path, get_qr_Info, get_qr_status
4 | from datetime import datetime
5 | from threading import Thread
6 | from streamlit.runtime.scriptrunner import add_script_run_ctx, get_script_run_ctx
7 | 
8 | if 'Thread' not in st.session_state:
9 |     st.session_state["Thread"] = None
10 | 
11 | def set_cookies(cookies):
12 |     if cookies is not None:
13 |         cookies_config.cookies.update(cookies)
14 |         cookies_config.cookies_info["update_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
15 |         config_data = toml.load(config_path)
16 | 
17 |         config_data["cookies"].update(cookies_config.cookies)
18 |         config_data["cookies_info"].update(cookies_config.cookies_info)
19 | 
20 |         with open(config_path, "w", encoding="utf-8") as f:
21 |             toml.dump(config_data, f)
22 |     else:
23 |         st.error("获取 cookies 失败!!!!!!!")
24 | 
25 | 
26 | def get_cookies(client, login_signin_url, qrid):  # runs in a background thread and blocks until the QR login finishes
27 |     cookies = get_qr_status(client, login_signin_url, qrid)
28 |     if cookies is None:
29 |         st.error("获取 cookies 失败!!!!!!!")
30 |     else:
31 |         set_cookies(cookies)
32 |     client.close()
33 | 
34 | 
35 | @st.dialog("使用微博APP扫码登录")
36 | def scan_code():
37 |     if st.session_state["Thread"] is not None and st.session_state["Thread"].is_alive():
38 |         st.image(image=st.session_state["image"])
39 |     else:
40 |         image, client, login_signin_url, qrid = get_qr_Info()
41 |         st.session_state["image"] = image
42 |         st.image(image=image)
43 | 
44 |         st.session_state["Thread"] = Thread(target=get_cookies, args=(client, login_signin_url, qrid))
45 |         add_script_run_ctx(st.session_state["Thread"], get_script_run_ctx())  # attach the Streamlit script context so st.* calls work inside the thread
46 |         st.session_state["Thread"].start()
47 | 
48 | cols = st.columns([1, 1, 15])
49 | cols[0].button("更新", key="update", on_click=scan_code, type="secondary", use_container_width=True)
50 | if cols[1].button("刷新", key="rerun", type="secondary", use_container_width=True):
51 |     st.rerun()
52 | st.write(cookies_config)
--------------------------------------------------------------------------------
/web/web_pages/搜索/一级评论搜索.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from util import get_comment1_data, db, Comment1Record, process_comment_documents
3 | 
4 | cols = st.columns([4, 4, 3, 1, 2, 2], vertical_alignment="bottom")
5 | cols[0].text_input("uid 列表(用空格分隔)", value="2035895904 1749277070", key="uid")
6 | cols[1].text_input("mid 列表(用空格分隔)", value="5096904217856018 5045463240409185", key="mid")
7 | cols[2].text_input("存储表名", value="test", key="table_name")
8 | 
9 | cols[-1].button("搜索", type="primary", key="comment1_button")
10 | 
11 | if st.session_state["comment1_button"]:
12 |     uids = st.session_state["uid"].split()
13 |     mids = st.session_state["mid"].split()
14 | 
15 |     if st.session_state["table_name"] == "" or mids == [] or uids == []:
16 |         st.warning("uid列表、mid列表、存储表名不能为空")
17 |     elif len(mids) != len(uids):
18 |         st.warning("uid列表和mid列表长度必须一致")
19 |     else:
20 |         with st.spinner("搜索中(进展在控制台)..."):
21 |             res_ids = get_comment1_data(uid=uids, mid=mids, table_name=st.session_state["table_name"])
22 |         with st.spinner("导入中(进展在控制台)..."):
23 |             records = db.sync_get_records_by_ids(Comment1Record, res_ids)
24 |             documents = [record.json_data for record in records]
25 |             st.session_state["comment1"] = process_comment_documents(documents)
26 | 
27 | if "comment1" in st.session_state:
28 |     st.dataframe(st.session_state["comment1"])
--------------------------------------------------------------------------------
/web/web_pages/搜索/二级评论搜索.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from util import get_comment2_data, db, Comment2Record, process_comment_documents
3 | 
4 | cols = st.columns([4, 4, 3, 1, 2, 2], vertical_alignment="bottom")
5 | cols[0].text_input("uid 列表(用空格分隔)", value="1644114654 1644114654 1644114654", key="uid")
6 | cols[1].text_input("mid 列表(用空格分隔)", value="5045280045531535 5045270515551948 5045277713760776", key="mid")
7 | cols[2].text_input("存储表名", value="test", key="table_name")
8 | 
9 | cols[-1].button("搜索", type="primary", key="comment2_button")
10 | 
11 | if st.session_state["comment2_button"]:
12 |     uids = st.session_state["uid"].split()
13 |     mids = st.session_state["mid"].split()
14 | 
15 |     if st.session_state["table_name"] == "" or mids == [] or uids == []:
16 |         st.warning("uid列表、mid列表、存储表名不能为空")
17 |     elif len(mids) != len(uids):
18 |         st.warning("uid列表和mid列表长度必须一致")
19 |     else:
20 |         with st.spinner("搜索中(进展在控制台)..."):
21 |             res_ids = get_comment2_data(uid=uids, mid=mids, table_name=st.session_state["table_name"])
22 |         with st.spinner("导入中(进展在控制台)..."):
23 |             records = db.sync_get_records_by_ids(Comment2Record, res_ids)
24 |             documents = [record.json_data for record in records]
25 |             st.session_state["comment2"] = process_comment_documents(documents)
26 | 
27 | if "comment2" in st.session_state:
28 |     st.dataframe(st.session_state["comment2"])
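The search pages above all follow the same pattern: read the inputs, call the matching downloader in WeiBoCrawler.pack, re-read the stored records by their ids, and flatten them with a parse helper before handing the result to st.dataframe. Below is a minimal sketch of driving that same pipeline from a plain script instead of the web UI; it assumes the script is run from the repository root (so the sys.path trick from web/util/__init__.py applies), that the uid/mid values are placeholders, and that process_comment_documents returns a pandas DataFrame, as the pages imply.

# fetch_comment1.py -- hypothetical standalone script, not part of this repository
import sys
sys.path.append(".")  # same trick as web/util/__init__.py: import WeiBoCrawler from the repo root

from WeiBoCrawler.database import db, Comment1Record
from WeiBoCrawler.pack import get_comment1_data
from WeiBoCrawler.parse import process_comment_documents

uids = ["2035895904"]        # placeholder uid
mids = ["5096904217856018"]  # placeholder mid; must pair 1:1 with uids
table_name = "test"

# download first-level comments; the ids of the newly stored records are returned
res_ids = get_comment1_data(uid=uids, mid=mids, table_name=table_name)

# re-read the raw JSON documents from the database and flatten them into a table
records = db.sync_get_records_by_ids(Comment1Record, res_ids)
documents = [record.json_data for record in records]
df = process_comment_documents(documents)
df.to_csv("comment1.csv", index=False)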
--------------------------------------------------------------------------------
/web/web_pages/搜索/列表搜索.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from util import get_list_data, db, BodyRecord, process_list_documents
3 | from datetime import date
4 | 
5 | cols = st.columns([3, 3, 1, 1, 2, 2, 2, 2], vertical_alignment="bottom")
6 | cols[0].text_input("搜索内容(话题需要在前后加上#)", value="姜平", key="search_for")
7 | cols[1].text_input("存储表名", value="test", key="table_name")
8 | cols[2].selectbox("搜索类型", options=["综合", "实时", "高级"], key="kind")
9 | cols[3].selectbox("筛选条件", options=["综合", "热度", "原创"], key="advanced_kind", disabled=st.session_state["kind"] != "高级")
10 | cols[4].date_input("起始时间", value="today", min_value=date(year=2000, month=1, day=1), key="start", disabled=st.session_state["kind"] != "高级")
11 | cols[5].date_input("结束时间", value="today", key="end", min_value=date(year=2000, month=1, day=1), disabled=st.session_state["kind"] != "高级")
12 | 
13 | cols[-1].button("搜索", type="primary", key="list_button")
14 | 
15 | if st.session_state["list_button"]:
16 |     if st.session_state["search_for"] == "" or st.session_state["table_name"] == "":
17 |         st.warning("搜索内容和存储表名不能为空")
18 |     else:
19 |         with st.spinner("搜索中(进展在控制台)..."):
20 |             res_ids = get_list_data(search_for=st.session_state["search_for"], table_name=st.session_state["table_name"],
21 |                                     kind=st.session_state["kind"], advanced_kind=st.session_state["advanced_kind"], time_start=st.session_state["start"], time_end=st.session_state["end"])
22 |         with st.spinner("导入中(进展在控制台)..."):
23 |             records = db.sync_get_records_by_ids(BodyRecord, res_ids)
24 |             documents = [record.json_data for record in records]
25 |             st.session_state["list"] = process_list_documents(documents)
26 | 
27 | if "list" in st.session_state:
28 |     st.dataframe(st.session_state["list"])
--------------------------------------------------------------------------------
/web/web_pages/搜索/详细页搜索.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from util import get_body_data, db, BodyRecord, process_body_documents
3 | 
4 | cols = st.columns([7, 3, 2, 2, 2], vertical_alignment="bottom")
5 | cols[0].text_input("搜索id列表(用空格分隔)", value="OEEV7wXHY Oj0PXme8I OiZre8dir Oj0zUmucE", key="ids")
6 | cols[1].text_input("存储表名", value="test", key="table_name")
7 | 
8 | cols[-1].button("搜索", type="primary", key="body_button")
9 | 
10 | if st.session_state["body_button"]:
11 |     ids = st.session_state["ids"].split()
12 |     if st.session_state["table_name"] == "" or ids == []:
13 |         st.warning("搜索id列表和存储表名不能为空")
14 |     else:
15 |         with st.spinner("搜索中(进展在控制台)..."):
16 |             res_ids = get_body_data(id=ids, table_name=st.session_state["table_name"])
17 |         with st.spinner("导入中(进展在控制台)..."):
18 |             records = db.sync_get_records_by_ids(BodyRecord, res_ids)
19 |             documents = [record.json_data for record in records]
20 |             st.session_state["body"] = process_body_documents(documents)
21 | 
22 | if "body" in st.session_state:
23 |     st.dataframe(st.session_state["body"])
--------------------------------------------------------------------------------
/web/web_pages/查询/查询.py:
--------------------------------------------------------------------------------
1 | from util import db
2 | import streamlit as st
3 | import pandas as pd
4 | 
5 | cols = st.columns([10, 1], vertical_alignment="bottom")
6 | 
7 | cols[0].text_input(label="sql(切记这里要记得写limit,不然卡死你)", placeholder="写sql语句", value="select * from BodyRecord limit 100 offset 10;", key="sql")
8 | cols[1].button("执行sql", key="sql_button")
9 | 
10 | if st.session_state.get("sql_button"):
11 |     df = pd.DataFrame(db.sql(st.session_state.sql))
12 |     st.session_state["sql_result"] = df
13 | 
14 | 
15 | if "sql_result" in st.session_state:
16 |     st.write(st.session_state["sql_result"])
--------------------------------------------------------------------------------
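The SQL page above is a thin wrapper around db.sql(): it runs whatever statement is typed in and wraps the rows in a DataFrame. The same call can be used from a one-off script to export query results for analysis outside the app. A minimal sketch follows, under the assumptions that it is run from the repository root and that db.sql() returns rows pandas can consume directly, as the page's own pd.DataFrame(db.sql(...)) suggests; the output filename is arbitrary.

# export_query.py -- hypothetical export script, not part of this repository
import sys
sys.path.append(".")  # import the WeiBoCrawler package from the repo root

import pandas as pd
from WeiBoCrawler.database import db

# keep a LIMIT on large tables, as the query page's input label warns
rows = db.sql("select * from BodyRecord limit 100;")
df = pd.DataFrame(rows)
df.to_csv("body_records.csv", index=False)
print(df.shape)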