├── .bat
├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       ├── custom.md
│       └── feature_request.md
├── .gitignore
├── .python-version
├── Images
│   ├── Cookie.png
│   ├── Icon.png
│   ├── SQL.png
│   ├── qrcode.png
│   ├── 一级.png
│   ├── 二级.png
│   ├── 列表.png
│   ├── 各种ID.png
│   ├── 处理二维码.png
│   ├── 微博主体处理流程.png
│   ├── 数据流向.png
│   ├── 详细.png
│   └── 页面展示.png
├── README.md
├── WeiBoCrawler
│   ├── README.md
│   ├── __init__.py
│   ├── config.toml
│   ├── database
│   │   ├── __init__.py
│   │   ├── sql.py
│   │   └── sql_record.py
│   ├── pack
│   │   ├── BaseDownloader.py
│   │   ├── __init__.py
│   │   ├── get_body_data.py
│   │   ├── get_comment1_data.py
│   │   ├── get_comment2_data.py
│   │   └── get_list_data.py
│   ├── parse
│   │   ├── __init__.py
│   │   ├── parse_list_html.py
│   │   ├── process_body.py
│   │   ├── process_comment.py
│   │   └── process_list.py
│   ├── request
│   │   ├── __init__.py
│   │   ├── get_body_request.py
│   │   ├── get_comment_request.py
│   │   ├── get_cookies.py
│   │   ├── get_list_request.py
│   │   ├── get_rum_request(unuse).py
│   │   ├── request.toml
│   │   └── util.py
│   └── util
│       ├── __init__.py
│       ├── cookie.py
│       ├── custom.py
│       ├── database.py
│       ├── decorator.py
│       ├── log.py
│       ├── path.py
│       ├── process.py
│       └── show_qrcode.py
├── pyproject.toml
├── requirements.txt
├── uv.lock
└── web
    ├── main.py
    ├── util
    │   └── __init__.py
    └── web_pages
        ├── Cookie
        │   └── Cookie.py
        ├── 搜索
        │   ├── 一级评论搜索.py
        │   ├── 二级评论搜索.py
        │   ├── 列表搜索.py
        │   └── 详细页搜索.py
        └── 查询
            └── 查询.py
/.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | .venv\Scripts\streamlit.exe run web/main.py
3 | pause
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/custom.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Custom issue template
3 | about: Describe this issue template's purpose here.
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.json
3 |
4 |
5 | demo.*
6 | test.*
7 | test_*
8 |
9 | # uv.lock
10 | # pyproject.toml
11 | # .python-version
12 |
13 | .vscode/
14 | app.log
15 |
16 | 数据库.db
17 |
--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
1 | 3.10
2 |
--------------------------------------------------------------------------------
/Images/Cookie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/Cookie.png
--------------------------------------------------------------------------------
/Images/Icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/Icon.png
--------------------------------------------------------------------------------
/Images/SQL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/SQL.png
--------------------------------------------------------------------------------
/Images/qrcode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/qrcode.png
--------------------------------------------------------------------------------
/Images/一级.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/一级.png
--------------------------------------------------------------------------------
/Images/二级.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/二级.png
--------------------------------------------------------------------------------
/Images/列表.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/列表.png
--------------------------------------------------------------------------------
/Images/各种ID.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/各种ID.png
--------------------------------------------------------------------------------
/Images/处理二维码.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/处理二维码.png
--------------------------------------------------------------------------------
/Images/微博主体处理流程.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/微博主体处理流程.png
--------------------------------------------------------------------------------
/Images/数据流向.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/数据流向.png
--------------------------------------------------------------------------------
/Images/详细.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/详细.png
--------------------------------------------------------------------------------
/Images/页面展示.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/Images/页面展示.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# WeiBoCrawler
# Welcome! If you find it useful, please drop a star 🌟! 🤗

😉😉😉 **This project is intended to be maintained long-term; Pull requests are welcome, come be a Contributor!** 😉😉😉

😘😘😘 **If you find a bug, report it via [Issues](https://github.com/zhouyi207/WeiBoCrawler/issues) or reach out on WeChat: woyaolz!** 😘😘😘

### 😁 What is this project?

This project collects data from Weibo, including detail-page content, comments, repost counts, like counts and comment counts, to make gathering data for academic research easier.

### 😋 Why use this project?

- **Simple:** quick to pick up; a few lines of code are enough to collect data (see the sketch after this list).
- **Fast:** asynchronous requests and asynchronous storage greatly speed up collection.
- **Visual:** a Streamlit interface makes collecting and querying data convenient.
- **Database:** TinyDB has been replaced with a SQL database, so a custom database can be connected.
- **Cookies:** no manual cookie entry; scan a QR code and the cookies are fetched automatically.

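A minimal sketch of that programmatic use, based on the function signatures under `WeiBoCrawler/pack` (the ids below are placeholders, not real Weibo ids):

```python
from WeiBoCrawler.pack import get_body_data, get_comment1_data

# Download one detail page into the "demo" table; the return value is the
# list of database ids that were written.
body_ids = get_body_data("PLACEHOLDER_DETAIL_ID", table_name="demo")

# Download its first-level comments (uid / mid are placeholder values).
comment_ids = get_comment1_data(uid="1234567890", mid="9876543210", table_name="demo")
```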
### 🥂 Updates and fixes
- 2025.04.11 Fixed the advanced-search date picker only allowing dates within a 10-year range.
- 2025.03.31 Fixed the advanced-search time handling and removed the irrelevant "you may be interested in" recommendation entries from the search results.
- 2025.03.02 The web front end now fetches cookies in a thread, and the QR code is rendered in the page instead of through PIL.Image.
- 2025.02.23 Added an error message: cookies must be fetched before the config.toml file can be generated, otherwise an error is raised.

## 🚤 Quick start

### 1. Download the project

In a directory of your choice, **clone the project with git** or **download the zip archive and unpack it**.

```bash
git clone https://github.com/zhouyi207/WeiBoCrawler.git
```

### 2. Install the dependencies

In the project root, **install the dependencies with pip**. Note that the project uses Python 3.10.

```bash
pip install -r requirements.txt
```

### 3. Run the program

In the project root, **start the app with streamlit**.

```bash
streamlit run web/main.py
```

*(screenshot: up and running 🥳🥳🥳)*

## 🎨 Interface

### 1. List search

*(screenshot)*

### 2. Detail-page search

*(screenshot)*

### 3. First-level comment search

*(screenshot)*

### 4. Second-level comment search

*(screenshot)*

### 5. SQL database queries

*(screenshot)*

## 🧑‍🎓 Project internals

### 1. Post processing

*(diagram)*

### 2. UID and MID

*(diagram)*

### 3. Data flow

*(diagram)*

## 📱 Contact

*(WeChat QR code)*

## ⚠️⚠️⚠️ Note

This project is intended for academic research only. **Do not use it for commercial purposes.**
--------------------------------------------------------------------------------
/WeiBoCrawler/README.md:
--------------------------------------------------------------------------------
1 | - [x] done
2 | - [ ] not done
3 |
4 | 2025.01.28
5 | - [ ] The flow request param selects sorting by hot or by time: 1 means hot, 2 means time. Currently only comment_request uses a varying value for this field; the other requests keep it fixed.
6 | - [x] The progress bars in pack/get_comment1_data.py and get_comment2_data.py are wrong: the description and total need fixing. Because total is not set up front, it defaults to 100.
7 | - [ ] The progress bars are not pretty, especially for comment requests.
8 | - [x] pack can be refactored and decoupled with an abstract base class.
9 | - [x] Apart from get_comment1_data.py and get_comment2_data.py, async is barely used; tasks should be created first and then awaited with asyncio.gather(*tasks).
10 | - [x] Changed the wording "差距为已被" to "差距为 一倍".
11 |
12 | 2025.01.29
13 |
14 | - [x] Parse the database: add process_xxx_json(TinyDB.table) -> pd.DataFrame functions under the parse directory and implement deduplication there (TinyDB does not appear to deduplicate on its own).
15 | - [x] The request headers could be attached to the client. (Not possible: some requests need their headers handled per request.)
16 | - [x] Add the Weibo id parameter to the list request results, consistent with body.
17 | - [x] First pass at the front end: data display.
18 | - [x] Module imports should use paths relative to the file itself rather than the project path.
19 | - [x] drop_table_duplicates currently uses the simplest list-based deduplication; hashing or similar could be used later.
20 |
21 | 2025.1.30
22 |
23 | - [x] For a better database layout, merge on mid instead of displaying list / body / comment separately; this requires unifying the fields.
24 | - [ ] Since every raw response is stored in the database while the displayed results are post-processed, add a way to look up the source record for a given row.
25 | - [ ] Add __all__ = [] to util.
26 | - [x] Check the table state before and after a download and save the difference, so it is clear where the newly downloaded data starts.
27 | - [ ] In get_body_data the data table name is hard-coded to id; change it to take a given table name.
28 | - [ ] Add logging to BaseDownloader.py to watch the output.
29 | - [ ] A parameterized decorator raises an error when applied in the abstract class.
30 |
31 | Something like the following does not work...
32 |
33 |
34 | ```python
35 | def retry_timeout_decorator_asyncio(retry_times: int = 3) -> Callable:
36 | def _retry_timeout_decorator_asyncio(func: Callable) -> Callable:
37 | """超时重试装饰器(异步)
38 |
39 | Args:
40 | retry_times (int): 重试次数. Defaults to 3.
41 |
42 | Returns:
43 | Callable: 装饰后的函数
44 | """
45 | async def wrapper(*args, **kwargs): # 将 wrapper 改为异步函数
46 | attempts = 0
47 | while attempts < retry_times:
48 | try:
49 | return await func(*args, **kwargs) # 调用异步函数并使用 await
50 | except httpx.TimeoutException as e:
51 | attempts += 1
52 | if attempts < retry_times:
53 | logging.warning(f"请求超时,正在进行第 {attempts} 次重试...")
54 | else:
55 | logging.error(f"请求超时,重试次数已达到最大值,请检查网络连接或重试次数!错误原因{e}")
56 | return wrapper
57 | return _retry_timeout_decorator_asyncio
58 | ```
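For comparison, a hedged sketch of one way a parameterized async retry decorator could be written. `functools.wraps` copies the wrapped function's metadata (name, docstring, and attributes such as `__isabstractmethod__`), which the version above drops; this is only an illustration, not the decorator currently shipped in `WeiBoCrawler/util`.

```python
import functools
import logging
from typing import Callable

import httpx


def retry_timeout_decorator_asyncio(retry_times: int = 3) -> Callable:
    """Parameterized timeout-retry decorator for async functions (sketch)."""
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)  # preserve the wrapped coroutine's metadata
        async def wrapper(*args, **kwargs):
            for attempt in range(1, retry_times + 1):
                try:
                    return await func(*args, **kwargs)
                except httpx.TimeoutException as e:
                    if attempt < retry_times:
                        logging.warning(f"request timed out, retry {attempt}/{retry_times}...")
                    else:
                        logging.error(f"request timed out after {retry_times} attempts: {e}")
            # falls through and returns None if every attempt timed out
        return wrapper
    return decorator
```

It would then be applied with parentheses, e.g. `@retry_timeout_decorator_asyncio(retry_times=3)`.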
59 |
60 |
61 | 2025.01.31
62 |
63 | - [ ] TinyDB takes about 1s per insert once there are ~5,700 records, which is unacceptably slow; switch to a different database.
64 | - [ ] Decouple database so that a user-defined database can be plugged in.
65 | - [ ] When using sqlalchemy, setting expire_on_commit=False on the sessionmaker avoids automatically expiring object state on commit, which improves performance but can in principle lead to stale reads; for our single-threaded async usage, stale reads do not occur.
66 |
67 |
68 | With expire_on_commit=True on the sessionmaker, object state is refreshed automatically when the transaction commits. Taking the async case as an example:
69 |
70 | ```python
71 | async def async_add_records(self, records: list[ListRecord | BodyRecord | Comment1Record | Comment2Record ]) -> list[int]:
72 | """异步插入记录
73 |
74 | Args:
75 | records (list[ListRecord | BodyRecord | Comment1Record | Comment2Record ]): 记录列表
76 |
77 | Returns:
78 | list[int]: id列表
79 | """
80 | async with self.async_session() as session:
81 | try:
82 | session.add_all(records)
83 | await session.commit()
84 | return [record.id for record in records]
85 | except Exception as e:
86 | await session.rollback()
87 | logging.error(f"插入记录时出现异常: {e}", exc_info=True)
88 | return []
89 | ```
90 |
91 | If expire_on_commit=True, then on commit the objects' state is refreshed automatically, i.e. the data is re-queried from the database to keep it consistent. That refresh query is synchronous, however, while the session here is an async session, so a synchronous operation ends up being invoked inside an async session, which is a bug. The correct approach is to refresh the records asynchronously:
92 |
93 |
94 | ```python
95 | async def async_add_records(self, records: list[ListRecord | BodyRecord | Comment1Record | Comment2Record ]) -> list[int]:
96 | """异步插入记录
97 |
98 | Args:
99 | records (list[ListRecord | BodyRecord | Comment1Record | Comment2Record ]): 记录列表
100 |
101 | Returns:
102 | list[int]: id列表
103 | """
104 | async with self.async_session() as session:
105 | try:
106 | session.add_all(records)
107 | await session.commit()
108 | # 修改的地方
109 | ids = []
110 | for record in records:
111 | await session.refresh(record)
112 | ids.append(record.id)
113 | return ids
114 | except Exception as e:
115 | await session.rollback()
116 | logging.error(f"插入记录时出现异常: {e}", exc_info=True)
117 | return []
118 | ```
119 |
120 | That works.
121 |
122 |
123 | - [x] TinyDB needs about 1s per insert after 5,700 records, while SQLite still takes about 0.02s per insert after 150,000 records. TinyDB is out.
124 | - [ ] When configuring a sqlalchemy relationship where the child table has several foreign keys, passing foreign_keys to relationship() on the parent kept raising errors; only spelling out primaryjoin worked (see the sketch below).
125 | - [ ] sqlalchemy defines so many types that they are hard to keep track of; keeping an eye on **peewee** as a possible alternative.
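
That workaround is what `WeiBoCrawler/database/sql_record.py` ends up doing: the join conditions are spelled out with `primaryjoin` / `secondaryjoin` instead of `foreign_keys`. Abridged from that file (an excerpt, not a standalone example):

```python
# Abridged from WeiBoCrawler/database/sql_record.py (BodyRecord side of the
# many-to-many link): the joins are written out explicitly because the
# association table carries two column pairs per side.
comment1_records: Mapped[list["Comment1Record"]] = relationship(
    secondary="body_comment1_association",
    back_populates="body_records",
    primaryjoin="and_(BodyRecord.mid == body_comment1_association.c.body_mid, BodyRecord.uid == body_comment1_association.c.body_uid)",
    secondaryjoin="and_(Comment1Record.f_mid == body_comment1_association.c.comment1_f_mid, Comment1Record.f_uid == body_comment1_association.c.comment1_f_uid)",
)
```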
--------------------------------------------------------------------------------
/WeiBoCrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhouyi207/WeiBoCrawler/7ba003df2e0ab67c8304c12b8cc4d2288887b4a9/WeiBoCrawler/__init__.py
--------------------------------------------------------------------------------
/WeiBoCrawler/config.toml:
--------------------------------------------------------------------------------
1 | [database]
2 | path = "../数据库.db"
3 |
4 | [cookies]
5 | SCF = ""
6 | SUB = ""
7 | SUBP = ""
8 | ALF = ""
9 | SINAGLOBAL = ""
10 | _s_tentry = ""
11 | Apache = ""
12 | ULV = ""
13 | XSRF-TOKEN = ""
14 | PC_TOKEN = ""
15 | WBPSESS = ""
16 | ALC = ""
17 | X-CSRF-TOKEN = ""
18 |
19 | [cookies_info]
20 | update_time = "2025-02-20 20:02:45"
21 |
--------------------------------------------------------------------------------
/WeiBoCrawler/database/__init__.py:
--------------------------------------------------------------------------------
1 | from .sql import DatabaseManager, BodyRecord, Comment1Record, Comment2Record, RecordFrom
2 | from ..util import database_config
3 |
4 |
5 | db_path = database_config.path
6 |
7 | db = DatabaseManager(
8 | sync_db_url=f'sqlite:///{db_path}', # 同步模式
9 | async_db_url=f'sqlite+aiosqlite:///{db_path}' # 异步模式
10 | )
11 |
12 | __all__ = ["db", "BodyRecord", "Comment1Record", "Comment2Record", "RecordFrom"]
--------------------------------------------------------------------------------
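
For reference, a minimal sketch of using the `db` object exported above (assuming the package is importable; the SQL targets the default tables defined in `sql_record.py`):

```python
from WeiBoCrawler.database import db, BodyRecord

# Tables are created by DatabaseManager when the package is imported; list them.
print(db.sync_get_table_names())

# Page through stored detail-page records.
for record in db.sync_get_records(BodyRecord, limit=10, offset=0):
    print(record)

# Or run raw SQL; the result is a list of dicts.
rows = db.sql("SELECT mid, uid, search_for FROM BodyRecord LIMIT 5")
```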
/WeiBoCrawler/database/sql.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import select, inspect, create_engine, text
2 | from sqlalchemy.orm import sessionmaker
3 | from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession, async_sessionmaker
4 | from .sql_record import Base, BodyRecord, Comment1Record, Comment2Record, RecordFrom
5 | from ..util import logging
6 | from typing import Any
7 |
8 |
9 | class DatabaseManager:
10 | """数据库的增删改查
11 |
12 | """
13 | def __init__(self, sync_db_url: str, async_db_url: str):
14 | """初始化数据库
15 |
16 | Args:
17 | sync_db_url (str): 同步的数据库连接字符串
18 | async_db_url (str): 异步的数据库连接字符串
19 | """
20 | # 引擎
21 | self.sync_engine = create_engine(sync_db_url)
22 | self.async_engine = create_async_engine(async_db_url)
23 |
24 | # 会话工厂
25 | self.sync_session = sessionmaker(self.sync_engine, expire_on_commit=False)
26 | self.async_session = async_sessionmaker(self.async_engine, class_=AsyncSession, expire_on_commit=False)
27 |
28 | # 创建表
29 | self.sync_create_tables()
30 |
31 | def sync_create_tables(self):
32 | """同步创建表
33 |
34 | """
35 | Base.metadata.create_all(self.sync_engine)
36 |
37 | async def async_create_tables(self):
38 | """异步创建表
39 |
40 | """
41 | async with self.async_engine.begin() as conn:
42 | await conn.run_sync(Base.metadata.create_all)
43 |
44 |
45 | def sync_add_records(self, records: list[ BodyRecord | Comment1Record | Comment2Record ]) -> list[int]:
46 | """同步插入记录
47 |
48 | Args:
49 | records (list[ BodyRecord | Comment1Record | Comment2Record ]): 记录列表
50 |
51 | Returns:
52 | list[int]: id列表
53 | """
54 | with self.sync_session() as session:
55 | try:
56 | session.add_all(records)
57 | session.commit()
58 | return [record.id for record in records]
59 | except Exception as e:
60 | session.rollback()
61 | logging.error(f"插入记录时出现异常: {e}", exc_info=True)
62 | return []
63 |
64 | async def async_add_records(self, records: list[ BodyRecord | Comment1Record | Comment2Record ]) -> list[int]:
65 | """异步插入记录
66 |
67 | Args:
68 | records (list[ BodyRecord | Comment1Record | Comment2Record ]): 记录列表
69 |
70 | Returns:
71 | list[int]: id列表
72 | """
73 | async with self.async_session() as session:
74 | try:
75 | session.add_all(records)
76 | await session.commit()
77 | return [record.id for record in records]
78 | except Exception as e:
79 | await session.rollback()
80 | logging.error(f"插入记录时出现异常: {e}", exc_info=True)
81 | return []
82 |
83 | def sync_get_records_by_ids(self, model: BodyRecord | Comment1Record | Comment2Record , ids: list[int]) -> list[ BodyRecord | Comment1Record | Comment2Record ]:
84 | """同步查询记录
85 |
86 | Args:
87 | model ( BodyRecord | Comment1Record | Comment2Record ): 搜索类
88 | ids (list[int]): 搜索id列表
89 |
90 | Returns:
91 | list[ BodyRecord | Comment1Record | Comment2Record ]: 搜索列表
92 | """
93 | with self.sync_session() as session:
94 | return session.query(model).filter(model.id.in_(ids)).all()
95 |
96 | async def async_get_records_by_ids(self, model: BodyRecord | Comment1Record | Comment2Record , ids: list[int]) -> list[ BodyRecord | Comment1Record | Comment2Record ]:
97 | """异步查询记录
98 |
99 | Args:
100 | model ( BodyRecord | Comment1Record | Comment2Record ): 搜索类
101 | ids (list[int]): 搜索id列表
102 |
103 | Returns:
104 | list[ BodyRecord | Comment1Record | Comment2Record ]: 搜索列表
105 | """
106 | async with self.async_session() as session:
107 | stmt = select(model).where(model.id.in_(ids))
108 | result = await session.execute(stmt)
109 | return result.scalars().all()
110 |
111 | def sync_update_record(self, model: BodyRecord | Comment1Record | Comment2Record , record_id: int, **kwargs) -> BodyRecord | Comment1Record | Comment2Record :
112 | """同步更新记录
113 |
114 | Args:
115 | model ( BodyRecord | Comment1Record | Comment2Record ): 更新类
116 | record_id (int): 更新id
117 | kwargs: 更新的字段和值
118 |
119 | Returns:
120 | BodyRecord | Comment1Record | Comment2Record : 更新类
121 | """
122 | with self.sync_session() as session:
123 | record = session.get(model, record_id)
124 | if record:
125 | for key, value in kwargs.items():
126 | setattr(record, key, value)
127 | try:
128 | session.commit()
129 | except Exception as e:
130 | session.rollback()
131 | logging.error(f"更新记录时出现异常: {e}", exc_info=True)
132 | return record
133 |
134 | async def async_update_record(self, model: BodyRecord | Comment1Record | Comment2Record , record_id: int, **kwargs) -> BodyRecord | Comment1Record | Comment2Record :
135 | """异步更新记录
136 |
137 | Args:
138 | model ( BodyRecord | Comment1Record | Comment2Record ): 更新类
139 | record_id (int): 更新id
140 | kwargs: 更新的字段和值
141 |
142 | Returns:
143 | BodyRecord | Comment1Record | Comment2Record : 更新记录
144 | """
145 | async with self.async_session() as session:
146 | record = await session.get(model, record_id)
147 | if record:
148 | for key, value in kwargs.items():
149 | setattr(record, key, value)
150 | try:
151 | await session.commit()
152 | except Exception as e:
153 | await session.rollback()
154 | logging.error(f"更新记录时出现异常: {e}", exc_info=True)
155 | return record
156 |
157 | def sync_delete_record(self, model: BodyRecord | Comment1Record | Comment2Record , record_id: int) -> BodyRecord | Comment1Record | Comment2Record :
158 | """同步删除记录
159 |
160 | Args:
161 | model ( BodyRecord | Comment1Record | Comment2Record ): 删除类
162 | record_id (int): 删除id
163 |
164 | Returns:
165 | BodyRecord | Comment1Record | Comment2Record : 删除记录
166 | """
167 | with self.sync_session() as session:
168 | record = session.get(model, record_id)
169 | if record:
170 | try:
171 | session.delete(record)
172 | session.commit()
173 | except Exception as e:
174 | session.rollback()
175 | logging.error(f"删除记录时出现异常: {e}", exc_info=True)
176 | return record
177 |
178 | async def async_delete_record(self, model: BodyRecord | Comment1Record | Comment2Record , record_id: int) -> BodyRecord | Comment1Record | Comment2Record :
179 | """异步删除记录
180 |
181 | Args:
182 | model ( BodyRecord | Comment1Record | Comment2Record ): 删除类
183 | record_id (int): 删除id
184 | """
185 | async with self.async_session() as session:
186 | record = await session.get(model, record_id)
187 | if record:
188 | try:
189 | await session.delete(record)
190 | await session.commit()
191 | except Exception as e:
192 | await session.rollback()
193 | logging.error(f"删除记录时出现异常: {e}", exc_info=True)
194 | return record
195 |
196 | def sync_get_table_names(self) -> list[str]:
197 | """同步获取表名
198 |
199 | Returns:
200 | list[str]: 表名列表
201 | """
202 | inspector = inspect(self.sync_engine)
203 | return inspector.get_table_names()
204 |
205 | async def async_get_table_names(self) -> list[str]:
206 | """异步获取表名
207 |
208 | Returns:
209 | list[str]: 表名列表
210 | """
211 | inspector = inspect(self.sync_engine)
212 | return inspector.get_table_names()
213 |
214 | def sync_get_records(self, model: BodyRecord | Comment1Record | Comment2Record, limit: int = 100, offset: int = 0) -> list[BodyRecord | Comment1Record | Comment2Record]:
215 | """同步获取数据 limit 和 offset
216 |
217 | Args:
218 | model (BodyRecord | Comment1Record | Comment2Record): 数据类型
219 | limit (int, optional): 数据大小. Defaults to 100.
220 | offset (int, optional): 数据偏移. Defaults to 0.
221 |
222 | Returns:
223 | list[BodyRecord | Comment1Record | Comment2Record]: 数据列表
224 | """
225 | with self.sync_session() as session:
226 | records = session.query(model).limit(limit).offset(offset).all()
227 | return records
228 |
229 | async def async_get_records(self, model: BodyRecord | Comment1Record | Comment2Record, limit: int = 100, offset: int = 0):
230 | """异步获取数据 limit 和 offset
231 |
232 | Args:
233 | model (BodyRecord | Comment1Record | Comment2Record): 数据类型
234 | limit (int, optional): 数据大小. Defaults to 100.
235 | offset (int, optional): 数据偏移. Defaults to 0.
236 |
237 | Returns:
238 | list[BodyRecord | Comment1Record | Comment2Record]: 数据列表
239 | """
240 | async with self.async_session() as session:
241 |             result = await session.execute(select(model).limit(limit).offset(offset))
242 |             return result.scalars().all()
243 |
244 | # 异步未实现
245 | def sync_get_distinct_category_names(self, ModelCol:Any) -> list[str]:
246 | """同步获取唯一分类名称
247 |
248 | Args:
249 | ModelCol (Any): Model 的 Col 例如 User.name
250 |
251 | Returns:
252 | list[str]: 名称列表
253 | """
254 | with self.sync_session() as session:
255 | unique_names = session.query(ModelCol).distinct().all()
256 | return unique_names
257 |
258 | # 在这里直接写 SQL 吧,分类太多了..
259 |
260 | def sql(self, sql_query:str):
261 | """在数据库中写sql
262 |
263 | Args:
264 | sql (str): sql语句
265 |
266 | return: list
267 | """
268 | with self.sync_session() as session:
269 | result = session.execute(text(sql_query))
270 | data_as_dicts_auto = [dict(zip(result.keys(), row)) for row in result]
271 | return data_as_dicts_auto
272 |
273 | __all__ = ["BodyRecord", "Comment1Record", "Comment2Record", "RecordFrom", "DatabaseManager"]
--------------------------------------------------------------------------------
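
A similar sketch for the async half of `DatabaseManager` shown above (the field values are placeholders):

```python
import asyncio

from WeiBoCrawler.database import db, BodyRecord, RecordFrom


async def main() -> None:
    record = BodyRecord(
        mid=1, uid=2, search_for="demo",  # placeholder values
        record_from=RecordFrom.Api, json_data={},
    )
    new_ids = await db.async_add_records([record])   # returns the new primary keys
    stored = await db.async_get_records_by_ids(BodyRecord, new_ids)
    print(stored)


asyncio.run(main())
```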
/WeiBoCrawler/database/sql_record.py:
--------------------------------------------------------------------------------
1 | import enum
2 | from datetime import datetime
3 | from sqlalchemy import BigInteger, JSON, Text, ForeignKey, Enum
4 | from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
5 |
6 |
7 | class RecordFrom(enum.Enum):
8 | """主要在 BodyRecord 中使用,表示数据来源
9 |
10 | """
11 | Html = "html"
12 | Api = "api"
13 |
14 |
15 | class Base(DeclarativeBase):
16 | """初始化 registry 属性
17 |
18 | """
19 | ...
20 |
21 | class AbstractBase(Base):
22 | __abstract__ = True
23 | id: Mapped[int] = mapped_column(primary_key=True)
24 | mid: Mapped[int] = mapped_column(BigInteger)
25 | uid: Mapped[int] = mapped_column(BigInteger)
26 | search_for: Mapped[str] = mapped_column(Text)
27 | create_time: Mapped[datetime] = mapped_column(default=lambda: datetime.now())
28 | json_data: Mapped[dict] = mapped_column(JSON)
29 |
30 |
31 | class BodyComment1(Base):
32 | """定义 BodyRecord 与 Comment1Record 的关联表
33 |
34 | """
35 | __tablename__ = 'body_comment1_association'
36 | id: Mapped[int] = mapped_column(primary_key=True)
37 | body_mid: Mapped[int] = mapped_column(BigInteger, ForeignKey('BodyRecord.mid'))
38 | body_uid: Mapped[int] = mapped_column(BigInteger, ForeignKey('BodyRecord.uid'))
39 | comment1_f_mid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment1Record.mid'))
40 | comment1_f_uid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment1Record.uid'))
41 |
42 |
43 | class BodyComment2(Base):
44 | """定义 BodyRecord 与 Comment2Record 的关联表
45 |
46 | """
47 | __tablename__ = 'body_comment2_association'
48 | id: Mapped[int] = mapped_column(primary_key=True)
49 | body_uid: Mapped[int] = mapped_column(BigInteger, ForeignKey('BodyRecord.uid'))
50 | comment2_f_uid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment2Record.f_uid'))
51 |
52 |
53 | class Comment12(Base):
54 | """定义 Comment1Record 与 Comment2Record 的关联表
55 |
56 | """
57 | __tablename__ = 'comment1_comment2_association'
58 | id: Mapped[int] = mapped_column(primary_key=True)
59 | comment1_mid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment1Record.mid'))
60 | comment2_f_mid: Mapped[int] = mapped_column(BigInteger, ForeignKey('Comment2Record.f_mid'))
61 |
62 |
63 | class BodyRecord(AbstractBase):
64 | """存储 Body Record 的数据
65 |
66 | """
67 | __tablename__ = 'BodyRecord'
68 | record_from: Mapped[RecordFrom] = mapped_column(Enum(RecordFrom))
69 |
70 | # 定义关系字段
71 | comment1_records: Mapped[list["Comment1Record"]] = relationship(
72 | lazy=True,
73 | secondary="body_comment1_association",
74 | back_populates='body_records',
75 | primaryjoin="and_(BodyRecord.mid == body_comment1_association.c.body_mid, BodyRecord.uid == body_comment1_association.c.body_uid)",
76 | secondaryjoin="and_(Comment1Record.f_mid == body_comment1_association.c.comment1_f_mid, Comment1Record.f_uid == body_comment1_association.c.comment1_f_uid)",
77 | # cascade="all, delete-orphan", # 这里的 cascade 选项表示当 BodyRecord 被删除时,相关联的 Comment1Record 和 Comment2Record 也会被删除 !!!多对多禁止使用
78 | )
79 | comment2_records: Mapped[list["Comment2Record"]] = relationship(
80 | lazy=True,
81 | secondary="body_comment2_association",
82 | back_populates='body_records',
83 | primaryjoin="BodyRecord.uid == body_comment2_association.c.body_uid",
84 | secondaryjoin="Comment2Record.f_uid == body_comment2_association.c.comment2_f_uid",
85 | # cascade="all, delete-orphan", # 这里的 cascade 选项表示当 BodyRecord 被删除时,相关联的 Comment1Record 和 Comment2Record 也会被删除 !!!多对多禁止使用
86 | )
87 |
88 | def __repr__(self):
89 | return f"BodyRecord(id={self.id}, mid={self.mid}, uid={self.uid}, search_for='{self.search_for}', record_from='{self.record_from}', create_time={self.create_time})"
90 |
91 |
92 | class Comment1Record(AbstractBase):
93 | """存储 Comment Record 的数据
94 |
95 | """
96 | __tablename__ = 'Comment1Record'
97 | f_mid: Mapped[int] = mapped_column(BigInteger)
98 | f_uid: Mapped[int] = mapped_column(BigInteger)
99 |
100 | # 定义关系字段
101 | body_records: Mapped[list["BodyRecord"]] = relationship(
102 | secondary="body_comment1_association",
103 | back_populates='comment1_records',
104 | primaryjoin="and_(Comment1Record.f_mid == body_comment1_association.c.comment1_f_mid, Comment1Record.f_uid == body_comment1_association.c.comment1_f_uid)",
105 | secondaryjoin="and_(BodyRecord.mid == body_comment1_association.c.body_mid, BodyRecord.uid == body_comment1_association.c.body_uid)"
106 | )
107 | comment2_records: Mapped[list["Comment2Record"]] = relationship(
108 | secondary="comment1_comment2_association",
109 | back_populates='comment1_records',
110 | primaryjoin="Comment1Record.mid == comment1_comment2_association.c.comment1_mid",
111 | secondaryjoin="Comment2Record.f_mid == comment1_comment2_association.c.comment2_f_mid"
112 | )
113 |
114 | def __repr__(self):
115 | return f"Comment1Record(id={self.id}, mid={self.mid}, uid={self.uid}, f_mid={self.f_mid}, f_uid={self.f_uid}, search_for='{self.search_for}')"
116 |
117 |
118 | class Comment2Record(AbstractBase):
119 | """存储 Comment Record 的数据
120 |
121 | """
122 | __tablename__ = 'Comment2Record'
123 | f_mid: Mapped[int] = mapped_column(BigInteger)
124 | f_uid: Mapped[int] = mapped_column(BigInteger)
125 |
126 | # 定义关系字段
127 | body_records: Mapped[list["BodyRecord"]] = relationship(
128 | secondary="body_comment2_association",
129 | back_populates='comment2_records',
130 | primaryjoin="Comment2Record.f_uid == body_comment2_association.c.comment2_f_uid",
131 | secondaryjoin="BodyRecord.uid == body_comment2_association.c.body_uid"
132 | )
133 | comment1_records: Mapped[list["Comment1Record"]] = relationship(
134 | secondary="comment1_comment2_association",
135 | back_populates='comment2_records',
136 | primaryjoin="Comment2Record.f_mid == comment1_comment2_association.c.comment2_f_mid",
137 | secondaryjoin="Comment1Record.mid == comment1_comment2_association.c.comment1_mid"
138 | )
139 |
140 | def __repr__(self):
141 | return f"Comment2Record(id={self.id}, mid={self.mid}, uid={self.uid}, f_mid={self.f_mid}, f_uid={self.f_uid}, search_for='{self.search_for}')"
142 |
--------------------------------------------------------------------------------
/WeiBoCrawler/pack/BaseDownloader.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | from abc import ABC, abstractmethod
3 | from typing import Any
4 |
5 | import httpx
6 | from pydantic import BaseModel
7 | from ..database import db, BodyRecord, Comment1Record, Comment2Record, RecordFrom
8 | from ..util import CustomProgress, cookies_config, log_function_params, logging
9 |
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 | class CommentID(BaseModel):
14 | uid: str
15 | mid: str
16 |
17 |
18 | class BaseDownloader(ABC):
19 | def __init__(self, *, table_name: str, concurrency: int = 100):
20 | self.table_name = table_name
21 | self.semaphore = asyncio.Semaphore(concurrency)
22 | self.db = db
23 | self.res_ids = []
24 |
25 | @abstractmethod
26 | def _get_request_description(self) -> str:
27 | """获取进度条描述
28 |
29 | Returns:
30 | str: 进度条描述
31 | """
32 | ...
33 |
34 | @abstractmethod
35 | def _get_request_params(self) -> list:
36 | """获取请求参数列表
37 |
38 | Returns:
39 | list: 请求参数列表
40 | """
41 | ...
42 |
43 | @abstractmethod
44 | def _process_response(self, response: httpx.Response, *, param: Any) -> None:
45 | """处理请求并存储数据
46 |
47 | Args:
48 | response (httpx.Response): 需要处理的请求
49 | param (Any): 请求参数
50 | """
51 | ...
52 |
53 | @abstractmethod
54 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None:
55 | """处理请求并存储数据
56 |
57 | Args:
58 | response (httpx.Response): 需要处理的请求
59 | param (Any): 请求参数
60 | """
61 | ...
62 |
63 | @abstractmethod
64 | async def _download_single_asyncio(self, *, param:Any, client:httpx.Response, progress:CustomProgress, overall_task:int):
65 | """下载单个请求(异步)
66 |
67 | Args:
68 | param (Any): 请求参数
69 | client (httpx.Response): 请求客户端
70 | progress (CustomProgress): 进度条
71 | overall_task (int): 进度条任务ID
72 | """
73 | ...
74 |
75 | @abstractmethod
76 | def _download_single_sync(self, *, param: Any, client:httpx.Response, progress:CustomProgress, overall_task:int):
77 | """下载单个请求(同步)
78 |
79 | Args:
80 | param (Any): 请求参数
81 | client (httpx.Response): 请求客户端
82 | progress (CustomProgress): 进度条
83 | overall_task (int): 进度条任务ID
84 | """
85 | ...
86 |
87 | def _save_to_database(self, items: list[BodyRecord | Comment1Record | Comment2Record]) -> None:
88 | """保存数据到数据库
89 |
90 | Args:
91 | items (list[dict]): 数据列表
92 | """
93 | res_ids = self.db.sync_add_records(items)
94 | self.res_ids.extend(res_ids)
95 |
96 | async def _save_to_database_asyncio(self, items: list[BodyRecord | Comment1Record | Comment2Record]) -> None:
97 | """保存数据到数据库(异步)
98 |
99 | Args:
100 | items (list[dict]): 数据列表
101 | """
102 | res_ids = await self.db.async_add_records(items)
103 | self.res_ids.extend(res_ids)
104 |
105 | @log_function_params(logger=logger)
106 | def _check_response(self, response: httpx.Response) -> bool:
107 | """检查响应是否正常
108 |
109 | Args:
110 | response (httpx.Response): 接受到的响应
111 |
112 | Returns:
113 | bool: 有问题返回 False, 否则返回 True
114 | """
115 | return response.status_code == httpx.codes.OK
116 |
117 |
118 | async def _download_asyncio(self):
119 | """异步下载数据
120 |
121 | """
122 | with CustomProgress() as progress:
123 | overall_task = progress.add_task(
124 | description=self._get_request_description(), total=len(self._get_request_params())
125 | )
126 | async with httpx.AsyncClient(cookies=cookies_config.cookies) as client:
127 |                 async def _limited(param):
128 |                     # hold the semaphore while the request runs, so that at most
129 |                     # `concurrency` downloads are actually in flight at the same time
130 |                     async with self.semaphore:
131 |                         await self._download_single_asyncio(
132 |                             param=param, client=client,
133 |                             progress=progress, overall_task=overall_task,
134 |                         )
135 |
136 |                 tasks = [
137 |                     asyncio.create_task(_limited(param)) for param in self._get_request_params()
138 |                 ]
139 |                 await asyncio.gather(*tasks)
140 |
141 | def _download_sync(self):
142 | """同步下载数据
143 |
144 | """
145 | with CustomProgress() as progress:
146 | overall_task = progress.add_task(
147 | description=self._get_request_description(), total=len(self._get_request_params())
148 | )
149 | with httpx.Client(cookies=cookies_config.cookies) as client:
150 |                 for param in self._get_request_params():
151 |                     self._download_single_sync(param=param, client=client, progress=progress, overall_task=overall_task)
152 |
153 | def download(self, asynchrony: bool = True) -> None:
154 | """整合异步下载和同步下载
155 |
156 | asynchrony = True 异步下载
157 | asynchrony = False 普通下载
158 |
159 | Args:
160 | asynchrony (bool, optional): 异步下载或者普通下载. Defaults to True.
161 | """
162 | if asynchrony:
163 | try:
164 | loop = asyncio.get_running_loop()
165 | loop.run_until_complete(self._download_asyncio())
166 | except RuntimeError:
167 | asyncio.run(self._download_asyncio())
168 | else:
169 | self._download_sync()
170 |
171 |
172 | __all__ = ["BaseDownloader", "BodyRecord", "Comment1Record", "Comment2Record", "RecordFrom"]
--------------------------------------------------------------------------------
/WeiBoCrawler/pack/__init__.py:
--------------------------------------------------------------------------------
1 | from .get_list_data import get_list_data
2 | from .get_body_data import get_body_data
3 | from .get_comment1_data import get_comment1_data
4 | from .get_comment2_data import get_comment2_data
5 |
6 |
7 | __all__ = [
8 | "get_list_data",
9 | "get_body_data",
10 | "get_comment1_data",
11 | "get_comment2_data",
12 | ]
--------------------------------------------------------------------------------
/WeiBoCrawler/pack/get_body_data.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from typing import Any
3 | from ..util import CustomProgress, retry_timeout_decorator, retry_timeout_decorator_asyncio
4 | from ..parse import process_body_resp
5 | from .BaseDownloader import BaseDownloader, BodyRecord, RecordFrom
6 | from ..request import get_body_response, get_body_response_asyncio
7 |
8 |
9 | class Downloader(BaseDownloader):
10 | def __init__(self, id: list[str] | str, *, table_name: str, concurrency: int = 100):
11 | """下载 Body 页面数据, 并保存在数据库的 id 表中, 数据库位置在 database_config 中.
12 |
13 | Args:
14 | id (Union[List[str], str]): 微博详细页 id, 或者 id 列表.
15 | table_name (str): 存储的位置(数据表名)
16 | concurrency (int, optional): 异步最大并发. Defaults to 100.
17 | """
18 | super().__init__(table_name=table_name, concurrency=concurrency)
19 |
20 | if isinstance(id, str):
21 | self.ids = [id]
22 | else:
23 | self.ids = id
24 |
25 | def _get_request_description(self) -> str:
26 | """获取进度条描述
27 |
28 | Returns:
29 | str: 进度条描述
30 | """
31 | return "download..."
32 |
33 | def _get_request_params(self) -> list:
34 | """获取请求参数列表
35 |
36 | Returns:
37 | list: 请求参数列表
38 | """
39 | return self.ids
40 |
41 | def _process_items(self, items: list[dict]) -> list[BodyRecord]:
42 |         """Build BodyRecord rows from the parsed items.
43 |
44 |         Args:
45 |             items (list[dict]): parsed data items
46 |
47 |         Returns:
48 |             list[BodyRecord]: database records to insert
49 |         """
50 | records = []
51 | for item in items:
52 | mid = item.get("mid", None)
53 | uid = item.get("uid", None)
54 | record = BodyRecord(
55 | mid=mid,
56 | uid=uid,
57 | search_for=self.table_name,
58 | record_from=RecordFrom.Api,
59 | json_data = item
60 | )
61 | records.append(record)
62 | return records
63 |
64 | def _process_response(self, response: httpx.Response, *, param: Any) -> None:
65 | """处理请求并存储数据
66 |
67 | Args:
68 | response (httpx.Response): 需要处理的请求
69 | param (Any): 请求参数
70 | """
71 | items = process_body_resp(response)
72 | records = self._process_items(items)
73 | self._save_to_database(records)
74 |
75 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None:
76 | """处理请求并存储数据
77 |
78 | Args:
79 | response (httpx.Response): 需要处理的请求
80 | param (Any): 请求参数
81 | """
82 | items = process_body_resp(response)
83 | records = self._process_items(items)
84 | await self._save_to_database_asyncio(records)
85 |
86 | @retry_timeout_decorator_asyncio
87 | async def _download_single_asyncio(self, *, param:Any, client:httpx.Response, progress:CustomProgress, overall_task:int):
88 | """下载单个请求(异步)
89 |
90 | Args:
91 | param (Any): 请求参数
92 | client (httpx.Response): 请求客户端
93 | progress (CustomProgress): 进度条
94 | overall_task (int): 进度条任务ID
95 | """
96 | response = await get_body_response_asyncio(
97 | id=param,
98 | client=client)
99 |
100 | if self._check_response(response):
101 | await self._process_response_asyncio(response, param=param)
102 |
103 | progress.update(overall_task, advance=1, description=f"{param}")
104 |
105 | @retry_timeout_decorator
106 | def _download_single_sync(self, *, param: Any, client:httpx.Response, progress:CustomProgress, overall_task:int):
107 | """下载单个请求(同步)
108 |
109 | Args:
110 | param (Any): 请求参数
111 | client (httpx.Response): 请求客户端
112 | progress (CustomProgress): 进度条
113 | overall_task (int): 进度条任务ID
114 | """
115 | response = get_body_response(
116 | id=param,
117 | client=client)
118 | if self._check_response(response):
119 | self._process_response(response, param=param)
120 |
121 | progress.update(overall_task, advance=1, description=f"{param}")
122 |
123 |
124 |
125 | def get_body_data(id: list[str] | str, *, table_name:str, asynchrony: bool = True) -> list:
126 | """获取 body 页面数据
127 |
128 | Args:
129 | id (Union[List[str], str]): 微博详细页 id, 或者 id 列表.
130 | table_name (str): 存储的位置(数据表名)
131 |         asynchrony (bool, optional): download asynchronously when True. Defaults to True.
132 |
133 | Returns:
134 | list: 存储在数据库中的 id 列表
135 | """
136 | downloader = Downloader(id = id, table_name=table_name)
137 | downloader.download(asynchrony=asynchrony)
138 | return downloader.res_ids
139 |
--------------------------------------------------------------------------------
/WeiBoCrawler/pack/get_comment1_data.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from ..request import get_comments_l1_response, get_comments_l1_response_asyncio
3 | from ..parse import process_comment_resp
4 | from typing import List, Union, Any
5 | from ..util import CustomProgress, retry_timeout_decorator, retry_timeout_decorator_asyncio
6 | from .BaseDownloader import BaseDownloader, CommentID, Comment1Record
7 |
8 |
9 | class Downloader(BaseDownloader):
10 | def __init__(self, *, uid: Union[List[str], str], mid: Union[List[str], str], table_name: str, concurrency: int = 100, max_failed_times: int = 20) -> None:
11 | """根据 uid 和 mid 下载评论数据,并保存在数据库的 mid 表中, 数据库位置在 database_config 中
12 |
13 | Args:
14 | uid (Union[List[str], str]): 用户 ID
15 | mid (Union[List[str], str]): 信息 ID
16 | table_name (str): 存储的位置(数据表名)
17 | concurrency (int, optional): 最大异步并发. Defaults to 100.
18 | max_failed_times (int, optional): 最大失败次数. Defaults to 20.
19 |
20 | Raises:
21 | ValueError: uid and mid must be both str or list and the length of uid and mid must be equal.
22 | """
23 | super().__init__(table_name=table_name, concurrency=concurrency)
24 |
25 | if isinstance(uid, str) and isinstance(mid, str):
26 | self.ids = [CommentID(uid=uid, mid=mid)]
27 | elif isinstance(uid, list) and isinstance(mid, list) and len(uid) == len(mid):
28 | self.ids = [CommentID(uid=u, mid=m) for u, m in zip(uid, mid)]
29 | else:
30 | raise ValueError("uid and mid must be both str or list and the length of uid and mid must be equal")
31 |
32 | self.max_failed_times = max_failed_times
33 |
34 |
35 | def _get_request_description(self) -> str:
36 | """获取进度条描述
37 |
38 | Returns:
39 | str: 进度条描述
40 | """
41 | return "download..."
42 |
43 | def _get_request_params(self) -> list:
44 | """获取请求参数列表
45 |
46 | Returns:
47 | list: 请求参数列表
48 | """
49 | return self.ids
50 |
51 | def _process_items(self, items: list[dict]) -> list[Comment1Record]:
52 |         """Build Comment1Record rows from the parsed items.
53 |
54 |         Args:
55 |             items (list[dict]): parsed data items
56 |
57 |         Returns:
58 |             list[Comment1Record]: database records to insert
59 |         """
60 | records = []
61 | for item in items:
62 | f_mid = item.get("f_mid", None)
63 | f_uid = item.get("f_uid", None)
64 | mid = item.get("mid", None)
65 | uid = item.get("uid", None)
66 | record = Comment1Record(
67 | f_mid = f_mid,
68 | f_uid = f_uid,
69 | mid=mid,
70 | uid=uid,
71 | search_for=self.table_name,
72 | json_data = item
73 | )
74 | records.append(record)
75 | return records
76 |
77 | def _process_response(self, response: httpx.Response, *, param: Any) -> None:
78 | """处理请求并存储数据
79 |
80 | Args:
81 | response (httpx.Response): 需要处理的请求
82 | table_name (str): 存储的位置(数据表名)
83 | """
84 | resp_info, items = process_comment_resp(response)
85 |
86 | for item in items:
87 | item["f_mid"] = param.mid
88 | item["f_uid"] = param.uid
89 |
90 | records = self._process_items(items)
91 | self._save_to_database(records)
92 | return resp_info
93 |
94 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None:
95 | """处理请求并存储数据
96 |
97 | Args:
98 | response (httpx.Response): 需要处理的请求
99 | table_name (str): 存储的位置(数据表名)
100 | """
101 | resp_info, items = process_comment_resp(response)
102 |
103 | for item in items:
104 | item["f_mid"] = param.mid
105 | item["f_uid"] = param.uid
106 |
107 | records = self._process_items(items)
108 | await self._save_to_database_asyncio(records)
109 | return resp_info
110 |
111 | @retry_timeout_decorator_asyncio
112 | async def _download_single_asyncio(self, *, param:Any, client:httpx.Response, progress:CustomProgress, overall_task:int):
113 | """下载单个请求(异步)
114 | 1. 在这里首先处理第一个评论,因为第一个评论是不需要 max_id 的,所以这里单独处理
115 | 2. 处理每一个评论响应的时候,通过 _process_response 方法获取到 resp_info
116 | 3. 其中 resp_info 包含 max_id, total_number, data_number. 其中 max_id 用于下一个请求, total_number 和 data_number 用于判断是否下载完成
117 | 4. comment 请求有其独有的特性, 在请求次数较多时, 会出现请求失败的情况, 一般来说 failed_times 的上限为 15, 这里取保守值 20.
118 |
119 | Args:
120 | param (Any): 请求参数
121 | client (httpx.Response): 请求客户端
122 | progress (CustomProgress): 进度条
123 | overall_task (int): 进度条任务ID
124 | """
125 | response = await get_comments_l1_response_asyncio(uid=param.uid, mid=param.mid, client=client)
126 | if self._check_response(response):
127 | resp_info = await self._process_response_asyncio(response, param=param)
128 | max_id = resp_info.max_id
129 | total_number = resp_info.total_number
130 | count_data_number = resp_info.data_number
131 | failed_times = 0 if resp_info.data_number != 0 else 1
132 |
133 | task = progress.add_task(completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}")
134 |
135 | while (failed_times < self.max_failed_times) and (count_data_number < total_number):
136 | response = await get_comments_l1_response_asyncio(uid=param.uid, mid=param.mid, client=client, max_id=max_id)
137 | if self._check_response(response):
138 | resp_info = await self._process_response_asyncio(response, param=param)
139 | max_id = resp_info.max_id
140 | count_data_number += resp_info.data_number
141 | failed_times = 0 if resp_info.data_number != 0 else failed_times + 1
142 |
143 | progress.update(task, completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}")
144 |
145 | else:
146 | failed_times += 1
147 |
148 | progress.remove_task(task)
149 | progress.update(overall_task, advance=1, description=f"{param.mid}")
150 |
151 | @retry_timeout_decorator
152 | def _download_single_sync(self, *, param: Any, client:httpx.Response, progress:CustomProgress, overall_task:int):
153 | """下载单个请求(同步)
154 | 1. 在这里首先处理第一个评论,因为第一个评论是不需要 max_id 的,所以这里单独处理
155 | 2. 处理每一个评论响应的时候,通过 _process_response 方法获取到 resp_info
156 | 3. 其中 resp_info 包含 max_id, total_number, data_number. 其中 max_id 用于下一个请求, total_number 和 data_number 用于判断是否下载完成
157 | 4. comment 请求有其独有的特性, 在请求次数较多时, 会出现请求失败的情况, 一般来说 failed_times 的上限为 15, 这里取保守值 20.
158 |
159 | Args:
160 | param (Any): 请求参数
161 | client (httpx.Response): 请求客户端
162 | progress (CustomProgress): 进度条
163 | overall_task (int): 进度条任务ID
164 | """
165 | response = get_comments_l1_response(uid=param.uid, mid=param.mid, client=client)
166 | if self._check_response(response):
167 | resp_info = self._process_response(response, param=param)
168 | max_id = resp_info.max_id
169 | total_number = resp_info.total_number
170 | count_data_number = resp_info.data_number
171 | failed_times = 0 if resp_info.data_number != 0 else 1
172 |
173 | task = progress.add_task(completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}")
174 |
175 | while (failed_times < self.max_failed_times) and (count_data_number < total_number):
176 | response = get_comments_l1_response(uid=param.uid, mid=param.mid, client=client, max_id=max_id)
177 | if self._check_response(response):
178 | resp_info = self._process_response(response, param=param)
179 | max_id = resp_info.max_id
180 | count_data_number += resp_info.data_number
181 | failed_times = 0 if resp_info.data_number != 0 else failed_times + 1
182 |
183 | progress.update(task, completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}")
184 |
185 | else:
186 | failed_times += 1
187 |
188 | progress.remove_task(task)
189 | progress.update(overall_task, advance=1, description=f"{param.mid}")
190 |
191 |
192 | def get_comment1_data(uid: Union[List[str], str], mid: Union[List[str], str], *, table_name:str, asynchrony: bool = True) -> list:
193 | """根据 uid 和 mid 下载评论数据,并保存在数据库的 mid 表中, 数据库位置在 database_config 中
194 |
195 | Args:
196 | uid (Union[List[str], str]): 用户 ID
197 | mid (Union[List[str], str]): 信息 ID
198 | table_name (str): 存储的位置(数据表名)
199 | concurrency (int, optional): 最大异步并发. Defaults to 100.
200 |
201 | Raises:
202 | ValueError: uid and mid must be both str or list and the length of uid and mid must be equal.
203 |
204 | Returns:
205 | list: 存储在数据库中的 id 列表
206 | """
207 | downloader = Downloader(uid=uid, mid=mid, table_name=table_name)
208 | downloader.download(asynchrony=asynchrony)
209 | return downloader.res_ids
--------------------------------------------------------------------------------
/WeiBoCrawler/pack/get_comment2_data.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from ..request import get_comments_l2_response, get_comments_l2_response_asyncio
3 | from ..parse import process_comment_resp
4 | from typing import List, Union, Any
5 | from ..util import CustomProgress, retry_timeout_decorator, retry_timeout_decorator_asyncio
6 | from .BaseDownloader import BaseDownloader, CommentID, Comment2Record
7 |
8 |
9 |
10 | class Downloader(BaseDownloader):
11 | def __init__(self, *, uid: Union[List[str], str], mid: Union[List[str], str], table_name: str, concurrency: int = 100, max_failed_times: int = 20) -> None:
12 | """根据 uid 和 mid 下载评论数据,并保存在数据库的 mid 表中, 数据库位置在 database_config 中
13 |
14 | Args:
15 | uid (Union[List[str], str]): 用户 ID
16 | mid (Union[List[str], str]): 信息 ID
17 | table_name (str): 存储的位置(数据表名)
18 | concurrency (int, optional): 最大异步并发. Defaults to 100.
19 | max_failed_times (int, optional): 最大失败次数. Defaults to 20.
20 |
21 | Raises:
22 | ValueError: uid and mid must be both str or list and the length of uid and mid must be equal.
23 | """
24 | super().__init__(table_name=table_name ,concurrency=concurrency)
25 |
26 | if isinstance(uid, str) and isinstance(mid, str):
27 | self.ids = [CommentID(uid=uid, mid=mid)]
28 | elif isinstance(uid, list) and isinstance(mid, list) and len(uid) == len(mid):
29 | self.ids = [CommentID(uid=u, mid=m) for u, m in zip(uid, mid)]
30 | else:
31 | raise ValueError("uid and mid must be both str or list and the length of uid and mid must be equal")
32 |
33 | self.max_failed_times = max_failed_times
34 |
35 |
36 | def _get_request_description(self) -> str:
37 | """获取进度条描述
38 |
39 | Returns:
40 | str: 进度条描述
41 | """
42 | return "download..."
43 |
44 | def _get_request_params(self) -> list:
45 | """获取请求参数列表
46 |
47 | Returns:
48 | list: 请求参数列表
49 | """
50 | return self.ids
51 |
52 |
53 | def _process_items(self, items: list[dict]) -> list[Comment2Record]:
54 |         """Build Comment2Record rows from the parsed items.
55 |
56 |         Args:
57 |             items (list[dict]): parsed data items
58 |
59 |         Returns:
60 |             list[Comment2Record]: database records to insert
61 |         """
62 | records = []
63 | for item in items:
64 | f_mid = item.get("f_mid", None)
65 | f_uid = item.get("f_uid", None)
66 | mid = item.get("mid", None)
67 | uid = item.get("uid", None)
68 | record = Comment2Record(
69 | f_mid = f_mid,
70 | f_uid = f_uid,
71 | mid=mid,
72 | uid=uid,
73 | search_for=self.table_name,
74 | json_data = item
75 | )
76 | records.append(record)
77 | return records
78 |
79 | def _process_response(self, response: httpx.Response, *, param: Any) -> None:
80 | """处理请求并存储数据
81 |
82 | Args:
83 | response (httpx.Response): 需要处理的请求
84 | param (Any): 请求参数
85 | """
86 | resp_info, items = process_comment_resp(response)
87 | for item in items:
88 | item["f_mid"] = param.mid
89 | item["f_uid"] = param.uid
90 |
91 | records = self._process_items(items)
92 | self._save_to_database(records)
93 | return resp_info
94 |
95 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None:
96 | """处理请求并存储数据
97 |
98 | Args:
99 | response (httpx.Response): 需要处理的请求
100 | table_name (str): 存储的位置(数据表名)
101 | """
102 | resp_info, items = process_comment_resp(response)
103 |
104 | for item in items:
105 | item["f_mid"] = param.mid
106 | item["f_uid"] = param.uid
107 |
108 | records = self._process_items(items)
109 | await self._save_to_database_asyncio(records)
110 | return resp_info
111 |
112 | @retry_timeout_decorator_asyncio
113 | async def _download_single_asyncio(self, *, param:Any, client:httpx.Response, progress:CustomProgress, overall_task:int):
114 | """下载单个请求(异步)
115 | 1. 在这里首先处理第一个评论,因为第一个评论是不需要 max_id 的,所以这里单独处理
116 | 2. 处理每一个评论响应的时候,通过 _process_response 方法获取到 resp_info
117 | 3. 其中 resp_info 包含 max_id, total_number, data_number. 其中 max_id 用于下一个请求, total_number 和 data_number 用于判断是否下载完成
118 | 4. comment 请求有其独有的特性, 在请求次数较多时, 会出现请求失败的情况, 一般来说 failed_times 的上限为 15, 这里取保守值 20.
119 |
120 | Args:
121 | param (Any): 请求参数
122 | client (httpx.Response): 请求客户端
123 | progress (CustomProgress): 进度条
124 | overall_task (int): 进度条任务ID
125 | """
126 | response = await get_comments_l2_response_asyncio(uid=param.uid, mid=param.mid, client=client)
127 | if self._check_response(response):
128 | resp_info = await self._process_response_asyncio(response, param=param)
129 | max_id = resp_info.max_id
130 | total_number = resp_info.total_number
131 | count_data_number = resp_info.data_number
132 | failed_times = 0 if resp_info.data_number != 0 else 1
133 |
134 | task = progress.add_task(completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}")
135 |
136 | while (failed_times < self.max_failed_times) and (count_data_number < total_number):
137 | response = await get_comments_l2_response_asyncio(uid=param.uid, mid=param.mid, client=client, max_id=max_id)
138 | if self._check_response(response):
139 | resp_info = await self._process_response_asyncio(response, param=param)
140 | max_id = resp_info.max_id
141 | count_data_number += resp_info.data_number
142 | failed_times = 0 if resp_info.data_number != 0 else failed_times + 1
143 |
144 | progress.update(task, completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}")
145 |
146 | else:
147 | failed_times += 1
148 |
149 | progress.remove_task(task)
150 | progress.update(overall_task, advance=1, description=f"{param.mid}")
151 |
152 | @retry_timeout_decorator
153 | def _download_single_sync(self, *, param: Any, client:httpx.Response, progress:CustomProgress, overall_task:int):
154 | """下载单个请求(同步)
155 | 1. 在这里首先处理第一个评论,因为第一个评论是不需要 max_id 的,所以这里单独处理
156 | 2. 处理每一个评论响应的时候,通过 _process_response 方法获取到 resp_info
157 | 3. 其中 resp_info 包含 max_id, total_number, data_number. 其中 max_id 用于下一个请求, total_number 和 data_number 用于判断是否下载完成
158 | 4. comment 请求有其独有的特性, 在请求次数较多时, 会出现请求失败的情况, 一般来说 failed_times 的上限为 15, 这里取保守值 20.
159 |
160 | Args:
161 | param (Any): 请求参数
162 | client (httpx.Response): 请求客户端
163 | progress (CustomProgress): 进度条
164 | overall_task (int): 进度条任务ID
165 | """
166 | response = get_comments_l2_response(uid=param.uid, mid=param.mid, client=client)
167 | if self._check_response(response):
168 | resp_info = self._process_response(response, param=param)
169 | max_id = resp_info.max_id
170 | total_number = resp_info.total_number
171 | count_data_number = resp_info.data_number
172 | failed_times = 0 if resp_info.data_number != 0 else 1
173 |
174 | task = progress.add_task(completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}")
175 |
176 | while (failed_times < self.max_failed_times) and (count_data_number < total_number):
177 | response = get_comments_l2_response(uid=param.uid, mid=param.mid, client=client, max_id=max_id)
178 | if self._check_response(response):
179 | resp_info = self._process_response(response, param=param)
180 | max_id = resp_info.max_id
181 | count_data_number += resp_info.data_number
182 | failed_times = 0 if resp_info.data_number != 0 else failed_times + 1
183 |
184 | progress.update(task, completed=count_data_number, total=total_number, description=f"{param.mid}: failed_times - {failed_times}")
185 |
186 | else:
187 | failed_times += 1
188 |
189 | progress.remove_task(task)
190 | progress.update(overall_task, advance=1, description=f"{param.mid}")
191 |
192 | def get_comment2_data(uid: Union[List[str], str], mid: Union[List[str], str], *, table_name: str, asynchrony: bool = True) -> list:
193 | """根据 uid 和 mid 下载评论数据,并保存在数据库的 mid 表中, 数据库位置在 database_config 中
194 |
195 | Args:
196 | uid (Union[List[str], str]): 用户 ID
197 | mid (Union[List[str], str]): 信息 ID
198 | table_name (str): 存储的位置(数据表名)
199 |         asynchrony (bool, optional): 是否使用异步方式下载. Defaults to True.
200 |
201 | Raises:
202 | ValueError: uid and mid must be both str or list and the length of uid and mid must be equal.
203 |
204 | Returns:
205 | list: 存储在数据库中的 id 列表
206 | """
207 | downloader = Downloader(uid=uid, mid=mid, table_name=table_name)
208 | downloader.download(asynchrony=asynchrony)
209 | return downloader.res_ids
--------------------------------------------------------------------------------
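A minimal usage sketch for the downloader above. The uid/mid values and the table name are placeholders, and the import path simply mirrors the module layout shown here; only the get_comment2_data signature itself comes from the source.

from WeiBoCrawler.pack.get_comment2_data import get_comment2_data

# One uid/mid pair per parent comment whose second-level replies should be fetched;
# both arguments must be str or lists of equal length.
uids = ["1644114654", "1644114654"]
mids = ["5127059131334865", "5127059131334866"]

# Runs the async code path by default and writes the records into the "demo_comment2" table.
res_ids = get_comment2_data(uids, mids, table_name="demo_comment2", asynchrony=True)
print(f"stored {len(res_ids)} records")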
/WeiBoCrawler/pack/get_list_data.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from datetime import datetime
3 | from typing import Literal, Optional, Any
4 | from ..util import CustomProgress, retry_timeout_decorator, retry_timeout_decorator_asyncio
5 | from ..request import get_list_response_asyncio, get_list_response
6 | from ..parse import parse_list_html
7 | from .BaseDownloader import BaseDownloader, BodyRecord, RecordFrom
8 |
9 |
10 | class Downloader(BaseDownloader):
11 | def __init__(self, search_for: str, *, table_name: str, kind : Literal["综合", "实时", "高级"] = "综合",
12 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end:Optional[datetime]=None, concurrency: int = 100):
13 | """下载 List 页面数据, 并保存在数据库的 search_for 表中, 数据库位置在 database_config 中.
14 |
15 | Args:
16 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #
17 | table_name (str): 存储的位置(数据表名)
18 |             kind (Literal["综合", "实时", "高级"], optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合".
19 |             advanced_kind (Literal["综合", "热度", "原创"], optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合".
20 |             time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to None.
21 |             time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None.
22 | concurrency (int, optional): 异步最大并发. Defaults to 100.
23 | """
24 | super().__init__(table_name=table_name, concurrency=concurrency)
25 |
26 | self.search_for = search_for
27 | self.kind = kind
28 | self.advanced_kind = advanced_kind
29 | self.time_start = time_start
30 | self.time_end = time_end
31 |
32 |
33 | def _get_request_description(self) -> str:
34 | """获取进度条描述
35 |
36 | Returns:
37 | str: 进度条描述
38 | """
39 | return "download..."
40 |
41 | def _get_request_params(self) -> list:
42 | """获取请求参数列表
43 |
44 | Returns:
45 | list: 请求参数列表
46 | """
47 | return list(range(1, 51))
48 |
49 | def _process_items(self, items: list[dict]) -> list[BodyRecord]:
50 | """_summary_
51 |
52 | Args:
53 | items (list[dict]): _description_
54 |
55 | Returns:
56 | list[BodyRecord]: _description_
57 | """
58 | records = []
59 | for item in items:
60 | mid = item.get("mid", None)
61 | uid = item.get("uid", None)
62 | record = BodyRecord(
63 | mid=mid,
64 | uid=uid,
65 | search_for=self.table_name,
66 | record_from=RecordFrom.Html,
67 | json_data = item
68 | )
69 | records.append(record)
70 | return records
71 |
72 | def _process_response(self, response: httpx.Response, *, param: Any) -> None:
73 | """处理请求并存储数据
74 |
75 | Args:
76 | response (httpx.Response): 需要处理的请求
77 |             param (Any): 请求参数
78 | """
79 | items = parse_list_html(response.text)
80 | records = self._process_items(items)
81 | self._save_to_database(records)
82 |
83 | async def _process_response_asyncio(self, response: httpx.Response, *, param: Any) -> None:
84 | """处理请求并存储数据
85 |
86 | Args:
87 | response (httpx.Response): 需要处理的请求
88 | param (Any): 请求参数
89 | """
90 | items = parse_list_html(response.text)
91 | records = self._process_items(items)
92 | await self._save_to_database_asyncio(records)
93 |
94 | @retry_timeout_decorator_asyncio
95 |     async def _download_single_asyncio(self, *, param:Any, client:httpx.AsyncClient, progress:CustomProgress, overall_task:int):
96 | """下载单个请求(异步)
97 |
98 | Args:
99 | param (Any): 请求参数
100 |             client (httpx.AsyncClient): 请求客户端
101 | progress (CustomProgress): 进度条
102 | overall_task (int): 进度条任务ID
103 | """
104 | response = await get_list_response_asyncio(
105 | search_for=self.search_for,
106 | kind=self.kind,
107 | advanced_kind=self.advanced_kind,
108 | time_start=self.time_start,
109 | time_end=self.time_end,
110 | page_index=param,
111 | client=client)
112 |
113 | if self._check_response(response):
114 | await self._process_response_asyncio(response, param=param)
115 |
116 | progress.update(overall_task, advance=1, description=f"{param}...")
117 |
118 | @retry_timeout_decorator
119 |     def _download_single_sync(self, *, param: Any, client:httpx.Client, progress:CustomProgress, overall_task:int):
120 | """下载单个请求(同步)
121 |
122 | Args:
123 | param (Any): 请求参数
124 |             client (httpx.Client): 请求客户端
125 | progress (CustomProgress): 进度条
126 | overall_task (int): 进度条任务ID
127 | """
128 | response = get_list_response(
129 | search_for=self.search_for,
130 | kind=self.kind,
131 | advanced_kind=self.advanced_kind,
132 | time_start=self.time_start,
133 | time_end=self.time_end,
134 | page_index=param,
135 | client=client)
136 |
137 | if self._check_response(response):
138 | self._process_response(response, param=param)
139 |
140 | progress.update(overall_task, advance=1, description=f"{param}")
141 |
142 |
143 | def get_list_data(search_for: str, *, table_name: str, asynchrony: bool = True, kind : Literal["综合", "实时", "高级"] = "综合",
144 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end:Optional[datetime]=None) -> list:
145 | """获取 List 页面数据
146 |
147 | Args:
148 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #.
149 | table_name (str): 存储的位置(数据表名)
150 |         asynchrony (bool, optional): 是否使用异步方式下载. Defaults to True.
151 |         kind (Literal["综合", "实时", "高级"], optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合".
152 |         advanced_kind (Literal["综合", "热度", "原创"], optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合".
153 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to None.
154 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None.
155 |
156 | Returns:
157 | list: 存储在数据库中的 id 列表
158 | """
159 | downloader = Downloader(search_for=search_for, table_name=table_name, kind=kind, advanced_kind=advanced_kind, time_start=time_start, time_end=time_end)
160 | downloader.download(asynchrony=asynchrony)
161 | return downloader.res_ids
162 |
--------------------------------------------------------------------------------
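A hedged usage sketch for get_list_data; the search term, table name and time window are placeholder values, and the import path mirrors the file layout above.

from datetime import datetime
from WeiBoCrawler.pack.get_list_data import get_list_data

res_ids = get_list_data(
    "#某个话题#",                      # topics need the surrounding #
    table_name="demo_list",
    kind="高级",                       # "高级" enables the advanced filters below
    advanced_kind="原创",
    time_start=datetime(2025, 1, 1, 0),
    time_end=datetime(2025, 1, 2, 0),  # the hour is the finest granularity
    asynchrony=True,
)
print(len(res_ids), "records written to the database")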
/WeiBoCrawler/parse/__init__.py:
--------------------------------------------------------------------------------
1 | from .process_list import process_list_documents
2 | from .process_comment import process_comment_documents, process_comment_resp
3 | from .process_body import process_body_documents, process_body_resp
4 | from .parse_list_html import parse_list_html
5 |
6 | __all__ = [
7 | "process_list_documents",
8 | "process_comment_documents",
9 | "process_body_documents",
10 |
11 | "parse_list_html",
12 |
13 | "process_body_resp",
14 | "process_comment_resp"
15 | ]
--------------------------------------------------------------------------------
/WeiBoCrawler/parse/parse_list_html.py:
--------------------------------------------------------------------------------
1 | import re
2 | import parsel
3 | from typing import Optional, List
4 | from ..util import custom_validate_call, process_time_str
5 |
6 |
7 | @custom_validate_call
8 | def get_mid(select: parsel.Selector) -> Optional[str]:
9 | """获取微博的mid
10 |
11 | Args:
12 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
13 |
14 | Returns:
15 | Optional[str]: 微博的mid
16 | """
17 | mid = select.xpath("//div[@mid]/@mid").get()
18 | return mid
19 |
20 |
21 | @custom_validate_call
22 | def get_uid(select: parsel.Selector) -> Optional[str]:
23 | """获取微博的uid
24 |
25 | Args:
26 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
27 |
28 | Returns:
29 | Optional[str]: 微博的uid
30 | """
31 | uid = select.xpath("//a[@nick-name]/@href").get()
32 | if uid is None:
33 | return None
34 | else:
35 | uid = re.search(r"/(\d+)/?", uid).group(1)
36 | return uid
37 |
38 | @custom_validate_call
39 | def get_mblogid(select: parsel.Selector) -> Optional[str]:
40 | """获取微博的mblogid
41 |
42 | Args:
43 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
44 |
45 | Returns:
46 | Optional[str]: 微博的mblogid
47 | """
48 | mblogid = select.xpath('//div[@class="from"]/a[1]/@href').get()
49 | if mblogid is None:
50 | return None
51 | else:
52 | mblogid = re.search(r"/(\w+)\?", mblogid).group(1)
53 | return mblogid
54 |
55 |
56 | @custom_validate_call
57 | def get_personal_name(select: parsel.Selector) -> Optional[str]:
58 | """获取微博的个人名称
59 |
60 | Args:
61 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
62 |
63 | Returns:
64 | Optional[str]: 微博的个人名称
65 | """
66 | personal_name = select.xpath("//a[@nick-name]/@nick-name").get()
67 | return personal_name
68 |
69 | @custom_validate_call
70 | def get_personal_href(select: parsel.Selector) -> Optional[str]:
71 | """获取微博的个人主页
72 |
73 | Args:
74 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
75 |
76 | Returns:
77 | Optional[str]: 个人主页的 URL
78 | """
79 | personal_href = select.xpath("//a[@nick-name]/@href").get()
80 | if personal_href is None:
81 | return None
82 | else:
83 | return "https:" + personal_href
84 |
85 |
86 | def get_weibo_href(select: parsel.Selector) -> Optional[str]:
87 | """获取微博的链接
88 |
89 | Args:
90 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
91 |
92 | Returns:
93 | Optional[str]: 微博的链接
94 | """
95 | weibo_href = select.xpath('//div[@class="from"]/a[1]/@href').get()
96 | if weibo_href is None:
97 | return None
98 | else:
99 | return "https:" + weibo_href
100 |
101 |
102 | @custom_validate_call
103 | def get_publish_time(select: parsel.Selector) -> Optional[str]:
104 | """获取微博的发布时间
105 |
106 | Args:
107 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
108 |
109 | Returns:
110 |         Optional[str]: 微博的发布时间(格式化后的时间字符串)
111 | """
112 | publish_time_str = select.xpath('//div[@class="from"]/a[1]/text()').get()
113 | if publish_time_str is None:
114 | return publish_time_str
115 | else:
116 | publish_time = process_time_str(publish_time_str).strftime("%Y-%m-%d %H:%M:%S")
117 | return publish_time
118 |
119 | @custom_validate_call
120 | def get_content_from(select:parsel.Selector) -> Optional[str]:
121 | """获取微博的发送设备
122 |
123 | Args:
124 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
125 |
126 | Returns:
127 | Optional[str]: 微博的发送设备
128 | """
129 | content_from = select.xpath('//div[@class="from"]/a[2]/text()').get()
130 | return content_from
131 |
132 | @custom_validate_call
133 | def get_content_all(select:parsel.Selector) -> Optional[str]:
134 | """获取微博的内容
135 |
136 | Args:
137 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
138 |
139 | Returns:
140 | Optional[str]: 微博的内容
141 | """
142 | content_all = select.xpath('string(//p[@node-type="feed_list_content_full"])').get()
143 | content_all = re.sub(r"\n[ \t]+", "\n", content_all)
144 | content_all = re.sub(r"(? Optional[int]:
159 | """获取微博的转发数量
160 |
161 | Args:
162 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
163 |
164 | Returns:
165 | Optional[int]: 微博的转发数量
166 | """
167 | retweet_num = select.xpath('string(//div[@class="card-act"]/ul[1]/li[1])').get()
168 | if retweet_num:
169 | retweet_num = re.findall(r"\d+", retweet_num)
170 | return int(retweet_num[0]) if retweet_num else 0
171 | else:
172 | return None
173 |
174 |
175 | @custom_validate_call
176 | def get_comment_num(select:parsel.Selector) -> Optional[int]:
177 | """获取微博的评论数量
178 |
179 | Args:
180 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
181 |
182 | Returns:
183 | Optional[int]: 微博的评论数量
184 | """
185 | comment_num = select.xpath('string(//div[@class="card-act"]/ul[1]/li[2])').get()
186 | if comment_num:
187 | comment_num = re.findall(r"\d+", comment_num)
188 | return int(comment_num[0]) if comment_num else 0
189 | else:
190 | return None
191 |
192 | @custom_validate_call
193 | def get_star_num(select: parsel.Selector) -> Optional[int]:
194 | """获取微博的点赞数量
195 |
196 | Args:
197 | select (parsel.Selector): 经过 parsel 解析 html 后得到的 Selector 对象
198 |
199 | Returns:
200 | Optional[int]: 微博的点赞数量
201 | """
202 | star_num = select.xpath('string(//div[@class="card-act"]/ul[1]/li[3])').get()
203 | if star_num:
204 | star_num = re.findall(r"\d+", star_num)
205 | return int(star_num[0]) if star_num else 0
206 | else:
207 | return None
208 |
209 |
210 | def parse_list_html(html: str) -> List[dict]:
211 | """解析微博列表主体的html
212 |
213 | Args:
214 | html (str): 爬虫获取到的 html 文本
215 |
216 | Returns:
217 | List[dict]: 整理后的 List[dict]
218 | """
219 | select = parsel.Selector(html)
220 | check_div_mpage = select.css("div.m-page").get()
221 | if check_div_mpage is None:
222 | return []
223 | else:
224 | div_list = select.xpath('//*[@id="pl_feedlist_index"]//div[@action-type="feed_list_item"]').getall()
225 | lst = []
226 | for div_string in div_list:
227 | select = parsel.Selector(div_string)
228 | item = {
229 | "mid": get_mid(select),
230 | "uid": get_uid(select),
231 | "mblogid": get_mblogid(select),
232 | "personal_name": get_personal_name(select),
233 | "personal_href": get_personal_href(select),
234 | "weibo_href": get_weibo_href(select),
235 | "publish_time": get_publish_time(select),
236 | "content_from": get_content_from(select),
237 | "content_all": get_content_all(select),
238 | "retweet_num": get_retweet_num(select),
239 | "comment_num": get_comment_num(select),
240 | "star_num": get_star_num(select),
241 | }
242 | lst.append(item)
243 | return lst
--------------------------------------------------------------------------------
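To make the extractors above concrete, here is a toy fragment run through parse_list_html. The markup is a hand-written stand-in that only carries the attributes the XPath expressions look for, not a real search-result page.

from WeiBoCrawler.parse.parse_list_html import parse_list_html

html = """
<div id="pl_feedlist_index">
  <div action-type="feed_list_item" mid="5127059131334865">
    <a nick-name="某用户" href="//weibo.com/1644114654"></a>
    <div class="from"><a href="//weibo.com/1644114654/OiZre8dir?refer_flag=0">01月01日 12:30</a></div>
  </div>
</div>
<div class="m-page"></div>
"""

items = parse_list_html(html)
print(items[0]["mid"])       # 5127059131334865
print(items[0]["uid"])       # 1644114654
print(items[0]["mblogid"])   # OiZre8dir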
/WeiBoCrawler/parse/process_body.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from ..util import process_base_document, process_base_documents
3 |
4 | def process_body_resp(resp):
5 | """处理详细页数据
6 |
7 | 这里一般都会收到正常的响应,所以只需要处理数据即可.
8 | Args:
9 | resp (httpx.Response): 接受到的响应.
10 |
11 | Returns:
12 | list[dict]: 响应的数据, 这里使用 list 包装一下(对齐其他的process请求).
13 | """
14 | data = resp.json()
15 | transform_dict = {
16 | "mid": "mid",
17 | "uid": ["user", "idstr"],
18 | }
19 | data.update(process_base_document(data, transform_dict))
20 | return [data]
21 |
22 |
23 | def process_body_documents(documents: list[dict]) -> pd.DataFrame:
24 | """将 documents 处理成 dataframe 的形式
25 |
26 | transform_dict = {
27 | "转发数量": "retweet_num",
28 | "评论数量": "comment_num",
29 | "点赞数量": "star_num",
30 | ...
31 | }
32 |
33 | Args:
34 | documents (list[dict]): 文档列表
35 | transform_dict (dict): 转换字典, key 是转化后的字段, value 是原始字段
36 |
37 | Returns:
38 | pd.DataFrame: (去重)处理后得到的表格
39 | """
40 | transform_dict = {
41 | "mid": "mid",
42 | "uid": ["user", "idstr"],
43 | "mblogid": "mblogid",
44 | "个人昵称": ["user", "screen_name"],
45 |
46 | "用户性别": ["longText", "user", "gender"],
47 |
48 | "用户定位": ["longText","user", "location"],
49 | "用户粉丝": ["longText","user", "followers_count"],
50 | "用户累计评论数": ["user", "status_total_counter", "comment_cnt"],
51 | "用户累计转发数": ["user", "status_total_counter", "repost_cnt"],
52 | "用户累计点赞数": ["user", "status_total_counter", "like_cnt"],
53 | "用户累计评转赞": ["user", "status_total_counter", "total_cnt"],
54 | "发布时间": "created_at",
55 | "原生内容": "text",
56 | "展示内容": "text_raw",
57 |
58 | "转发数量": "reposts_count",
59 | "评论数量": "comments_count",
60 | "点赞数量": "attitudes_count",
61 | }
62 | df = process_base_documents(documents, transform_dict)
63 | return df
--------------------------------------------------------------------------------
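A small illustration of the mapping above. The document is a trimmed, hand-made stand-in for an /ajax/statuses/show payload, so only a few of the listed fields are present and the missing ones come back as empty columns.

from WeiBoCrawler.parse.process_body import process_body_documents

doc = {
    "mid": "5127059131334865",
    "mblogid": "OiZre8dir",
    "created_at": "Wed Jan 01 12:30:00 +0800 2025",
    "text_raw": "示例内容",
    "reposts_count": 3,
    "comments_count": 5,
    "attitudes_count": 7,
    "user": {"idstr": "1644114654", "screen_name": "某用户"},
}

df = process_body_documents([doc])
print(df[["mid", "uid", "个人昵称", "转发数量", "评论数量", "点赞数量"]])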
/WeiBoCrawler/parse/process_comment.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 | import httpx
3 | from pydantic import BaseModel
4 | import pandas as pd
5 | from ..util import process_base_documents, process_base_document
6 |
7 | class CommmentResponseInfo(BaseModel):
8 | max_id: str
9 | total_number: int
10 | data_number: int
11 |
12 |
13 |
14 | def process_comment_resp(resp: httpx.Response) -> Tuple[CommmentResponseInfo, list]:
15 | """处理评论数据
16 |
17 | 这里有三种方式判断 resp 是否正常:
18 | 1. 正常响应头中会有 content-encoding:gzip, 而不正常的响应头中相应位置为 content-length: 117(或者其他)
19 | 2. 正常响应中会有 filter_group 字段, 不正常响应中没有该字段,
20 | 3. 无论正常还是非正常响应中都有 data 字段, 正常响应 data 字段内容为 [dict], 非正常响应 data 字段内容为 []
21 |
22 | 目前使用第三种方法.
23 |
24 | Args:
25 | resp (httpx.Response): 接受到的响应.
26 |
27 | Returns:
28 |         Tuple[CommmentResponseInfo, list]: 前面是 请求的信息(后面要用到), 后面是数据
29 | """
30 | data = resp.json()
31 | max_id = data.get("max_id", "")
32 | total_number = data.get("total_number", 0)
33 | data_number = len(data.get("data", []))
34 |
35 | data_list = data["data"]
36 |
37 | transform_dict = {
38 | "mid": "mid",
39 | "uid": ["user", "idstr"],
40 | }
41 |
42 |     [doc.update(process_base_document(doc, transform_dict)) for doc in data_list]
43 |
44 | resp_info = CommmentResponseInfo(max_id=str(max_id), total_number=int(total_number), data_number=data_number)
45 | return resp_info, data_list
46 |
47 |
48 |
49 |
50 |
51 | def process_comment_documents(documents: list[dict]) -> pd.DataFrame:
52 | """将表处理成 dataframe 的形式
53 |
54 | transform_dict = {
55 | "转发数量": "retweet_num",
56 | "评论数量": "comment_num",
57 | "点赞数量": "star_num",
58 | ...
59 | }
60 |
61 | Args:
62 |         documents (list[dict]): 文档列表
63 | transform_dict (dict): 转换字典, key 是转化后的字段, value 是原始字段
64 |
65 | Returns:
66 | pd.DataFrame: (去重)处理后得到的表格
67 | """
68 | transform_dict = {
69 | "f_mid": "f_mid",
70 | "f_uid": "f_uid",
71 | "mid": "mid",
72 | "uid": ["user", "id"],
73 | "个人昵称": ["user", "screen_name"],
74 | "用户性别": ["user", "gender"],
75 | "用户定位": ["user", "location"],
76 | "用户粉丝": ["user", "followers_count"],
77 | "用户累计评论数": ["user", "status_total_counter", "comment_cnt"],
78 | "用户累计转发数": ["user", "status_total_counter", "repost_cnt"],
79 | "用户累计点赞数": ["user", "status_total_counter", "like_cnt"],
80 | "用户累计评转赞": ["user", "status_total_counter", "total_cnt"],
81 | "发布时间": "created_at",
82 | "原生内容": "text",
83 | "展示内容": "text_raw",
84 | "评论数量": "total_number",
85 | "点赞数量": "like_counts",
86 | }
87 | df = process_base_documents(documents, transform_dict)
88 | return df
--------------------------------------------------------------------------------
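A quick way to see what process_comment_resp returns is to feed it a hand-built httpx.Response. The payload below is a trimmed stand-in for a buildComments response, not captured data.

import httpx
from WeiBoCrawler.parse.process_comment import process_comment_resp

fake = httpx.Response(200, json={
    "max_id": 139293859600042,
    "total_number": 2,
    "data": [
        {"mid": "a1", "user": {"idstr": "111"}, "text_raw": "评论一"},
        {"mid": "a2", "user": {"idstr": "222"}, "text_raw": "评论二"},
    ],
})

info, rows = process_comment_resp(fake)
print(info.max_id, info.total_number, info.data_number)  # 139293859600042 2 2
print(rows[0]["uid"])                                    # 111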
/WeiBoCrawler/parse/process_list.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from ..util import process_base_documents
3 |
4 | def process_list_documents(documents: list[dict]) -> pd.DataFrame:
5 | """将 documents 处理成 dataframe 的形式
6 |
7 | transform_dict = {
8 | "转发数量": "retweet_num",
9 | "评论数量": "comment_num",
10 | "点赞数量": "star_num",
11 | ...
12 | }
13 |
14 | Args:
15 | documents (list[dict]): 文档列表
16 | transform_dict (dict): 转换字典, key 是转化后的字段, value 是原始字段
17 |
18 | Returns:
19 | pd.DataFrame: (去重)处理后得到的表格
20 | """
21 | transform_dict = {
22 | "mid": "mid",
23 | "uid": "uid",
24 | "mblogid": "mblogid",
25 | "个人昵称": "personal_name",
26 | "个人主页": "personal_href",
27 | "微博链接": "weibo_href",
28 | "发布时间": "publish_time",
29 | "内容来自": "content_from",
30 | "全部内容": "content_all",
31 | "转发数量": "retweet_num",
32 | "评论数量": "comment_num",
33 | "点赞数量": "star_num",
34 | }
35 | df = process_base_documents(documents, transform_dict)
36 | return df
--------------------------------------------------------------------------------
/WeiBoCrawler/request/__init__.py:
--------------------------------------------------------------------------------
1 | from .get_list_request import get_list_response, get_list_response_asyncio
2 | from .get_body_request import get_body_response, get_body_response_asyncio
3 | from .get_comment_request import get_comments_l1_response, get_comments_l2_response, get_comments_l1_response_asyncio, get_comments_l2_response_asyncio
4 | from .get_cookies import get_qr_Info, get_qr_status
5 |
6 | __all__ = [
7 | "get_list_response",
8 | "get_body_response",
9 | "get_comments_l1_response",
10 | "get_comments_l2_response",
11 |
12 | "get_list_response_asyncio",
13 | "get_body_response_asyncio",
14 | "get_comments_l1_response_asyncio",
15 | "get_comments_l2_response_asyncio",
16 |
17 | "get_qr_Info",
18 | "get_qr_status",
19 | ]
--------------------------------------------------------------------------------
/WeiBoCrawler/request/get_body_request.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from .util import request_headers
3 |
4 |
5 | def build_body_params(id: str) -> tuple:
6 | """构建微博详细页参数
7 | 微博详细页id位置(https://weibo.com/{userid}/{id}?{params}):
8 | 1. 找到需要爬取的微博内容页, 比如:
9 | https://weibo.com/1644114654/OiZre8dir?refer_flag=1001030103_ -> id = OiZre8dir
10 |
11 | Args:
12 | id (str): 微博详细页id.
13 |
14 | Returns:
15 | tuple: (url, params, headers).
16 | """
17 | headers = request_headers.body_headers
18 | url = "https://weibo.com/ajax/statuses/show"
19 | params = {
20 | "id": f"{id}",
21 | "locale": "zh-CN",
22 | "isGetLongText": "true"
23 | }
24 | return url, params, headers
25 |
26 |
27 | def get_body_response(id: str, *, client: httpx.Client) -> httpx.Response:
28 | """获取微博详细页的请求结果
29 | 微博详细页id位置(https://weibo.com/{userid}/{id}?{params}):
30 | 1. 找到需要爬取的微博内容页, 比如:
31 | https://weibo.com/1644114654/OiZre8dir?refer_flag=1001030103_ -> id = OiZre8dir
32 |
33 | Args:
34 | id (str): 微博详细页id.
35 | client (httpx.Client): 客户端.
36 |
37 | Returns:
38 | httpx.Response: 返回的请求结果.
39 | """
40 | url, params, headers = build_body_params(id)
41 | response = client.get(url, params=params, headers=headers)
42 | return response
43 |
44 |
45 | async def get_body_response_asyncio(id:str, *, client: httpx.AsyncClient) -> httpx.Response:
46 | """获取微博详细页的请求结果(异步)
47 | 微博详细页id位置(https://weibo.com/{userid}/{id}?{params}):
48 | 1. 找到需要爬取的微博内容页, 比如:
49 | https://weibo.com/1644114654/OiZre8dir?refer_flag=1001030103_ -> id = OiZre8dir
50 |
51 | Args:
52 | id (str): 微博详细页id.
53 | client (httpx.AsyncClient): 异步客户端.
54 |
55 | Returns:
56 | httpx.Response: 返回的请求结果.
57 | """
58 | url, params, headers = build_body_params(id)
59 | response = await client.get(url, params=params, headers=headers)
60 | return response
--------------------------------------------------------------------------------
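Sketch of calling get_body_response directly. A real request only succeeds with valid logged-in cookies (see get_cookies.py below), so the cookie dict here is a placeholder.

import httpx
from WeiBoCrawler.request.get_body_request import get_body_response

cookies = {"SUB": "..."}  # hypothetical value taken from the QR-code login flow
with httpx.Client(cookies=cookies, timeout=10) as client:
    resp = get_body_response("OiZre8dir", client=client)
    print(resp.status_code)
    if resp.status_code == 200:
        print(resp.json().get("mblogid"), resp.json().get("reposts_count"))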
/WeiBoCrawler/request/get_comment_request.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from .util import request_headers
3 | from typing import Optional
4 |
5 | def build_comments_l1_params(uid: str, mid : str, *, max_id: Optional[str]=None) -> tuple:
6 | """构建微博主体一级评论的参数
7 |
8 | Args:
9 | uid (str): 微博的uid
10 | mid (str): 微博的mid
11 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None.
12 |
13 | Returns:
14 | tuple: (url, params, headers).
15 | """
16 | url = "https://weibo.com/ajax/statuses/buildComments"
17 | headers = request_headers.comment1_buildComments_headers
18 |
19 | params = {
20 | "is_reload": "1",
21 | "id": f"{mid}",
22 | "is_show_bulletin": "2",
23 | "is_mix": "0",
24 | "count": "20",
25 | "uid": f"{uid}",
26 | "fetch_level": "0",
27 | "locale": "zh-CN",
28 | }
29 | if max_id is not None:
30 | params["flow"] = "0"
31 | params["max_id"] = max_id
32 |
33 | return url, params, headers
34 |
35 |
36 | def build_comments_l2_params(uid: str, mid : str, *, max_id: Optional[str]=None) -> tuple:
37 | """构建微博主体二级评论的参数
38 |
39 | Args:
40 | uid (str): 微博的uid
41 | mid (str): 微博的mid
42 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None.
43 |
44 | Returns:
45 | tuple: (url, params, headers).
46 | """
47 | url = "https://weibo.com/ajax/statuses/buildComments"
48 | headers = request_headers.comment2_buildComments_headers
49 |
50 | params = {
51 | "flow": "0", # 0 表示按热度, 1 表示按时间
52 | "is_reload": "1",
53 | "id": f"{mid}",
54 | "is_show_bulletin": "2",
55 | "is_mix": "1",
56 | "fetch_level": "1",
57 | "count": "20",
58 | "uid": f"{uid}",
59 | "locale": "zh-CN"
60 | }
61 |
62 | if max_id is not None:
63 | params["max_id"] = max_id
64 | else:
65 | params["max_id"] = "0"
66 |
67 | return url, params, headers
68 |
69 |
70 |
71 | def get_comments_l1_response(uid: str, mid : str, *, client: httpx.Client, max_id: Optional[str]=None) -> httpx.Response:
72 | """获取微博主体的一级评论
73 |
74 | Args:
75 | uid (str): 微博的uid
76 | mid (str): 微博的mid
77 | client (httpx.Client): 需要的client
78 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None.
79 |
80 | Returns:
81 | httpx.Response: 评论的响应
82 | """
83 | url, params, headers = build_comments_l1_params(uid, mid, max_id=max_id)
84 | response = client.get(url, params=params, headers=headers)
85 | return response
86 |
87 |
88 | async def get_comments_l1_response_asyncio(uid: str, mid : str, *, client: httpx.AsyncClient, max_id: Optional[str]=None) -> httpx.Response:
89 | """获取微博主体的一级评论(异步)
90 |
91 | Args:
92 | uid (str): 微博的uid
93 | mid (str): 微博的mid
94 | client (httpx.AsyncClient): 需要的client
95 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None.
96 |
97 | Returns:
98 | httpx.Response: 评论的响应
99 | """
100 | url, params, headers = build_comments_l1_params(uid, mid, max_id=max_id)
101 | response = await client.get(url, params=params, headers=headers)
102 | return response
103 |
104 |
105 | def get_comments_l2_response(uid: str, mid : str, *, client: httpx.Client, max_id: Optional[str]=None):
106 | """获取微博主体的二级评论
107 |
108 | Args:
109 | uid (str): 微博的uid
110 | mid (str): 微博的mid
111 | client (httpx.Client): 需要的client
112 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None.
113 |
114 | Returns:
115 | httpx.Response: 评论的响应
116 | """
117 | url, params, headers = build_comments_l2_params(uid, mid, max_id=max_id)
118 | response = client.get(url, params=params, headers=headers)
119 | return response
120 |
121 |
122 | async def get_comments_l2_response_asyncio(uid: str, mid : str, *, client: httpx.AsyncClient, max_id: Optional[str]=None):
123 | """获取微博主体的二级评论(异步)
124 |
125 | Args:
126 | uid (str): 微博的uid
127 | mid (str): 微博的mid
128 | client (httpx.AsyncClient): 需要的client
129 | max_id (str, optional): 是否是第一次请求该微博主体的评论, 如果是第一次, max_id 设置为 None; 否则设置为 max_id. Defaults to None.
130 |
131 | Returns:
132 | httpx.Response: 评论的响应
133 | """
134 | url, params, headers = build_comments_l2_params(uid, mid, max_id=max_id)
135 | response = await client.get(url, params=params, headers=headers)
136 | return response
--------------------------------------------------------------------------------
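The max_id handshake that the pack downloaders automate can also be driven by hand. A rough sketch, again with placeholder cookies and ids, that stops as soon as a page comes back empty.

import httpx
from WeiBoCrawler.request.get_comment_request import get_comments_l1_response

cookies = {"SUB": "..."}                       # hypothetical login cookies
uid, mid = "1644114654", "5127059131334865"    # placeholder post identifiers

with httpx.Client(cookies=cookies, timeout=10) as client:
    max_id = None
    for _ in range(3):                         # fetch at most three pages
        resp = get_comments_l1_response(uid, mid, client=client, max_id=max_id)
        payload = resp.json()
        page = payload.get("data", [])
        if not page:                           # empty data means an abnormal or exhausted response
            break
        print(len(page), "comments on this page")
        max_id = str(payload.get("max_id", ""))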
/WeiBoCrawler/request/get_cookies.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from .util import request_headers
3 | from PIL import Image
4 | from io import BytesIO
5 | import time
6 | from ..util import logging
7 |
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 | def get_login_signin_response(client:httpx.Client) -> httpx.Response:
12 | """主要是获取 cookies 中的 X-CSRF-TOKEN 字段
13 |
14 | Args:
15 | client (httpx.Client): 会话客户端
16 |
17 | Returns:
18 | httpx.Response: 目的是获取响应的 url
19 | """
20 | headers = request_headers.login_signin_headers
21 |
22 | url = "https://passport.weibo.com/sso/signin"
23 | params = {
24 | "entry": "miniblog",
25 | "source": "miniblog",
26 | "disp": "popup",
27 | "url": "https://weibo.com/newlogin?tabtype=weibo&gid=102803&openLoginLayer=0&url=https%3A%2F%2Fweibo.com%2F",
28 | "from": "weibopro"
29 | }
30 |
31 | response = client.get(url, params=params, headers=headers)
32 | response.raise_for_status()
33 | return response
34 |
35 |
36 | def get_login_qrcode_response(client:httpx.Client, login_signin_url:str) -> httpx.Response:
37 | """主要是获取二维码的 id 以及 二维码的 url 路径
38 |
39 | Args:
40 | client (httpx.Client): 会话客户端
41 | login_signin_url (str): signin 请求的url 主要是需要设置 referer 字段
42 |
43 | Returns:
44 | httpx.Response: 主要是获取 qrid 字段 和 二维码的 url
45 | """
46 | headers = request_headers.login_qrcode_headers
47 | headers["referer"] = login_signin_url
48 | headers["x-csrf-token"] = client.cookies.get("X-CSRF-TOKEN")
49 |
50 | url = "https://passport.weibo.com/sso/v2/qrcode/image"
51 | params = {
52 | "entry": "miniblog",
53 | "size": "180"
54 | }
55 | response = client.get(url, params=params, headers=headers)
56 | response.raise_for_status()
57 | return response
58 |
59 |
60 | def get_login_check_response(client:httpx.Client, login_signin_url:str, qrid:str) -> httpx.Response:
61 | """检查二维码状态:未使用,已扫描未确认,已确认,已过期
62 |
63 | Args:
64 | client (httpx.Client): 会话客户端
65 | login_signin_url (str): signin 请求的url 主要是需要设置 referer 字段
66 | qrid (str): 二维码的 id
67 |
68 | Returns:
69 | httpx.Response: 检查二维码状态
70 | """
71 | headers = request_headers.login_final_headers
72 | headers["referer"] = login_signin_url
73 | headers["x-csrf-token"] = client.cookies["X-CSRF-TOKEN"]
74 |
75 | url = "https://passport.weibo.com/sso/v2/qrcode/check"
76 | params = {
77 | "entry": "miniblog",
78 | "source": "miniblog",
79 | "url": "https://weibo.com/newlogin?tabtype=weibo&gid=102803&openLoginLayer=0&url=https%3A%2F%2Fweibo.com%2F",
80 | "qrid": qrid,
81 | "disp": "popup"
82 | }
83 | response = client.get(url, headers=headers, params=params)
84 | response.raise_for_status()
85 | return response
86 |
87 |
88 |
89 | def get_login_final_response(client:httpx.Client, login_url:str) -> httpx.Response:
90 | """最终的登录请求
91 |
92 | Args:
93 | client (httpx.Client): 会话客户端
94 | login_url (str): 最终的登入 url
95 |
96 | 1. 在这里由于是重定向请求,所有在 client 中最好设置 follow_redirects=True.
97 | 2. 最终的 response 不知道为啥一直是 403 请求,但是 cookies 是成功获取得到了的.
98 |
99 | Returns:
100 | httpx.Response: 没啥用
101 | """
102 | response = client.get(login_url)
103 | # response.raise_for_status()
104 | return response
105 |
106 |
107 | def download_image(url:str, show:bool=False):
108 | """下载并打开图片用来扫描
109 |
110 | Args:
111 | url (str): 二维码图片地址
112 | show (bool, optional): 是否显示图片. Defaults to False.
113 | """
114 | try:
115 | response = httpx.get(url)
116 | response.raise_for_status()
117 | image_content = BytesIO(response.content)
118 | image = Image.open(image_content)
119 |
120 | if show:
121 | image.show()
122 |
123 | return image
124 |
125 | except httpx.RequestError as e:
126 | print(f"请求发生错误: {e}")
127 | except Exception as e:
128 | print(f"发生其他错误: {e}")
129 |
130 |
131 | def get_qr_status(client:httpx.Client, login_signin_url:str, qrid:str) -> dict | None:
132 | """获取二维码的状态
133 |
134 | Args:
135 | client (httpx.Client): 会话客户端
136 | login_signin_url (str): 登入验证 url
137 | qrid (str): qr 的 id
138 |
139 | Returns:
140 | dict | None: 返回 cookies 或者 None
141 | """
142 | while True:
143 | login_check_response = get_login_check_response(client, login_signin_url=login_signin_url, qrid=qrid)
144 | login_check_response.encoding = "utf-8"
145 | login_check_json_data = login_check_response.json()
146 |
147 | retcode = login_check_json_data.get("retcode")
148 | if retcode in [20000000, 50114001, 50114002]:
149 | if login_check_json_data.get("retcode") == 20000000:
150 | login_url = login_check_json_data.get("data").get("url")
151 | # 这里的 response 是一个重定向的响应, 其最终结果状态是 403 但是好像在重定向的过程中会设置一些 cookie 信息
152 | get_login_final_response(client, login_url=login_url)
153 | return dict(client.cookies)
154 | else:
155 |                 logger.info(f"二维码状态码: {login_check_json_data.get('retcode')}, 状态信息: {login_check_json_data.get('msg')}")
156 | else:
157 | return None
158 |
159 | time.sleep(1)
160 |
161 |
162 |
163 | def get_qr_Info() -> tuple[Image.Image, httpx.Client, str, str]:
164 |     """获取登录二维码信息, 是获取 cookies 的第一步
165 | 
166 |     Returns:
167 |         tuple[Image.Image, httpx.Client, str, str]: 返回图片,会话客户端,登入验证 url,qr 的 id
168 | """
169 | client = httpx.Client(follow_redirects=True)
170 |
171 | login_signin_response = get_login_signin_response(client)
172 | login_signin_url = str(login_signin_response.url)
173 |
174 | login_qrcode_response = get_login_qrcode_response(client, login_signin_url=login_signin_url)
175 | qrcode_json_data = login_qrcode_response.json().get("data")
176 |
177 | qrid = qrcode_json_data.get("qrid")
178 | image_path = qrcode_json_data.get("image")
179 | image = download_image(image_path)
180 | return image, client, login_signin_url, qrid
--------------------------------------------------------------------------------
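Putting the helpers above together, the QR-code login looks roughly like this; get_qr_status blocks and polls once per second until the scan is confirmed or the code expires.

from WeiBoCrawler.request import get_qr_Info, get_qr_status

image, client, login_signin_url, qrid = get_qr_Info()
if image is not None:
    image.show()                               # scan this with the Weibo app

cookies = get_qr_status(client, login_signin_url, qrid)
if cookies:
    print("login ok,", len(cookies), "cookies obtained")
else:
    print("QR code expired or was rejected")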
/WeiBoCrawler/request/get_list_request.py:
--------------------------------------------------------------------------------
1 | import httpx
2 | from copy import deepcopy
3 | from typing import Literal, Optional
4 | from datetime import datetime
5 | from .util import request_headers
6 |
7 |
8 | def build_list_params(search_for: str, page_index: int, *, kind : Literal["综合", "实时", "高级"] = "综合",
9 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end: Optional[datetime]=None) -> tuple:
10 | """构建列表页参数
11 |
12 | Args:
13 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #.
14 | page_index (int): 页码.
15 |         kind (Literal["综合", "实时", "高级"], optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合".
16 |         advanced_kind (Literal["综合", "热度", "原创"], optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合".
17 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to None.
18 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None.
19 |
20 | Returns:
21 |         tuple: (url, headers).
22 | """
23 | url_with_params_dic = {
24 | "综合":{
25 | "url" : "https://s.weibo.com/weibo",
26 | "params": {"q": search_for, "Refer": "weibo_weibo", "page": page_index},
27 | },
28 | "实时":{
29 | "url" : "https://s.weibo.com/realtime",
30 | "params": {"q": search_for, "rd": "realtime", "tw": "realtime", "Refer": "weibo_realtime", "page": page_index},
31 | },
32 | "高级":{
33 | "url" : "https://s.weibo.com/weibo",
34 | "params": {"q": search_for, "suball": "1", "Refer": "g", "page": page_index},
35 | },
36 | }
37 |
38 | url_with_params = url_with_params_dic[kind]
39 | if kind == "高级":
40 | if advanced_kind == "综合":
41 | url_with_params["params"]["typeall"] = "1"
42 | if advanced_kind == "热度":
43 | url_with_params["params"]["xsort"] = "hot"
44 | if advanced_kind == "原创":
45 | url_with_params["params"]["scope"] = "ori"
46 |
47 | time_start = time_start.strftime("%Y-%m-%d-%H") if time_start else ""
48 | time_end = time_end.strftime("%Y-%m-%d-%H") if time_end else ""
49 |
50 | url_with_params["params"]["timescope"] = f"custom:{time_start}:{time_end}"
51 |
52 | headers = request_headers.body_headers
53 |
54 | if url_with_params["params"]["page"] > 1:
55 | referer_url_with_params = deepcopy(url_with_params)
56 | referer_url_with_params["params"]["page"] = url_with_params["params"]["page"] - 1
57 | headers["referer"] = str(httpx.URL(url_with_params["url"], params=referer_url_with_params["params"]))
58 |
59 | url = httpx.URL(url=url_with_params["url"], params=url_with_params["params"])
60 | return url, headers
61 |
62 |
63 | def get_list_response(search_for: str, page_index: int, *, client: httpx.Client, kind : Literal["综合", "实时", "高级"] = "综合",
64 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end: Optional[datetime]=None) -> httpx.Response:
65 | """获取列表页响应
66 |
67 | Args:
68 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #.
69 | page_index (int): 页码.
70 | client (httpx.Client): 客户端.
71 |         kind (Literal["综合", "实时", "高级"], optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合".
72 |         advanced_kind (Literal["综合", "热度", "原创"], optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合".
73 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to None.
74 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None.
75 |
76 | Returns:
77 | httpx.Response: 返回列表页响应
78 | """
79 | url, headers = build_list_params(search_for, page_index, kind=kind, advanced_kind=advanced_kind, time_start=time_start, time_end=time_end)
80 | response = client.get(url, headers=headers)
81 | return response
82 |
83 |
84 | async def get_list_response_asyncio(search_for: str, page_index: int, *, client: httpx.AsyncClient, kind : Literal["综合", "实时", "高级"] = "综合",
85 | advanced_kind: Literal["综合", "热度", "原创"] = "综合", time_start: Optional[datetime] = None, time_end: Optional[datetime] = None) -> httpx.Response:
86 | """获取列表页响应(异步)
87 |
88 | Args:
89 | search_for (str): 需要搜索的内容,如果是话题,需要在 search_for 前后都加上 #.
90 | page_index (int): 页码.
91 | client (httpx.AsyncClient): 异步客户端.
92 |         kind (Literal["综合", "实时", "高级"], optional): 搜索类型可以是 综合,实时,高级(添加了综合,热度,原创筛选以及时间). Defaults to "综合".
93 |         advanced_kind (Literal["综合", "热度", "原创"], optional): 筛选条件,可以是综合,热度,原创. Defaults to "综合".
94 | time_start (Optional[datetime], optional): 起始时间,最大颗粒度为小时. Defaults to None.
95 | time_end (Optional[datetime], optional): 结束时间,最大颗粒度为小时. Defaults to None.
96 |
97 | Returns:
98 | httpx.Response: 返回列表页响应
99 | """
100 | url, headers = build_list_params(search_for, page_index, kind=kind, advanced_kind=advanced_kind, time_start=time_start, time_end=time_end)
101 | response = await client.get(url, headers=headers)
102 | return response
103 |
--------------------------------------------------------------------------------
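For reference, this is roughly what build_list_params produces for an advanced search with a time window; for pages past the first, the headers also pick up a referer pointing at the previous page. The search term is a placeholder.

from datetime import datetime
from WeiBoCrawler.request.get_list_request import build_list_params

url, headers = build_list_params(
    "#某个话题#", 2,
    kind="高级", advanced_kind="热度",
    time_start=datetime(2025, 1, 1, 0), time_end=datetime(2025, 1, 2, 0),
)
print(url)                 # https://s.weibo.com/weibo?q=...&xsort=hot&timescope=custom:2025-01-01-00:2025-01-02-00&page=2...
print(headers["referer"])  # the same query with page=1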
/WeiBoCrawler/request/get_rum_request(unuse).py:
--------------------------------------------------------------------------------
1 | # rum 不需要构建
2 |
3 | import httpx
4 | import json
5 |
6 | def get_rum_level_one_response(buildComments_url):
7 |
8 | headers = {
9 | "accept": "*/*",
10 | "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
11 | "content-type": "multipart/form-data; boundary=----WebKitFormBoundaryvnSjtxxxjv6x1pFT",
12 | "origin": "https://weibo.com",
13 | "priority": "u=1, i",
14 | "referer": "https://weibo.com/2803301701/PblVL5Bg5",
15 | "sec-ch-ua": "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\"",
16 | "sec-ch-ua-mobile": "?0",
17 | "sec-ch-ua-platform": "\"Windows\"",
18 | "sec-fetch-dest": "empty",
19 | "sec-fetch-mode": "cors",
20 | "sec-fetch-site": "same-origin",
21 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0",
22 | "x-xsrf-token": "seBDSEeh70cZTEWGWWkFmxxG"
23 | }
24 |
25 | cookies = {
26 | "SCF": "AnQhEA08TUG9ln2r7R0-cHMvj3KTSZb-85kfIcXTHqooYhjTcn-UkaGS5792LpSqqbJApBlXrIheowZ1k4aYR1Q.",
27 | "SUB": "_2A25Kkj8dDeRhGeFJ4lIT9CzNyj6IHXVp7j7VrDV8PUNbmtAYLVT5kW9NfsmQ4UzJuUOhUQbYBkUvv3HADVVzl9Ig",
28 | "SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9W5Oj.LmOvr7_7fS8d6lYxiZ5JpX5KzhUgL.FoMN1K5EShzpeKz2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMNS0.7eoBEeK2E",
29 | "ALF": "02_1740495949",
30 | "SINAGLOBAL": "970667482772.5692.1737903974414",
31 | "ULV": "1737903974460:1:1:1:970667482772.5692.1737903974414:",
32 | "XSRF-TOKEN": "seBDSEeh70cZTEWGWWkFmxxG",
33 | "WBPSESS": "2bPq4LTfaY-EnTnt8h5hWX9KGoz50scMNqd4lpDCT8IiCLnpv2C9Z_Kk8JVbYkIyBQ0eFNYccRFpnV_A6ntYbwjqG_PAbMAldrAdPPf_XvQiQHrkm_9GFJunwjaIeUwiupJQv3fNpU5K1Xq-CCdaFg=="
34 | }
35 |
36 | entry = {
37 | "name": "https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=5127059131334865&is_show_bulletin=2&is_mix=0&max_id=139293859600042&count=20&uid=2803301701&fetch_level=0&locale=zh-CN",
38 | "entryType": "resource",
39 | "startTime": "327212.7000000002",
40 | "duration": "493.20000000018626",
41 | "initiatorType": "xmlhttprequest",
42 | "deliveryType": "",
43 | "nextHopProtocol": "h2",
44 | "renderBlockingStatus": "non-blocking",
45 | "workerStart": 0,
46 | "redirectStart": 0,
47 | "redirectEnd": 0,
48 | "fetchStart": "327212.7000000002",
49 | "domainLookupStart": "327212.7000000002",
50 | "domainLookupEnd": "327212.7000000002",
51 | "connectStart": "327212.7000000002",
52 | "secureConnectionStart": "327212.7000000002",
53 | "connectEnd": "327212.7000000002",
54 | "requestStart": "327226.7000000002",
55 | "responseStart": "327702.6000000001",
56 | "firstInterimResponseStart": 0,
57 | "responseEnd": "327705.9000000004",
58 | "transferSize": 11971,
59 | "encodedBodySize": 11671,
60 | "decodedBodySize": 72237,
61 | "responseStatus": 200,
62 | "serverTiming": [],
63 | "dns": 0,
64 | "tcp": 0,
65 | "ttfb": "475.89999999990687",
66 | "pathname": "https://weibo.com/ajax/statuses/buildComments",
67 | "speed": 0
68 | }
69 |
70 | files = {
71 | "entry": (None, json.dumps(entry)),
72 | "request_id": (None, ""),
73 | }
74 |
75 | url = "https://weibo.com/ajax/log/rum"
76 | response = httpx.post(url, headers=headers, cookies=cookies, files=files)
77 |
78 | print(response.headers)
79 |
80 | url = "https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=139293862124853&is_show_bulletin=2&is_mix=0&max_id=139568722411765&count=20&uid=2803301701&fetch_level=0&locale=zh-CN"
81 | get_rum_level_one_response(url)
--------------------------------------------------------------------------------
/WeiBoCrawler/request/request.toml:
--------------------------------------------------------------------------------
1 | # 需要配置 referer
2 | [list_headers]
3 | accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
4 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
5 | priority = "u=0, i"
6 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
7 | sec-ch-ua-mobile = "?0"
8 | sec-ch-ua-platform = "\"Windows\""
9 | sec-fetch-dest = "document"
10 | sec-fetch-mode = "navigate"
11 | sec-fetch-site = "none"
12 | sec-fetch-user = "?1"
13 | upgrade-insecure-requests = "1"
14 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
15 |
16 | # 需要配置 referer
17 | [body_headers]
18 | accept = "application/json, text/plain, */*"
19 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
20 | client-version = "v2.47.25"
21 | priority = "u=1, i"
22 | referer = "https://weibo.com/1644114654/OiZre8dir?refer_flag=1001030103_"
23 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
24 | sec-ch-ua-mobile = "?0"
25 | sec-ch-ua-platform = "\"Windows\""
26 | sec-fetch-dest = "empty"
27 | sec-fetch-mode = "cors"
28 | sec-fetch-site = "same-origin"
29 | server-version = "v2025.01.23.1"
30 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
31 | x-requested-with = "XMLHttpRequest"
32 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG"
33 |
34 |
35 | # 需要配置 referer x-xsrf-token
36 | [comment1_buildComments_headers]
37 | accept = "application/json, text/plain, */*"
38 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
39 | client-version = "v2.47.25"
40 | priority = "u=1, i"
41 | referer = "https://weibo.com/1644114654/OiZre8dir"
42 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
43 | sec-ch-ua-mobile = "?0"
44 | sec-ch-ua-platform = "\"Windows\""
45 | sec-fetch-dest = "empty"
46 | sec-fetch-mode = "cors"
47 | sec-fetch-site = "same-origin"
48 | server-version = "v2025.01.23.1"
49 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
50 | x-requested-with = "XMLHttpRequest"
51 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG"
52 |
53 | # 需要配置 referer
54 | [comment2_buildComments_headers]
55 | accept = "application/json, text/plain, */*"
56 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
57 | client-version = "v2.47.25"
58 | priority = "u=1, i"
59 | referer = "https://weibo.com/1644114654/OiZre8dir"
60 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
61 | sec-ch-ua-mobile = "?0"
62 | sec-ch-ua-platform = "\"Windows\""
63 | sec-fetch-dest = "empty"
64 | sec-fetch-mode = "cors"
65 | sec-fetch-site = "same-origin"
66 | server-version = "v2025.01.23.1"
67 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
68 | x-requested-with = "XMLHttpRequest"
69 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG"
70 |
71 |
72 | # 需要配置 referer x-xsrf-token
73 | [comment1_rum_headers]
74 | accept = "*/*"
75 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
76 | content-type = "multipart/form-data; boundary=----WebKitFormBoundaryP8RPAfGDm1mdduKE"
77 | origin = "https://weibo.com"
78 | priority = "u=1, i"
79 | referer = "https://weibo.com/1644114654/OiZre8dir"
80 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
81 | sec-ch-ua-mobile = "?0"
82 | sec-ch-ua-platform = "\"Windows\""
83 | sec-fetch-dest = "empty"
84 | sec-fetch-mode = "cors"
85 | sec-fetch-site = "same-origin"
86 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
87 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG"
88 |
89 |
90 | # 需要配置 referer x-xsrf-token
91 | [comment2_rum_headers]
92 | accept = "*/*"
93 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
94 | content-type = "multipart/form-data; boundary=----WebKitFormBoundary0CRQdyFBn3rj8Xh2"
95 | origin = "https://weibo.com"
96 | priority = "u=1, i"
97 | referer = "https://weibo.com/1644114654/OiZre8dir"
98 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
99 | sec-ch-ua-mobile = "?0"
100 | sec-ch-ua-platform = "\"Windows\""
101 | sec-fetch-dest = "empty"
102 | sec-fetch-mode = "cors"
103 | sec-fetch-site = "same-origin"
104 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
105 | x-xsrf-token = "seBDSEeh70cZTEWGWWkFmxxG"
106 |
107 |
108 | # 需要配置 referer x-xsrf-token
109 | [login_signin_headers]
110 | accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
111 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
112 | cache-control = "max-age=0"
113 | priority = "u=0, i"
114 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
115 | sec-ch-ua-mobile = "?0"
116 | sec-ch-ua-platform = "\"Windows\""
117 | sec-fetch-dest = "document"
118 | sec-fetch-mode = "navigate"
119 | sec-fetch-site = "same-origin"
120 | sec-fetch-user = "?1"
121 | upgrade-insecure-requests = "1"
122 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
123 |
124 |
125 | # 需要配置 referer x-xsrf-token
126 | [login_qrcode_headers]
127 | accept = "application/json, text/plain, */*"
128 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
129 | priority = "u=1, i"
130 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
131 | sec-ch-ua-mobile = "?0"
132 | sec-ch-ua-platform = "\"Windows\""
133 | sec-fetch-dest = "empty"
134 | sec-fetch-mode = "cors"
135 | sec-fetch-site = "same-origin"
136 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
137 | x-requested-with = "XMLHttpRequest"
138 |
139 |
140 | # 需要配置 referer x-xsrf-token
141 | [login_final_headers]
142 | accept = "application/json, text/plain, */*"
143 | accept-language = "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6"
144 | priority = "u=1, i"
145 | sec-ch-ua = "\"Not A(Brand\";v=\"8\", \"Chromium\";v=\"132\", \"Microsoft Edge\";v=\"132\""
146 | sec-ch-ua-mobile = "?0"
147 | sec-ch-ua-platform = "\"Windows\""
148 | sec-fetch-dest = "empty"
149 | sec-fetch-mode = "cors"
150 | sec-fetch-site = "same-origin"
151 | user-agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 Edg/132.0.0.0"
152 | x-requested-with = "XMLHttpRequest"
--------------------------------------------------------------------------------
/WeiBoCrawler/request/util.py:
--------------------------------------------------------------------------------
1 | import toml
2 | from pathlib import Path
3 | from ..util import RequestHeaders
4 |
5 | module_path = Path(__file__).parent
6 |
7 | request_headers = RequestHeaders.model_validate(toml.load(module_path / "./request.toml"))
8 |
9 | __all__ = ["request_headers"]
--------------------------------------------------------------------------------
/WeiBoCrawler/util/__init__.py:
--------------------------------------------------------------------------------
1 | from .path import config_path
2 | from .log import logging
3 | from .database import database_config
4 | from .cookie import cookies_config
5 | from .decorator import log_function_params, retry_timeout_decorator, retry_timeout_decorator_asyncio, custom_validate_call
6 | from .custom import CustomProgress, RequestHeaders
7 | from .process import process_time_str, process_base_document, process_base_documents
8 |
9 | __all__ = [
10 | "logging",
11 |
12 | "config_path",
13 |
14 | "database_config",
15 | "cookies_config",
16 |
17 | "log_function_params",
18 | "retry_timeout_decorator",
19 | "retry_timeout_decorator_asyncio",
20 | "custom_validate_call",
21 |
22 | "CustomProgress",
23 | "RequestHeaders",
24 |
25 | "process_time_str",
26 | "process_base_document",
27 | "process_base_documents",
28 | ]
--------------------------------------------------------------------------------
/WeiBoCrawler/util/cookie.py:
--------------------------------------------------------------------------------
1 | import toml
2 | from pydantic import BaseModel
3 | from .path import config_path
4 |
5 | class CookiesConfig(BaseModel):
6 | """这个类主要用来保存 Cookies
7 |
8 | Attributes:
9 | cookies (dict): 微博的cookies
10 |         cookies_info (dict): cookies 的更新时间等信息
11 | """
12 | cookies: dict
13 | cookies_info: dict
14 |
15 | cookies_config = CookiesConfig.model_validate(toml.load(config_path))
16 |
--------------------------------------------------------------------------------
/WeiBoCrawler/util/custom.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | from rich.progress import (
3 | BarColumn,
4 | MofNCompleteColumn,
5 | Progress,
6 | TextColumn,
7 | TimeElapsedColumn,
8 | )
9 |
10 |
11 |
12 | class CustomProgress:
13 | """自定义进度条
14 |
15 | Attributes:
16 | progress (Progress): 进度条
17 | """
18 | def __init__(self):
19 | self.progress = Progress(
20 | BarColumn(),
21 | MofNCompleteColumn(),
22 | TimeElapsedColumn(),
23 | TextColumn("[progress.description]{task.description}", justify="left"),
24 | )
25 |
26 | def __enter__(self):
27 | self.progress.start()
28 | return self.progress
29 |
30 | def __exit__(self, exc_type, exc_val, exc_tb):
31 | self.progress.stop()
32 |
33 |
34 |
35 | class RequestHeaders(BaseModel):
36 | """这个类主要用来保存一些请求参数的东西
37 |
38 | Attributes:
39 | body_headers (dict): 微博主页的请求头
40 | comment1_buildComments_headers (dict): 评论区buildComments的请求头
41 | comment1_rum_headers (dict): 评论区rum的请求头
42 | ....
43 | """
44 | list_headers: dict
45 | body_headers: dict
46 | comment1_buildComments_headers: dict
47 | comment1_rum_headers: dict
48 | comment2_buildComments_headers: dict
49 | comment2_rum_headers: dict
50 | login_signin_headers:dict
51 | login_qrcode_headers:dict
52 | login_final_headers:dict
--------------------------------------------------------------------------------
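A minimal sketch of CustomProgress in use; entering the context starts the underlying rich Progress and exiting stops it. The task description and sleep are placeholders for real work.

import time
from WeiBoCrawler.util.custom import CustomProgress

with CustomProgress() as progress:
    task = progress.add_task(description="demo", total=10)
    for _ in range(10):
        time.sleep(0.1)                  # stand-in for real work
        progress.update(task, advance=1)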
/WeiBoCrawler/util/database.py:
--------------------------------------------------------------------------------
1 | import toml
2 | from .path import module_path, config_path, Path
3 | from pydantic import BaseModel, field_validator
4 |
5 |
6 | class DatabaseConfig(BaseModel):
7 | path: str
8 |
9 | @field_validator('path')
10 | def modify_module_path(cls, value):
11 | if Path(value).is_absolute():
12 | return str(value)
13 | else:
14 | return str(module_path / value)
15 |
16 |
17 | database_config = DatabaseConfig.model_validate(toml.load(config_path)["database"])
18 |
--------------------------------------------------------------------------------
/WeiBoCrawler/util/decorator.py:
--------------------------------------------------------------------------------
1 | from .log import logging
2 | from typing import Callable
3 | import httpx
4 | from pydantic import validate_call
5 |
6 | def custom_validate_call(func: Callable) -> Callable:
7 | return validate_call(func, config={"arbitrary_types_allowed": True}, validate_return=True)
8 |
9 | def log_function_params(logger: logging.Logger=logging):
10 | """记录函数的参数和返回值
11 |
12 | Args:
13 | func (Callable): 需要装饰的函数
14 |
15 | Returns:
16 | Callable: 装饰后的函数
17 | """
18 | def log_function_params_(func:Callable) -> Callable:
19 | def wrapper(*args, **kwargs):
20 | # 记录函数名和参数
21 | args_repr = [repr(a) for a in args]
22 | kwargs_repr = [f"{k}={v!r}" for k, v in kwargs.items()]
23 | signature = ", ".join(args_repr + kwargs_repr)
24 | logger.info(f"Calling Function {func.__name__}({signature})")
25 |
26 | # 调用原函数
27 | result = func(*args, **kwargs)
28 |
29 | # 记录返回值
30 | logger.info(f"Function {func.__name__} returned {result!r}")
31 | return result
32 | return wrapper
33 | return log_function_params_
34 |
35 |
36 | def retry_timeout_decorator(func: Callable) -> Callable:
37 | """超时重试装饰器
38 |
39 | Args:
40 | retry_times (int): 重试次数. Defaults to 3.
41 |
42 | Returns:
43 | Callable: 装饰后的函数
44 | """
45 | retry_times = 3
46 | def wrapper(*args, **kwargs):
47 | attempts = 0
48 | while attempts < retry_times:
49 | try:
50 | return func(*args, **kwargs)
51 | except httpx.TimeoutException as e:
52 | attempts += 1
53 | if attempts < retry_times:
54 | logging.warning(f"请求超时,正在进行第 {attempts} 次重试...")
55 | else:
56 | logging.error(f"请求超时,重试次数已达到最大值,请检查网络连接或重试次数!错误原因{e}")
57 | return wrapper
58 |
59 |
60 | def retry_timeout_decorator_asyncio(func: Callable) -> Callable:
61 | """超时重试装饰器(异步)
62 |
63 | Args:
64 | retry_times (int): 重试次数. Defaults to 3.
65 |
66 | Returns:
67 | Callable: 装饰后的函数
68 | """
69 | retry_times = 3
70 | async def wrapper(*args, **kwargs): # 将 wrapper 改为异步函数
71 | attempts = 0
72 | while attempts < retry_times:
73 | try:
74 | return await func(*args, **kwargs) # 调用异步函数并使用 await
75 | except httpx.TimeoutException as e:
76 | attempts += 1
77 | if attempts < retry_times:
78 | logging.warning(f"请求超时,正在进行第 {attempts} 次重试...")
79 | else:
80 | logging.error(f"请求超时,重试次数已达到最大值,请检查网络连接或重试次数!错误原因{e}")
81 | return wrapper
--------------------------------------------------------------------------------
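Sketch of the timeout-retry decorator on an ordinary request function: httpx timeouts are retried up to three times and, if they all fail, the wrapper simply returns None rather than re-raising. The URL is a placeholder.

import httpx
from WeiBoCrawler.util.decorator import retry_timeout_decorator

@retry_timeout_decorator
def fetch(url: str) -> httpx.Response:
    return httpx.get(url, timeout=2)

resp = fetch("https://example.com")
if resp is not None:   # None means every attempt timed out
    print(resp.status_code)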
/WeiBoCrawler/util/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from .path import module_path
3 |
4 |
5 | # 配置日志
6 | logging.basicConfig(
7 | filename=module_path / "./app.log",
8 | level=logging.INFO,
9 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
10 | encoding="utf-8",
11 | )
--------------------------------------------------------------------------------
/WeiBoCrawler/util/path.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 |
4 | module_path = Path(__file__).parent.parent
5 |
6 | config_path = module_path / "./config.toml"
--------------------------------------------------------------------------------
/WeiBoCrawler/util/process.py:
--------------------------------------------------------------------------------
1 | import re
2 | from datetime import datetime, timedelta
3 | import pandas as pd
4 |
5 |
6 | def process_time_str(time_str:str) -> datetime:
7 | """这段代码是用来解析微博的时间字段的
8 | 1. 处理 年、月、日、时、分
9 | 2. 处理 分钟前,小时前,这里不处理秒前
10 |
11 | Args:
12 | time_str (str): 微博时间字段
13 |
14 | Returns:
15 |         datetime: 返回时间字段
16 | """
17 | datetime_now = datetime.now()
18 |
19 | if "年" in time_str:
20 | year = re.search(r"(\d{4})年", time_str).group(1)
21 | else:
22 | year = datetime_now.year
23 | if "月" in time_str:
24 | month = re.search(r"(\d{1,2})月", time_str).group(1)
25 | else:
26 | month = datetime_now.month
27 | if "日" in time_str:
28 | day = re.search(r"(\d{1,2})日", time_str).group(1)
29 | else:
30 | day = datetime_now.day
31 | if ":" in time_str:
32 | hour = re.search(r"(\d{1,2}):", time_str).group(1)
33 | minute = re.search(r":(\d{1,2})", time_str).group(1)
34 | else:
35 | hour = datetime_now.hour
36 | minute = datetime_now.minute
37 |
38 | datetime_now = datetime(int(year), int(month), int(day), int(hour), int(minute))
39 |
40 | if "分钟前" in time_str:
41 | minute_before = re.search(r"(\d+)分钟前", time_str).group(1)
42 | datetime_now = datetime_now - timedelta(minutes=int(minute_before))
43 | if "小时前" in time_str:
44 | hour_before = re.search(r"(\d+)小时前", time_str).group(1)
45 | datetime_now = datetime_now - timedelta(hours=int(hour_before))
46 |
47 | return datetime_now
48 |
49 |
50 |
51 | def drop_documents_duplicates(documents: list[dict]) -> list[dict]:
52 | """dict 列表去重
53 | 这里暂时使用最简单的列表去重法, 后续可以考虑使用 hash 去重等方法优化..
54 |
55 | Args:
56 | list[dict]: 去重后的表
57 | """
58 | unique_document = []
59 | for document in documents:
60 | if document not in unique_document:
61 | unique_document.append(document)
62 |
63 | return unique_document
64 |
65 |
66 | def process_base_document(document: dict, transform_dict: dict) -> dict:
67 | """将 document 处理成字典的形式
68 |
69 | transform_dict = {
70 | "转发数量": "retweet_num",
71 | "评论数量": "comment_num",
72 | "点赞数量": "star_num
73 | ...
74 | }
75 |
76 | Args:
77 | document (dict): 文档
78 | transform_dict (dict): 转换字典, key 是转化后的字段名, value 是原始字段名(str)或嵌套字段路径(list)
79 |
80 | Returns:
81 | dict: 处理后的字典
82 | """
83 | item = {}
84 |
85 | for key, value in transform_dict.items():
86 | if isinstance(value, str):
87 | final_value = document.get(value, None)
88 |
89 | elif isinstance(value, list):
90 | final_value = document
91 | for v in value:
92 | if final_value is None:
93 | break
94 | final_value = final_value.get(v, None)
95 |
96 | item[key] = final_value
97 | return item
98 |
99 |
100 | def process_base_documents(documents: list[dict], transform_dict: dict) -> pd.DataFrame:
101 | """将 documents 处理成 dataframe 的形式
102 |
103 | transform_dict = {
104 | "转发数量": "retweet_num",
105 | "评论数量": "comment_num",
106 | "点赞数量": "star_num",
107 | ...
108 | }
109 |
110 | Args:
111 | documents (list[dict]): 文档列表
112 | transform_dict (dict): 转换字典, key 是转化后的字段名, value 是原始字段名(str)或嵌套字段路径(list)
113 |
114 | Returns:
115 | pd.DataFrame: (去重)处理后得到的表格
116 | """
117 | items = [process_base_document(document, transform_dict) for document in documents]
118 | df = pd.DataFrame(items)
119 | df.drop_duplicates(inplace=True)
120 | return df
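121 | 
122 | 
123 | if __name__ == "__main__":
124 |     # 简单使用示例(仅作示意, 下面的字段名为假设值, 并非微博接口的真实字段)
125 |     print(process_time_str("2024年1月2日 08:30"))  # -> 2024-01-02 08:30:00
126 |     print(process_time_str("5分钟前"))              # -> 当前时间往前推 5 分钟
127 | 
128 |     demo_documents = [
129 |         {"mid": "1", "user": {"screen_name": "示例用户"}, "reposts_count": 2},
130 |         {"mid": "1", "user": {"screen_name": "示例用户"}, "reposts_count": 2},  # 重复项, 会被去重
131 |     ]
132 |     demo_transform = {
133 |         "mid": "mid",
134 |         "用户昵称": ["user", "screen_name"],  # 列表表示嵌套字段路径
135 |         "转发数量": "reposts_count",
136 |     }
137 |     print(process_base_documents(demo_documents, demo_transform))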
--------------------------------------------------------------------------------
/WeiBoCrawler/util/show_qrcode.py:
--------------------------------------------------------------------------------
1 | from pyzbar.pyzbar import decode
2 | from PIL import Image
3 | import qrcode
4 |
5 | def show_qrcode(img_path:str):
6 | """在控制台显示二维码
7 |
8 | Args:
9 | img_path (str): 二维码路径
10 | """
11 | img = Image.open(img_path)
12 | decoded_data = decode(img)
13 | data = decoded_data[0].data.decode('utf-8')
14 | qr = qrcode.QRCode()
15 | qr.add_data(data)
16 | qr.make()
17 | qr.print_ascii()
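18 | 
19 | 
20 | # 使用示例(仅作示意, 假设登录流程已将二维码图片保存为 gen.png):
21 | # if __name__ == "__main__":
22 | #     show_qrcode("gen.png")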
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "weibocrawler"
3 | version = "0.1.0"
4 | description = "A Weibo crawler with a Streamlit web interface"
5 | readme = "README.md"
6 | requires-python = ">=3.10"
7 | dependencies = [
8 | "aiosqlite>=0.20.0",
9 | "httpx>=0.28.1",
10 | "pandas>=2.0.3",
11 | "parsel>=1.9.1",
12 | "pydantic>=2.10.6",
13 | "pyzbar>=0.1.9",
14 | "qrcode>=8.0",
15 | "sqlalchemy>=2.0.37",
16 | "streamlit>=1.41.1",
17 | "toml>=0.10.2",
18 | ]
19 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by uv via the following command:
2 | # 这个文件是使用下列命令自动生成的:
3 | # uv pip compile pyproject.toml -o requirements.txt
4 | aiosqlite==0.20.0
5 | # via weibocrawler (pyproject.toml)
6 | altair==5.5.0
7 | # via streamlit
8 | annotated-types==0.7.0
9 | # via pydantic
10 | anyio==4.8.0
11 | # via httpx
12 | attrs==25.1.0
13 | # via
14 | # jsonschema
15 | # referencing
16 | blinker==1.9.0
17 | # via streamlit
18 | cachetools==5.5.1
19 | # via streamlit
20 | certifi==2024.12.14
21 | # via
22 | # httpcore
23 | # httpx
24 | # requests
25 | charset-normalizer==3.4.1
26 | # via requests
27 | click==8.1.8
28 | # via streamlit
29 | colorama==0.4.6
30 | # via
31 | # click
32 | # qrcode
33 | cssselect==1.2.0
34 | # via parsel
35 | exceptiongroup==1.2.2
36 | # via anyio
37 | gitdb==4.0.12
38 | # via gitpython
39 | gitpython==3.1.44
40 | # via streamlit
41 | greenlet==3.1.1
42 | # via sqlalchemy
43 | h11==0.14.0
44 | # via httpcore
45 | httpcore==1.0.7
46 | # via httpx
47 | httpx==0.28.1
48 | # via weibocrawler (pyproject.toml)
49 | idna==3.10
50 | # via
51 | # anyio
52 | # httpx
53 | # requests
54 | jinja2==3.1.5
55 | # via
56 | # altair
57 | # pydeck
58 | jmespath==1.0.1
59 | # via parsel
60 | jsonschema==4.23.0
61 | # via altair
62 | jsonschema-specifications==2024.10.1
63 | # via jsonschema
64 | lxml==5.3.0
65 | # via parsel
66 | markdown-it-py==3.0.0
67 | # via rich
68 | markupsafe==3.0.2
69 | # via jinja2
70 | mdurl==0.1.2
71 | # via markdown-it-py
72 | narwhals==1.24.1
73 | # via altair
74 | numpy==2.2.2
75 | # via
76 | # pandas
77 | # pydeck
78 | # streamlit
79 | packaging==24.2
80 | # via
81 | # altair
82 | # parsel
83 | # streamlit
84 | pandas==2.2.3
85 | # via
86 | # weibocrawler (pyproject.toml)
87 | # streamlit
88 | parsel==1.10.0
89 | # via weibocrawler (pyproject.toml)
90 | pillow==11.1.0
91 | # via streamlit
92 | protobuf==5.29.3
93 | # via streamlit
94 | pyarrow==19.0.0
95 | # via streamlit
96 | pydantic==2.10.6
97 | # via weibocrawler (pyproject.toml)
98 | pydantic-core==2.27.2
99 | # via pydantic
100 | pydeck==0.9.1
101 | # via streamlit
102 | pygments==2.19.1
103 | # via rich
104 | python-dateutil==2.9.0.post0
105 | # via pandas
106 | pytz==2024.2
107 | # via pandas
108 | pyzbar==0.1.9
109 | # via weibocrawler (pyproject.toml)
110 | qrcode==8.0
111 | # via weibocrawler (pyproject.toml)
112 | referencing==0.36.2
113 | # via
114 | # jsonschema
115 | # jsonschema-specifications
116 | requests==2.32.3
117 | # via streamlit
118 | rich==13.9.4
119 | # via streamlit
120 | rpds-py==0.22.3
121 | # via
122 | # jsonschema
123 | # referencing
124 | six==1.17.0
125 | # via python-dateutil
126 | smmap==5.0.2
127 | # via gitdb
128 | sniffio==1.3.1
129 | # via anyio
130 | sqlalchemy==2.0.37
131 | # via weibocrawler (pyproject.toml)
132 | streamlit==1.41.1
133 | # via weibocrawler (pyproject.toml)
134 | tenacity==9.0.0
135 | # via streamlit
136 | toml==0.10.2
137 | # via
138 | # weibocrawler (pyproject.toml)
139 | # streamlit
140 | tornado==6.4.2
141 | # via streamlit
142 | typing-extensions==4.12.2
143 | # via
144 | # aiosqlite
145 | # altair
146 | # anyio
147 | # pydantic
148 | # pydantic-core
149 | # referencing
150 | # rich
151 | # sqlalchemy
152 | # streamlit
153 | tzdata==2025.1
154 | # via pandas
155 | urllib3==2.3.0
156 | # via requests
157 | w3lib==2.2.1
158 | # via parsel
159 | watchdog==6.0.0
160 | # via streamlit
161 |
--------------------------------------------------------------------------------
/web/main.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 |
3 | # 注意: 页面路径如果使用绝对路径, 只有初次运行能成功, 修改页面后会报错;
4 | # 使用相对路径时, 相对的不是项目目录, 而是运行 streamlit run main.py 时所在的目录.
5 |
6 |
7 | st.set_page_config(
8 | page_title="微博爬虫数据分析",
9 | page_icon="💻",
10 | layout="wide",
11 | initial_sidebar_state="expanded",
12 | )
13 |
14 |
15 | pg = st.navigation({
16 | "Cookie": [
17 | st.Page("./web_pages/Cookie/Cookie.py", title="Cookie", icon=":material/add_circle:")
18 | ],
19 | "下载": [
20 | st.Page("./web_pages/搜索/列表搜索.py", title="列表搜索", icon=":material/add_circle:"),
21 | st.Page("./web_pages/搜索/详细页搜索.py", title="详细页搜索", icon=":material/add_circle:"),
22 | st.Page("./web_pages/搜索/一级评论搜索.py", title="一级评论搜索", icon=":material/add_circle:"),
23 | st.Page("./web_pages/搜索/二级评论搜索.py", title="二级评论搜索", icon=":material/add_circle:"),
24 | ],
25 | "查询": [
26 | st.Page("./web_pages/查询/查询.py", title="SQL语句查询", icon=":material/add_circle:")
27 | ],
28 | })
29 |
30 | pg.run()
--------------------------------------------------------------------------------
/web/util/__init__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append(".")  # 将运行目录(项目根目录)加入 sys.path, 以便导入 WeiBoCrawler 包
3 |
4 | from WeiBoCrawler.database import db, BodyRecord, Comment1Record, Comment2Record
5 | from WeiBoCrawler.pack import get_list_data, get_body_data, get_comment1_data, get_comment2_data
6 | from WeiBoCrawler.parse import process_list_documents, process_comment_documents, process_body_documents
7 | from WeiBoCrawler.request import get_qr_Info, get_qr_status
8 | from WeiBoCrawler.util import config_path, cookies_config
9 |
10 |
11 | __all__ = [
12 | "config_path",
13 | "cookies_config",
14 |
15 | "get_qr_Info",
16 | "get_qr_status",
17 |
18 | "get_list_data",
19 | "get_body_data",
20 | "get_comment1_data",
21 | "get_comment2_data",
22 |
23 | "db",
24 | "BodyRecord",
25 | "Comment1Record",
26 | "Comment2Record",
27 |
28 | "process_body_documents",
29 | "process_list_documents",
30 | "process_comment_documents",
31 | ]
--------------------------------------------------------------------------------
/web/web_pages/Cookie/Cookie.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | import toml
3 | from util import cookies_config, config_path, get_qr_Info, get_qr_status
4 | from datetime import datetime
5 | from threading import Thread
6 | from streamlit.runtime.scriptrunner import add_script_run_ctx, get_script_run_ctx
7 |
8 | if 'Thread' not in st.session_state:
9 | st.session_state["Thread"] = None
10 |
11 | def set_cookies(cookies):
12 | if cookies is not None:
13 | cookies_config.cookies.update(cookies)
14 | cookies_config.cookies_info["update_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
15 | config_data = toml.load(config_path)
16 |
17 | config_data["cookies"].update(cookies_config.cookies)
18 | config_data["cookies_info"].update(cookies_config.cookies_info)
19 |
20 | with open(config_path, "w", encoding="utf-8") as f:
21 | toml.dump(config_data, f)
22 | else:
23 | st.error("获取 cookies 失败!!!!!!!")
24 |
25 |
26 | def get_cookies(client, login_signin_url, qrid):
27 | cookies = get_qr_status(client, login_signin_url, qrid)
28 | if cookies is None:
29 | st.error("获取 cookies 失败!!!!!!!")
30 | else:
31 | set_cookies(cookies)
32 | client.close()
33 |
34 |
35 | @st.dialog("使用微博APP扫码登录")
36 | def scan_code():
37 | if st.session_state["Thread"] is not None and st.session_state["Thread"].is_alive():
38 | st.image(image=st.session_state["image"])
39 | else:
40 | image, client, login_signin_url, qrid = get_qr_Info()
41 | st.session_state["image"] = image
42 | st.image(image=image)
43 |
44 | st.session_state["Thread"] = Thread(target=get_cookies, args=(client, login_signin_url, qrid))
45 | add_script_run_ctx(st.session_state["Thread"], get_script_run_ctx())  # 绑定脚本运行上下文, 使子线程内可以正常调用 st.* 接口
46 | st.session_state["Thread"].start()
47 |
48 | cols = st.columns([1, 1, 15])
49 | cols[0].button("更新", key="update", on_click=scan_code, type="secondary", use_container_width=True)
50 | if cols[1].button("刷新", key="rerun", type="secondary", use_container_width=True):
51 | st.rerun()
52 | st.write(cookies_config)
--------------------------------------------------------------------------------
/web/web_pages/搜索/一级评论搜索.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from util import get_comment1_data, db, Comment1Record, process_comment_documents
3 |
4 | cols = st.columns([4, 4, 3, 1, 2, 2], vertical_alignment="bottom")
5 | cols[0].text_input("uid 列表(用空格分隔)", value="2035895904 1749277070", key="uid")
6 | cols[1].text_input("mid 列表(用空格分隔)", value="5096904217856018 5045463240409185", key="mid")
7 | cols[2].text_input("存储表名", value="test", key="table_name")
8 |
9 | cols[-1].button("搜索", type="primary", key="comment1_button")
10 |
11 | if st.session_state["comment1_button"]:
12 | uids = st.session_state["uid"].split()
13 | mids = st.session_state["mid"].split()
14 |
15 | if st.session_state["table_name"] == "" or mids == [] or uids == []:
16 | st.warning("uid列表、mid列表和存储表名不能为空")
17 | elif len(mids) != len(uids):
18 | st.warning("uid列表和mid列表长度必须一致")
19 | else:
20 | with st.spinner("搜索中(进展在控制台)..."):
21 | res_ids = get_comment1_data(uid=uids, mid=mids, table_name=st.session_state["table_name"])
22 | with st.spinner("导入中(进展在控制台)..."):
23 | records = db.sync_get_records_by_ids(Comment1Record, res_ids)
24 | documents = [record.json_data for record in records]
25 | st.session_state["comment1"] = process_comment_documents(documents)
26 |
27 | if "comment1" in st.session_state:
28 | st.dataframe(st.session_state["comment1"])
--------------------------------------------------------------------------------
/web/web_pages/搜索/二级评论搜索.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from util import get_comment2_data, db, Comment2Record, process_comment_documents
3 |
4 | cols = st.columns([4, 4, 3, 1, 2, 2], vertical_alignment="bottom")
5 | cols[0].text_input("uid 列表(用空格分隔)", value="1644114654 1644114654 1644114654", key="uid")
6 | cols[1].text_input("mid 列表(用空格分隔)", value="5045280045531535 5045270515551948 5045277713760776", key="mid")
7 | cols[2].text_input("存储表名", value="test", key="table_name")
8 |
9 | cols[-1].button("搜索", type="primary", key="comment2_button")
10 |
11 | if st.session_state["comment2_button"]:
12 | uids = st.session_state["uid"].split()
13 | mids = st.session_state["mid"].split()
14 |
15 | if st.session_state["table_name"] == "" or mids == [] or uids == []:
16 | st.warning("uid列表、mid列表和存储表名不能为空")
17 | elif len(mids) != len(uids):
18 | st.warning("uid列表和mid列表长度必须一致")
19 | else:
20 | with st.spinner("搜索中(进展在控制台)..."):
21 | res_ids = get_comment2_data(uid=uids, mid=mids, table_name=st.session_state["table_name"])
22 | with st.spinner("导入中(进展在控制台)..."):
23 | records = db.sync_get_records_by_ids(Comment2Record, res_ids)
24 | documents = [record.json_data for record in records]
25 | st.session_state["comment2"] = process_comment_documents(documents)
26 |
27 | if "comment2" in st.session_state:
28 | st.dataframe(st.session_state["comment2"])
--------------------------------------------------------------------------------
/web/web_pages/搜索/列表搜索.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from util import get_list_data, db, BodyRecord, process_list_documents
3 | from datetime import date
4 |
5 | cols = st.columns([3, 3, 1, 1, 2, 2, 2, 2], vertical_alignment="bottom")
6 | cols[0].text_input("搜索内容(话题需要在前后加上#)", value="姜平", key="search_for")
7 | cols[1].text_input("存储表名", value="test", key="table_name")
8 | cols[2].selectbox("搜索类型", options=["综合", "实时", "高级"], key="kind")
9 | cols[3].selectbox("筛选条件", options=["综合", "热度", "原创"], key="advanced_kind", disabled=st.session_state["kind"]!= "高级")
10 | cols[4].date_input("起始时间", value="today", min_value=date(year=2000, month=1, day=1), key="start", disabled=st.session_state["kind"]!= "高级")
11 | cols[5].date_input("结束时间", value="today", key="end", min_value=date(year=2000, month=1, day=1), disabled=st.session_state["kind"]!= "高级")
12 |
13 | cols[-1].button("搜索", type="primary", key="list_button")
14 |
15 | if st.session_state["list_button"]:
16 | if st.session_state["search_for"] == "" or st.session_state["table_name"] == "":
17 | st.warning("搜索内容和存储表名不能为空")
18 | else:
19 | with st.spinner("搜索中(进展在控制台)..."):
20 | res_ids = get_list_data(search_for=st.session_state["search_for"], table_name=st.session_state["table_name"],
21 | kind=st.session_state["kind"], advanced_kind=st.session_state["advanced_kind"], time_start=st.session_state["start"], time_end=st.session_state["end"])
22 | with st.spinner("导入中(进展在控制台)..."):
23 | records = db.sync_get_records_by_ids(BodyRecord, res_ids)
24 | documents = [record.json_data for record in records]
25 | st.session_state["list"] = process_list_documents(documents)
26 |
27 | if "list" in st.session_state:
28 | st.dataframe(st.session_state["list"])
--------------------------------------------------------------------------------
/web/web_pages/搜索/详细页搜索.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from util import get_body_data, db, BodyRecord, process_body_documents
3 |
4 | cols = st.columns([7, 3, 2, 2, 2], vertical_alignment="bottom")
5 | cols[0].text_input("搜索id列表(用空格分隔)", value="OEEV7wXHY Oj0PXme8I OiZre8dir Oj0zUmucE", key="ids")
6 | cols[1].text_input("存储表名", value="test", key="table_name")
7 |
8 | cols[-1].button("搜索", type="primary", key="body_button")
9 |
10 | if st.session_state["body_button"]:
11 | ids = st.session_state["ids"].split()
12 | if st.session_state["table_name"] == "" or ids == []:
13 | st.warning("搜索id列表和存储表名不能为空")
14 | else:
15 | with st.spinner("搜索中(进展在控制台)..."):
16 | res_ids = get_body_data(id=ids, table_name=st.session_state["table_name"])
17 | with st.spinner("导入中(进展在控制台)..."):
18 | records = db.sync_get_records_by_ids(BodyRecord, res_ids)
19 | documents = [record.json_data for record in records]
20 | st.session_state["body"] = process_body_documents(documents)
21 |
22 | if "body" in st.session_state:
23 | st.dataframe(st.session_state["body"])
--------------------------------------------------------------------------------
/web/web_pages/查询/查询.py:
--------------------------------------------------------------------------------
1 | from util import db
2 | import streamlit as st
3 | import pandas as pd
4 |
5 | cols = st.columns([10, 1], vertical_alignment="bottom")
6 |
7 | cols[0].text_input(label="sql(切记写上 limit, 不然查询会卡死)", placeholder="请输入 SQL 语句", value="select * from BodyRecord limit 100 offset 10;", key="sql")
8 | cols[1].button("执行sql", key="sql_button")
9 |
10 | if st.session_state.get("sql_button"):
11 | df = pd.DataFrame(db.sql(st.session_state.sql))
12 | st.session_state["sql_result"] = df
13 |
14 |
15 | if "sql_result" in st.session_state:
16 | st.write(st.session_state["sql_result"])
--------------------------------------------------------------------------------