├── .env.exa ├── .gitignore ├── LICENSE ├── README.md ├── bot.py ├── config └── config.py ├── log.py ├── methods ├── __init__.py └── tg_parse_hub.py ├── plugins ├── ai_summary.py ├── inline_parse.py ├── parse.py └── start.py ├── pyproject.toml ├── utiles ├── converter.py ├── filters.py ├── img_host.py ├── ph.py └── utile.py └── uv.lock /.env.exa: -------------------------------------------------------------------------------- 1 | ##### Bot 配置 ##### 2 | API_HASH= 3 | API_ID= 4 | BOT_TOKEN= 5 | 6 | # BOT_PROXY=http://127.0.0.1:7890 7 | # PARSER_PROXY=http://127.0.0.1:7890 8 | # DOWNLOADER_PROXY=http://127.0.0.1:7890 9 | 10 | ##### API 配置 ##### 11 | # DOUYIN_API=http://127.0.0.1:80 12 | 13 | ##### AI总结配置 ##### 14 | AI_SUMMARY=True 15 | API_KEY= 16 | BASE_URL=https://apic.ohmygpt.com/v1 17 | MODEL=gpt-4o 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.venv/ 2 | /logs 3 | /.idea 4 | /downloads 5 | .env 6 | *.session -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 梓澪 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ParseHubBot 2 | 3 | --- 4 | 5 | **Telegram聚合解析Bot, 支持AI总结, 支持内联模式** 6 | **Telegram aggregation analysis Bot, supports AI summary, supports inline mode** 7 | 8 | 部署好的Bot: [@ParseHubBot](https://t.me/ParseHubBot)目前支持的平台: 9 | 10 | - `抖音视频|图文` 11 | - `哔哩哔哩视频|动态` 12 | - `YouTube` 13 | - `YouTube Music` 14 | - `TikTok视频|图文` 15 | - `小红书视频|图文` 16 | - `Twitter视频|图文` 17 | - `百度贴吧图文|视频` 18 | - `Facebook视频` 19 | - `微博视频|图文` 20 | - `Instagram视频|图文` 21 | - [查看更多...](https://github.com/z-mio/ParseHub) 22 | 23 | ![](https://img.155155155.xyz/i/2024/09/66f2d4b70416c.webp) 24 | ![](https://img.155155155.xyz/i/2024/09/66f2d4d6ca7ec.webp) 25 | 26 | ## 部署Bot 27 | 28 | ### 修改配置 29 | 30 | 将 `.env.exa` 复制为 `.env`, 并修改配置 31 | 32 | | 参数 | 说明 | 33 | |----------------------|-------------------------------------| 34 | | `API_HASH`, `API_ID` | 登录 https://my.telegram.org 获取 | 35 | | `BOT_TOKEN` | 在 @BotFather 获取 | 36 | | `BOT_PROXY` | 海外服务器无需填写 | 37 | | `PARSER_PROXY` | 解析时使用的代理 | 38 | | `DOWNLOADER_PROXY` | 下载时使用的代理 | 39 | | `DOUYIN_API` | 你部署的抖音API地址, 默认: https://douyin.wtf | 40 | | `AI_SUMMARY` | 是否开启AI总结 | 41 | | `API_KEY` | openai 密钥 | 42 | | `BASE_URL` | openai api地址 | 43 | | `MODEL` | AI总结使用的模型 | 44 | | `CACHE_TIME` | 解析结果缓存时间, 单位: 秒, 0为不缓存, 默认缓存10分钟 | 45 | 46 | ### 开始部署 47 | 48 | **在项目根目录运行:** 49 | 50 | ```shell 51 | apt install python3-pip -y 52 | pip install uv --break-system-packages 53 | uv venv --python 3.12 54 | uv sync 55 | ``` 56 | 57 | **启动bot** 58 | 59 | ```shell 60 | uv run bot.py 61 | ``` 62 | 63 | **设置命令列表** 64 | 私聊bot发送指令 `/menu` 65 | 66 | ## 使用 67 | 68 | 普通使用: 发送分享链接给bot即可 69 | 内联使用: 任意聊天窗口输入: `@bot用户名 链接` 70 | ![](https://img.155155155.xyz/i/2024/09/66f3f92973ad1.webp) 71 | 72 | ## 相关项目 73 | 74 | - [z-mio/ParseHub](https://github.com/z-mio/ParseHub) 75 | 76 | ## 鸣谢 77 | 78 | - [OhMyGPT](https://www.ohmygpt.com) 79 | - [KurimuzonAkuma/pyrogram](https://github.com/KurimuzonAkuma/pyrogram) 80 | -------------------------------------------------------------------------------- /bot.py: -------------------------------------------------------------------------------- 1 | from pyrogram import Client 2 | from config.config import bot_cfg 3 | from log import logger 4 | 5 | logger.add("logs/bot.log", rotation="10 MB") 6 | 7 | 8 | class Bot(Client): 9 | def __init__(self): 10 | self.cfg = bot_cfg 11 | 12 | super().__init__( 13 | f'{self.cfg.bot_token.split(":")[0]}_bot', 14 | api_id=self.cfg.api_id, 15 | api_hash=self.cfg.api_hash, 16 | bot_token=self.cfg.bot_token, 17 | plugins=dict(root="plugins"), 18 | proxy=self.cfg.bot_proxy.dict_format, 19 | ) 20 | 21 | async def start(self): 22 | logger.info("Bot开始运行...") 23 | await super().start() 24 | 25 | async def stop(self, *args): 26 | await super().stop() 27 | 28 | 29 | if __name__ == "__main__": 30 | bot = Bot() 31 | bot.run() 32 | -------------------------------------------------------------------------------- /config/config.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from dotenv import load_dotenv 4 | from os import getenv 5 | 6 | load_dotenv() 7 | 8 | 9 | class BotConfig: 10 | def __init__(self): 11 | self.bot_token = getenv("BOT_TOKEN") 12 | self.api_id = getenv("API_ID") 13 | self.api_hash = getenv("API_HASH") 14 | self.bot_proxy: None | BotConfig._Proxy = self._Proxy(getenv("BOT_PROXY", None)) 15 | self.parser_proxy: None | str = getenv("PARSER_PROXY", None) 16 | self.downloader_proxy: None | str = getenv("DOWNLOADER_PROXY", None) 17 | 18 | self.cache_time = int(ct) if (ct := getenv("CACHE_TIME")) else 600 19 | self.ai_summary = bool(getenv("AI_SUMMARY").lower() == "true") 20 | self.douyin_api = getenv("DOUYIN_API", None) 21 | 22 | class _Proxy: 23 | def __init__(self, url: str): 24 | self._url = urlparse(url) if url else None 25 | self.url = self._url.geturl() if self._url else None 26 | 27 | @property 28 | def dict_format(self): 29 | if not self._url: 30 | return None 31 | return { 32 | "scheme": self._url.scheme, 33 | "hostname": self._url.hostname, 34 | "port": self._url.port, 35 | "username": self._url.username, 36 | "password": self._url.password, 37 | } 38 | 39 | 40 | bot_cfg = BotConfig() 41 | -------------------------------------------------------------------------------- /log.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import logging 3 | from typing import TYPE_CHECKING 4 | 5 | import loguru 6 | 7 | if TYPE_CHECKING: 8 | # 避免 sphinx autodoc 解析注释失败 9 | # 因为 loguru 模块实际上没有 `Logger` 类 10 | from loguru import Logger 11 | 12 | logger: "Logger" = loguru.logger 13 | 14 | 15 | class InterceptHandler(logging.Handler): 16 | def emit(self, record: logging.LogRecord) -> None: 17 | level: str | int 18 | try: 19 | level = logger.level(record.levelname).name 20 | except ValueError: 21 | level = record.levelno 22 | 23 | frame, depth = inspect.currentframe(), 0 24 | while frame and (depth == 0 or frame.f_code.co_filename == logging.__file__): 25 | frame = frame.f_back 26 | depth += 1 27 | 28 | logger.opt(depth=depth, exception=record.exc_info).log( 29 | level, record.getMessage() 30 | ) 31 | 32 | 33 | def init_logger(): 34 | logging.basicConfig(handlers=[InterceptHandler()], force=True) 35 | 36 | 37 | init_logger() 38 | -------------------------------------------------------------------------------- /methods/__init__.py: -------------------------------------------------------------------------------- 1 | from .tg_parse_hub import ( 2 | TgParseHub, 3 | VideoParseResultOperate, 4 | ImageParseResultOperate, 5 | MultimediaParseResultOperate, 6 | ) 7 | -------------------------------------------------------------------------------- /methods/tg_parse_hub.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import tempfile 3 | from datetime import datetime, timedelta 4 | 5 | import httpx 6 | from abc import ABC, abstractmethod 7 | from typing import Union, Callable 8 | from aiocache import Cache 9 | from aiocache.plugins import TimingPlugin 10 | from apscheduler.schedulers.asyncio import AsyncIOScheduler 11 | from parsehub.config import ParseConfig, DownloadConfig 12 | from pyrogram import enums, Client 13 | from pyrogram.types import ( 14 | Message, 15 | InlineKeyboardMarkup as Ikm, 16 | InlineKeyboardButton as Ikb, 17 | InputMediaPhoto, 18 | InputMediaVideo, 19 | InlineQuery, 20 | InlineQueryResultPhoto, 21 | InlineQueryResultAnimation, 22 | CallbackQuery, 23 | InlineQueryResultArticle, 24 | InputTextMessageContent, 25 | ) 26 | 27 | from parsehub import ParseHub 28 | from parsehub.types import ( 29 | ParseResult, 30 | Image, 31 | Video, 32 | Ani, 33 | VideoParseResult, 34 | ImageParseResult, 35 | MultimediaParseResult, 36 | SummaryResult, 37 | DownloadResult, 38 | ParseError, 39 | ) 40 | from parsehub.utiles.utile import match_url 41 | from parsehub.parsers.parser.weixin import WXImageParseResult 42 | from parsehub.parsers.parser.coolapk import CoolapkImageParseResult 43 | from config.config import bot_cfg 44 | from utiles.converter import clean_article_html 45 | from utiles.img_host import ImgHost 46 | from utiles.ph import Telegraph 47 | from utiles.utile import encrypt 48 | from contextlib import asynccontextmanager 49 | from markdown import markdown 50 | 51 | _parsing = Cache(Cache.MEMORY, plugins=[TimingPlugin()]) # 正在解析的链接 52 | _url_cache = Cache(Cache.MEMORY, plugins=[TimingPlugin()]) # 网址缓存 53 | _operate_cache = Cache(Cache.MEMORY, plugins=[TimingPlugin()]) # 解析结果缓存 54 | _msg_cache = Cache(Cache.MEMORY, plugins=[TimingPlugin()]) # 解析结果消息缓存 55 | 56 | scheduler = AsyncIOScheduler() 57 | scheduler.start() 58 | 59 | 60 | class TgParseHub(ParseHub): 61 | """重新封装 ParseHub 类,使其适用于 Telegram""" 62 | 63 | def __init__(self): 64 | super().__init__( 65 | ParseConfig(douyin_api=bot_cfg.douyin_api, proxy=bot_cfg.parser_proxy) 66 | ) 67 | self.url = None 68 | self.on_cache = bool(bot_cfg.cache_time) 69 | self.parsing = _parsing 70 | """正在解析的链接""" 71 | self.cache = _operate_cache 72 | """解析结果缓存""" 73 | self.url_cache = _url_cache 74 | """网址缓存""" 75 | self.operate: ParseResultOperate | None = None 76 | """解析结果操作对象""" 77 | 78 | async def parse( 79 | self, url: str, cache_time: int = bot_cfg.cache_time 80 | ) -> "TgParseHub": 81 | """ 82 | 解析网址,并返回解析结果操作对象。 83 | :param url: url 或 hash后的url 84 | :param cache_time: 缓存时间, 默认缓存一天 85 | :return: 86 | """ 87 | self.url = await self._get_url(url) 88 | while await self._get_parse_task(): 89 | await asyncio.sleep(1) 90 | 91 | if not (operate := await self._get_cache()): 92 | await self._add_parse_task() 93 | async with self.error_handler(): 94 | r = await super().parse(self.url) 95 | operate = self._select_operate(r) 96 | 97 | self.operate = operate 98 | if self.on_cache: 99 | """缓存结果""" 100 | await self._set_cache(operate, cache_time) 101 | if bot_cfg.ai_summary: 102 | """开启 AI 总结""" 103 | await self._set_url_cache() 104 | 105 | return self 106 | 107 | async def ai_summary(self, cq: CallbackQuery): 108 | """获取 AI 总结""" 109 | self.operate = await self.operate.ai_summary(cq) 110 | await self._set_cache(self.operate, bot_cfg.cache_time) 111 | 112 | async def un_ai_summary(self, cq: CallbackQuery): 113 | """取消 AI 总结""" 114 | return await self.operate.un_ai_summary(cq) 115 | 116 | async def download( 117 | self, 118 | callback: Callable = None, 119 | callback_args: tuple = (), 120 | ) -> DownloadResult: 121 | if (dr := self.operate.download_result) and dr.exists(): 122 | return dr 123 | async with self.error_handler(): 124 | r = await self.result.download( 125 | None, 126 | callback, 127 | callback_args, 128 | config=DownloadConfig( 129 | yt_dlp_duration_limit=1800, proxy=bot_cfg.downloader_proxy 130 | ), 131 | ) 132 | self.operate.download_result = r 133 | return r 134 | 135 | async def delete(self): 136 | """删除文件""" 137 | if self.on_cache: 138 | await self.cache.delete(self.operate.hash_url) 139 | self.operate.delete() 140 | 141 | async def chat_upload( 142 | self, cli: Client, msg: Message 143 | ) -> Message | list[Message] | list[list[Message]]: 144 | """发送解析结果到聊天中""" 145 | 146 | async def handle_cache(m): 147 | if isinstance(m, Message): 148 | return await m.copy(msg.chat.id) 149 | if isinstance(m, list): 150 | if all(isinstance(i, Message) for i in m): 151 | if not m: 152 | return None 153 | m = m[0] 154 | mg = await cli.copy_media_group(msg.chat.id, m.chat.id, m.id) 155 | 156 | return mg 157 | [await handle_cache(i) for i in m] 158 | await msg.reply( 159 | self.operate.content_and_no_url, 160 | quote=False, 161 | reply_markup=self.operate.button(), 162 | disable_web_page_preview=True, 163 | ) 164 | 165 | cache_msg = await self._get_msg_cache() 166 | if cache_msg: 167 | return await handle_cache(cache_msg) 168 | 169 | async with self.error_handler(): 170 | msg = await self.operate.chat_upload(msg) 171 | 172 | if self.on_cache: 173 | await self._set_msg_cache(msg) 174 | else: 175 | await self.delete() 176 | await self._del_parse_task() 177 | return msg 178 | 179 | async def inline_upload(self, iq: InlineQuery): 180 | """发送解析结果到内联中""" 181 | async with self.error_handler(): 182 | await self.operate.inline_upload(iq) 183 | await self._del_parse_task() 184 | 185 | @asynccontextmanager 186 | async def error_handler(self): 187 | try: 188 | yield 189 | except Exception as e: 190 | await self._error_callback() 191 | raise e 192 | 193 | async def _error_callback(self): 194 | """错误回调""" 195 | await self._del_parse_task() 196 | 197 | async def get_parse_task(self, url: str) -> bool: 198 | """获取解析任务""" 199 | url = await self._get_url(url) 200 | return await self.parsing.get(url) 201 | 202 | async def _get_parse_task(self): 203 | """获取解析任务""" 204 | return await self.parsing.get(self.url, False) 205 | 206 | async def _add_parse_task(self): 207 | """添加解析任务, 超时: 5分钟""" 208 | await self.parsing.set(self.url, True, ttl=300) 209 | 210 | async def _del_parse_task(self): 211 | """解析结束""" 212 | await self.parsing.delete(self.url) 213 | 214 | async def _get_url(self, url: str): 215 | """获取网址""" 216 | if "http" not in url: 217 | url = await self._get_url_cache(url) 218 | url = match_url(url) 219 | if not url: 220 | raise ParseError("无效的网址") 221 | return await self._select_parser(url)(parse_config=self.config).get_raw_url(url) 222 | 223 | async def _set_url_cache(self): 224 | """缓存网址""" 225 | await self.url_cache.set(encrypt(self.url), self.url, ttl=bot_cfg.cache_time) 226 | 227 | async def _get_url_cache(self, hash_url: str) -> str | None: 228 | """获取缓存网址""" 229 | return await self.url_cache.get(hash_url) 230 | 231 | async def _get_cache(self) -> Union["ParseResultOperate", None]: 232 | """获取缓存结果""" 233 | return await self.cache.get(encrypt(self.url)) 234 | 235 | async def _set_cache(self, result: "ParseResultOperate", cache_time): 236 | """缓存结果""" 237 | await self.cache.set(result.hash_url, result) 238 | await self._clear_cache(cache_time) 239 | 240 | async def _clear_cache(self, cache_time: int = bot_cfg.cache_time): 241 | """定时删除缓存""" 242 | 243 | async def fn(): 244 | await self.cache.delete(self.operate.hash_url) 245 | self.operate.delete() 246 | 247 | if not scheduler.get_job(self.operate.hash_url): 248 | run_time = datetime.now() + timedelta(seconds=cache_time) 249 | scheduler.add_job(fn, "date", run_date=run_time, id=self.operate.hash_url) 250 | 251 | async def _get_msg_cache( 252 | self, 253 | ) -> Message | list[Message] | list[list[Message]] | None: 254 | """获取缓存消息""" 255 | return await _msg_cache.get(self.operate.hash_url) 256 | 257 | async def _set_msg_cache(self, msg: Message): 258 | """缓存消息""" 259 | await _msg_cache.set(self.operate.hash_url, msg, ttl=bot_cfg.cache_time) 260 | 261 | @staticmethod 262 | def _select_operate(result: ParseResult = None) -> "ParseResultOperate": 263 | """根据解析结果类型选择对应的操作类""" 264 | cls = result.__class__ 265 | if issubclass(cls, VideoParseResult): 266 | op = VideoParseResultOperate 267 | elif issubclass(cls, ImageParseResult): 268 | op = ImageParseResultOperate 269 | elif issubclass(cls, MultimediaParseResult): 270 | op = MultimediaParseResultOperate 271 | else: 272 | raise ValueError("未知的 ParseResult 类型") 273 | return op(result) 274 | 275 | @property 276 | def result(self) -> ParseResult: 277 | return self.operate and self.operate.result 278 | 279 | 280 | class ParseResultOperate(ABC): 281 | """解析结果操作基类""" 282 | 283 | def __init__(self, result: ParseResult): 284 | self.result = result 285 | self.download_result: DownloadResult | None = None 286 | self.ai_summary_result: SummaryResult | None = None 287 | self.telegraph_url: str | None = None # telegraph 帖子链接 288 | 289 | @abstractmethod 290 | async def chat_upload( 291 | self, msg: Message 292 | ) -> Message | list[Message] | list[list[Message]]: 293 | """普通聊天上传""" 294 | raise NotImplementedError 295 | 296 | async def inline_upload(self, iq: InlineQuery): 297 | """内联上传""" 298 | results = [] 299 | 300 | media = ( 301 | self.result.media 302 | if isinstance(self.result.media, list) 303 | else [self.result.media] 304 | ) 305 | if not media: 306 | results.append( 307 | InlineQueryResultArticle( 308 | title=self.result.title or "无标题", 309 | description=self.result.desc, 310 | input_message_content=InputTextMessageContent( 311 | self.content_and_no_url, disable_web_page_preview=True 312 | ), 313 | reply_markup=self.button(), 314 | ) 315 | ) 316 | for index, i in enumerate(media): 317 | text = self.content_and_no_url 318 | k = { 319 | "caption": text, 320 | "title": text, 321 | "reply_markup": self.button(), 322 | } 323 | 324 | if isinstance(i, Image): 325 | results.append( 326 | InlineQueryResultPhoto( 327 | i.path, 328 | photo_width=300, 329 | photo_height=300, 330 | **k, 331 | ) 332 | ) 333 | elif isinstance(i, Video): 334 | results.append( 335 | InlineQueryResultPhoto( 336 | i.thumb_url 337 | or "https://telegra.ph/file/cdfdb65b83a4b7b2b6078.png", 338 | photo_width=300, 339 | photo_height=300, 340 | id=f"download_{index}", 341 | title=text, 342 | caption=text, 343 | reply_markup=self.button(hide_summary=True), 344 | ) 345 | ) 346 | elif isinstance(i, Ani): 347 | results.append( 348 | InlineQueryResultAnimation(i.path, thumb_url=i.thumb_url, **k) 349 | ) 350 | return await iq.answer(results, cache_time=0) 351 | 352 | def delete(self): 353 | """删除文件""" 354 | if not self.download_result: 355 | return 356 | self.download_result.delete() 357 | 358 | def button( 359 | self, 360 | hide_summary: bool = False, 361 | show_summary_result: bool = False, 362 | summarizing: bool = False, 363 | ) -> Ikm | None: 364 | """ 365 | 按钮 366 | :param hide_summary: 隐藏 AI 总结按钮 367 | :param show_summary_result: 显示 AI 总结结果 368 | :param summarizing: 总结中 369 | :return: 370 | """ 371 | if not self.result.raw_url: 372 | return 373 | button = [] 374 | 375 | raw_url_btn = Ikb("原链接", url=self.result.raw_url) 376 | 377 | if show_summary_result: 378 | ai_summary_btn = Ikb("AI总结✅", callback_data=f"unsummary_{self.hash_url}") 379 | else: 380 | ai_summary_btn = Ikb("AI总结❎", callback_data=f"summary_{self.hash_url}") 381 | 382 | button.append(raw_url_btn) 383 | if bot_cfg.ai_summary and not hide_summary: 384 | if summarizing: 385 | ai_summary_btn = Ikb( 386 | "AI总结中❇️", callback_data=f"summarizing_{self.hash_url}" 387 | ) 388 | button.append(ai_summary_btn) 389 | 390 | return Ikm([button]) 391 | 392 | @property 393 | def hash_url(self): 394 | """网址哈希值""" 395 | return encrypt(self.result.raw_url) 396 | 397 | async def ai_summary(self, cq: CallbackQuery) -> "ParseResultOperate": 398 | """获取 AI 总结""" 399 | 400 | if not (r := self.ai_summary_result): 401 | await cq.edit_message_text( 402 | self.content_and_no_url, 403 | reply_markup=self.button(summarizing=True), 404 | ) 405 | if not self.download_result: 406 | self.download_result = await self.result.download() 407 | try: 408 | r = await self.download_result.summary() 409 | except Exception as e: 410 | await cq.edit_message_text( 411 | self.content_and_no_url, 412 | reply_markup=self.button(), 413 | ) 414 | raise e 415 | self.ai_summary_result = r 416 | 417 | await cq.edit_message_text( 418 | self.f_text(r.content), reply_markup=self.button(show_summary_result=True) 419 | ) 420 | 421 | return self 422 | 423 | async def un_ai_summary(self, cq: CallbackQuery): 424 | """取消 AI 总结""" 425 | 426 | await cq.edit_message_text(self.content_and_no_url, reply_markup=self.button()) 427 | 428 | @property 429 | def content_and_no_url(self) -> str: 430 | return ( 431 | f"[{self.result.title or '无标题'}]({self.telegraph_url})" 432 | if self.telegraph_url 433 | else self.f_text(f"**{self.result.title}**\n\n{self.result.desc}") 434 | ).strip() 435 | 436 | @property 437 | def content_and_url(self) -> str: 438 | text = self.content_and_no_url 439 | return self.f_text( 440 | f"{text}\n\n> 原文链接: [LINK]({self.result.raw_url})" 441 | if self.result.raw_url 442 | else text 443 | ).strip() 444 | 445 | @staticmethod 446 | def f_text(text: str) -> str: 447 | """格式化输出内容, 限制长度, 添加折叠块样式""" 448 | text = text.strip() 449 | if text[1020:]: 450 | text = text[:1000] + "..." 451 | return f"
{text}
" 452 | elif text[500:] or len(text.splitlines()) > 10: 453 | # 超过 500 字或超过 10 行, 则添加折叠块样式 454 | return f"
{text}
" 455 | else: 456 | return text 457 | 458 | 459 | class VideoParseResultOperate(ParseResultOperate): 460 | """视频解析结果操作""" 461 | 462 | async def chat_upload(self, msg: Message) -> Message: 463 | with tempfile.NamedTemporaryFile(delete=False) as temp_file: 464 | if self.result.media.thumb_url: 465 | async with httpx.AsyncClient() as client: 466 | thumb = await client.get(self.result.media.thumb_url) 467 | temp_file.write(thumb.content) 468 | temp = temp_file.name 469 | else: 470 | temp = None 471 | 472 | await msg.reply_chat_action(enums.ChatAction.UPLOAD_VIDEO) 473 | return await msg.reply_video( 474 | self.download_result.media.path, 475 | caption=self.content_and_no_url, 476 | thumb=temp, 477 | quote=True, 478 | reply_markup=self.button(), 479 | ) 480 | 481 | 482 | class ImageParseResultOperate(ParseResultOperate): 483 | """图片解析结果操作""" 484 | 485 | async def _send_ph(self, html_content: str, msg: Message): 486 | page = await Telegraph().create_page( 487 | self.result.title or "无标题", html_content=html_content 488 | ) 489 | self.telegraph_url = page.url 490 | return await msg.reply_text( 491 | self.content_and_no_url, 492 | quote=True, 493 | reply_markup=self.button(), 494 | ) 495 | 496 | async def chat_upload( 497 | self, msg: Message 498 | ) -> Message | list[Message] | list[list[Message]]: 499 | await msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) 500 | 501 | if isinstance(self.result, WXImageParseResult): 502 | return await self._send_ph( 503 | clean_article_html( 504 | markdown( 505 | self.result.wx.markdown_content.replace( 506 | "mmbiz.qpic.cn", "mmbiz.qpic.cn.in" 507 | ) 508 | ) 509 | ), 510 | msg, 511 | ) 512 | elif isinstance(self.result, CoolapkImageParseResult) and ( 513 | markdown_content := self.result.coolapk.markdown_content 514 | ): 515 | return await self._send_ph( 516 | clean_article_html( 517 | markdown( 518 | markdown_content.replace( 519 | "image.coolapk.com", "qpic.cn.in/image.coolapk.com" 520 | ) 521 | ) 522 | ), 523 | msg, 524 | ) 525 | 526 | count = len(self.download_result.media) 527 | text = self.content_and_no_url 528 | if count == 0: 529 | return await msg.reply_text( 530 | text, 531 | quote=True, 532 | disable_web_page_preview=True, 533 | reply_markup=self.button(), 534 | ) 535 | elif count == 1: 536 | return await msg.reply_photo( 537 | self.download_result.media[0].path, 538 | quote=True, 539 | caption=text, 540 | reply_markup=self.button(), 541 | ) 542 | elif count <= 9: 543 | text = self.content_and_no_url 544 | m = await msg.reply_media_group( 545 | [InputMediaPhoto(v.path) for v in self.download_result.media] 546 | ) 547 | await m[0].reply_text( 548 | text, 549 | disable_web_page_preview=True, 550 | reply_markup=self.button(), 551 | quote=True, 552 | ) 553 | return [m] 554 | else: 555 | tasks = [ImgHost().litterbox(i.path) for i in self.download_result.media] 556 | results = await asyncio.gather(*tasks, return_exceptions=True) 557 | results = [ 558 | f'' for i in results if not isinstance(i, Exception) 559 | ] 560 | 561 | await self._send_ph(f"{self.result.desc}

" + "".join(results), msg) 562 | 563 | 564 | class MultimediaParseResultOperate(ParseResultOperate): 565 | """图片视频混合解析结果操作""" 566 | 567 | async def chat_upload( 568 | self, msg: Message 569 | ) -> Message | list[Message] | list[list[Message]]: 570 | await msg.reply_chat_action(enums.ChatAction.UPLOAD_PHOTO) 571 | 572 | count = len(self.download_result.media) 573 | text = self.content_and_no_url 574 | if count == 0: 575 | return await msg.reply_text( 576 | text, 577 | quote=True, 578 | disable_web_page_preview=True, 579 | reply_markup=self.button(), 580 | ) 581 | elif count == 1: 582 | m = self.download_result.media[0] 583 | k = { 584 | "quote": True, 585 | "caption": text, 586 | "reply_markup": self.button(), 587 | } 588 | if isinstance(m, Image): 589 | return await msg.reply_photo(m.path, **k) 590 | elif isinstance(m, Video): 591 | return await msg.reply_video(m.path, **k) 592 | elif isinstance(m, Ani): 593 | return await msg.reply_animation(m.path, **k) 594 | 595 | else: 596 | text = self.content_and_no_url 597 | media = [] 598 | ani_msg = [] 599 | for i, v in enumerate(self.download_result.media): 600 | if isinstance(v, Image): 601 | media.append(InputMediaPhoto(v.path)) 602 | elif isinstance(v, Video): 603 | media.append(InputMediaVideo(v.path)) 604 | elif isinstance(v, Ani): 605 | ani = await msg.reply_animation( 606 | v.path, 607 | quote=True, 608 | caption=text if not i else f"**{i + 1}/{count}**", 609 | ) 610 | ani_msg.append(ani) 611 | m = ani_msg + [ 612 | await msg.reply_media_group(media[i : i + 10], quote=True) 613 | for i in range(0, count, 10) 614 | ] 615 | mm = m[0][0] if isinstance(m[0], list) else m[0] 616 | await mm.reply_text( 617 | text, 618 | disable_web_page_preview=True, 619 | reply_markup=self.button(), 620 | quote=True, 621 | ) 622 | return m 623 | -------------------------------------------------------------------------------- /plugins/ai_summary.py: -------------------------------------------------------------------------------- 1 | from pyrogram import Client, filters 2 | from pyrogram.types import CallbackQuery 3 | from methods.tg_parse_hub import TgParseHub 4 | from parsehub.types import ParseError 5 | 6 | 7 | @Client.on_callback_query(filters.regex("^summary_|unsummary_")) 8 | async def ai_summary(_, cq: CallbackQuery): 9 | hash_url = cq.data.split("_")[1] 10 | ph = TgParseHub() 11 | try: 12 | result = await ph.parse(hash_url) 13 | except ParseError: 14 | # 缓存失效, 重新解析 15 | return 16 | match cq.data.split("_")[0]: 17 | case "summary": 18 | await result.ai_summary(cq) 19 | case "unsummary": 20 | await result.un_ai_summary(cq) 21 | -------------------------------------------------------------------------------- /plugins/inline_parse.py: -------------------------------------------------------------------------------- 1 | from pyrogram import Client 2 | from pyrogram.errors import MessageNotModified 3 | from pyrogram.types import ( 4 | InlineQuery, 5 | InputTextMessageContent, 6 | InlineQueryResultArticle, 7 | ChosenInlineResult, 8 | InputMediaVideo, 9 | ) 10 | 11 | from methods import TgParseHub 12 | from plugins.start import get_supported_platforms 13 | from utiles.filters import platform_filter 14 | from utiles.utile import progress 15 | 16 | 17 | @Client.on_inline_query(~platform_filter) 18 | async def inline_parse_tip(_, iq: InlineQuery): 19 | results = [ 20 | InlineQueryResultArticle( 21 | title="聚合解析", 22 | description="请在聊天框输入链接", 23 | input_message_content=InputTextMessageContent(get_supported_platforms()), 24 | thumb_url="https://i.imgloc.com/2023/06/15/Vbfazk.png", 25 | ) 26 | ] 27 | await iq.answer(results=results, cache_time=1) 28 | 29 | 30 | @Client.on_inline_query(platform_filter) 31 | async def call_inline_parse(_, iq: InlineQuery): 32 | pp = await TgParseHub().parse(iq.query) 33 | await pp.inline_upload(iq) 34 | 35 | 36 | async def callback( 37 | current, total, status: str, client: Client, inline_message_id, pp: TgParseHub 38 | ): 39 | text = progress(current, total, status) 40 | if not text: 41 | return 42 | text = f"{pp.operate.content_and_no_url}\n\n{text}" 43 | try: 44 | await client.edit_inline_text( 45 | inline_message_id, text, reply_markup=pp.operate.button(hide_summary=True) 46 | ) 47 | except MessageNotModified: 48 | ... 49 | 50 | 51 | @Client.on_chosen_inline_result() 52 | async def inline_result_jx(client: Client, cir: ChosenInlineResult): 53 | if not cir.result_id.startswith("download_"): 54 | return 55 | index = int(cir.result_id.split("_")[1]) 56 | imid = cir.inline_message_id 57 | 58 | try: 59 | pp = await TgParseHub().parse(cir.query) 60 | await pp.download( 61 | callback, 62 | (client, imid, pp), 63 | ) 64 | except Exception as e: 65 | await client.edit_inline_text(imid, f"{e}") 66 | raise e 67 | else: 68 | await client.edit_inline_text( 69 | imid, 70 | f"{pp.operate.content_and_no_url}\n\n上 传 中...", 71 | reply_markup=pp.operate.button(hide_summary=True), 72 | ) 73 | await client.edit_inline_media( 74 | imid, 75 | media=InputMediaVideo( 76 | pp.operate.download_result.media[index].path 77 | if isinstance(pp.operate.download_result.media, list) 78 | else pp.operate.download_result.media.path, 79 | caption=pp.operate.content_and_no_url, 80 | ), 81 | reply_markup=pp.operate.button(), 82 | ) 83 | -------------------------------------------------------------------------------- /plugins/parse.py: -------------------------------------------------------------------------------- 1 | from pyrogram import Client, filters 2 | from pyrogram.types import ( 3 | Message, 4 | ) 5 | 6 | from utiles.filters import platform_filter 7 | from methods import TgParseHub 8 | from utiles.utile import progress 9 | 10 | 11 | @Client.on_message(filters.text & platform_filter) 12 | async def call_parse(cli: Client, msg: Message): 13 | try: 14 | tph = TgParseHub() 15 | t = ( 16 | "已有相同任务正在解析, 等待解析完成..." 17 | if await tph.get_parse_task(msg.text or msg.caption) 18 | else "解 析 中..." 19 | ) 20 | r_msg = await msg.reply_text(t) 21 | pp = await tph.parse(msg.text) 22 | await pp.download(callback, (r_msg,)) 23 | except Exception as e: 24 | await msg.reply_text(f"{e}") 25 | raise e 26 | else: 27 | await r_msg.edit_text("上 传 中...") 28 | try: 29 | await pp.chat_upload(cli, msg) 30 | except Exception as e: 31 | await r_msg.edit_text("上传失败") 32 | raise e 33 | await r_msg.delete() 34 | 35 | 36 | async def callback(current, total, status: str, msg: Message): 37 | text = progress(current, total, status) 38 | if not text or msg.text == text: 39 | return 40 | await msg.edit_text(text) 41 | msg.text = text 42 | -------------------------------------------------------------------------------- /plugins/start.py: -------------------------------------------------------------------------------- 1 | from pyrogram import Client, filters 2 | from pyrogram.types import Message 3 | from parsehub import ParseHub 4 | 5 | 6 | @Client.on_message(filters.command(["start", "help"])) 7 | async def start(_, msg: Message): 8 | await msg.reply(get_supported_platforms()) 9 | 10 | 11 | def get_supported_platforms(): 12 | return "**支持的平台:**\n\n" + "\n".join(ParseHub().get_supported_platforms()) 13 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "parsehubbot" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "aiocache>=0.12.3", 9 | "apscheduler==3.10.4", 10 | "dynrender-skia>=0.2.5", 11 | "loguru>=0.6.0,<0.7.0", 12 | "lxml-html-clean>=0.4.1", 13 | "markdown>=3.7", 14 | "parsehub>=1.0.5", 15 | "pydantic==1.10.19", 16 | "pyrogram", 17 | "python-dotenv>=1.0.1", 18 | "telegraph>=2.2.0", 19 | "tgcrypto>=1.2.5", 20 | ] 21 | 22 | [tool.uv.sources] 23 | pyrogram = { url = "https://github.com/KurimuzonAkuma/pyrogram/archive/v2.1.28.zip" } 24 | -------------------------------------------------------------------------------- /utiles/converter.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | # FROM https://github.com/mercuree/html-telegraph-poster/blob/7212225e28a0206803c32e67d1185bbfbd1fc181/html_telegraph_poster/converter.py 3 | import re 4 | from lxml.html.clean import Cleaner 5 | 6 | 7 | allowed_tags = ( 8 | "a", 9 | "aside", 10 | "b", 11 | "blockquote", 12 | "br", 13 | "code", 14 | "em", 15 | "figcaption", 16 | "figure", 17 | "h3", 18 | "h4", 19 | "hr", 20 | "i", 21 | "iframe", 22 | "img", 23 | "li", 24 | "ol", 25 | "p", 26 | "pre", 27 | "s", 28 | "strong", 29 | "u", 30 | "ul", 31 | "video", 32 | ) 33 | telegram_embed_script_re = re.compile( 34 | r"""]+\sdata-telegram-post=['"]([^'"]+))[^<]+""", 35 | re.IGNORECASE, 36 | ) 37 | pre_content_re = re.compile(r"<(pre|code)(>|\s[^>]*>)[\s\S]*?") 38 | line_breaks_inside_pre = re.compile(r"|\s[^<>]*>)") 39 | line_breaks_and_empty_strings = re.compile(r"(\s{2,}|\s*\r?\n\s*)") 40 | header_re = re.compile(r"") 41 | 42 | 43 | def clean_article_html(html_string): 44 | html_string = html_string.replace("", "") 45 | # telegram will convert anyway 46 | html_string = re.sub(r"<(/?)b(?=\s|>)", r"<\1strong", html_string) 47 | html_string = re.sub(r"<(/?)(h2|h5|h6)", r"<\1h4", html_string) 48 | # convert telegram embed posts before cleaner 49 | html_string = re.sub( 50 | telegram_embed_script_re, 51 | r'', 52 | html_string, 53 | ) 54 | # remove if present (can't do this with Cleaner) 55 | html_string = header_re.sub("", html_string) 56 | 57 | c = Cleaner( 58 | allow_tags=allowed_tags, 59 | style=True, 60 | remove_unknown_tags=False, 61 | embedded=False, 62 | safe_attrs_only=True, 63 | safe_attrs=("src", "href", "class"), 64 | ) 65 | # wrap with div to be sure it is there 66 | # (otherwise lxml will add parent element in some cases 67 | html_string = "
%s
" % html_string 68 | cleaned = c.clean_html(html_string) 69 | # remove wrapped div 70 | cleaned = cleaned[5:-6] 71 | # remove all line breaks and empty strings 72 | html_string = replace_line_breaks_except_pre(cleaned) 73 | # but replace multiple br tags with one line break, telegraph will convert it to
74 | html_string = re.sub(r"(|\s[^<>]*>)\s*)+", "\n", html_string) 75 | 76 | return html_string.strip(" \t") 77 | 78 | 79 | def replace_line_breaks_except_pre(html_string, replace_by=" "): 80 | # Remove all line breaks and empty strings, except pre tag 81 | # how to make it in one string? :\ 82 | pre_ranges = [0] 83 | out = "" 84 | 85 | # replace non-breaking space with usual space 86 | html_string = html_string.replace("\u00a0", " ") 87 | 88 | # get
 start/end postion
 89 |     for x in pre_content_re.finditer(html_string):
 90 |         start, end = x.start(), x.end()
 91 |         pre_ranges.extend((start, end))
 92 |     pre_ranges.append(len(html_string))
 93 | 
 94 |     # all odd elements are 
, leave them untouched
 95 |     for k in range(1, len(pre_ranges)):
 96 |         part = html_string[pre_ranges[k - 1] : pre_ranges[k]]
 97 |         if k % 2 == 0:
 98 |             out += line_breaks_inside_pre.sub("\n", part)
 99 |         else:
100 |             out += line_breaks_and_empty_strings.sub(replace_by, part)
101 |     return out
102 | 


--------------------------------------------------------------------------------
/utiles/filters.py:
--------------------------------------------------------------------------------
 1 | from pyrogram import filters
 2 | from pyrogram.types import Message, InlineQuery
 3 | 
 4 | from parsehub import ParseHub
 5 | 
 6 | 
 7 | async def _platform_filter(_, __, update: Message | InlineQuery):
 8 |     if isinstance(update, Message):
 9 |         t = update.caption or update.text
10 |     else:
11 |         t = update.query
12 |     return bool(ParseHub()._select_parser(t))
13 | 
14 | 
15 | platform_filter = filters.create(_platform_filter)
16 | 
17 | 
18 | def filter_inline_query_results(command: str):
19 |     """
20 |     过滤指定字符开头的内联查询结果
21 | 
22 |     :param command:
23 |     :return:
24 |     """
25 | 
26 |     async def func(_, __, update):
27 |         return update.query.startswith(command)
28 | 
29 |     return filters.create(func, commands=command)
30 | 


--------------------------------------------------------------------------------
/utiles/img_host.py:
--------------------------------------------------------------------------------
 1 | import httpx
 2 | from httpx._types import ProxiesTypes
 3 | from tenacity import retry, stop_after_attempt
 4 | 
 5 | 
 6 | class ImgHost:
 7 |     def __init__(self, proxies: ProxiesTypes = None):
 8 |         self.async_client = httpx.AsyncClient(proxies=proxies)
 9 | 
10 |     @retry(stop=stop_after_attempt(5))
11 |     async def litterbox(self, filename: str):
12 |         host_url = "https://litterbox.catbox.moe/resources/internals/api.php"
13 | 
14 |         file = open(filename, "rb")
15 |         try:
16 |             data = {
17 |                 "reqtype": "fileupload",
18 |                 "time": "1h",
19 |             }
20 |             response = await self.async_client.post(
21 |                 host_url, data=data, files={"fileToUpload": file}
22 |             )
23 |         finally:
24 |             file.close()
25 | 
26 |         return response.text
27 | 
28 |     def __aexit__(self, exc_type, exc_val, exc_tb):
29 |         self.async_client.aclose()
30 | 


--------------------------------------------------------------------------------
/utiles/ph.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | 
 3 | from telegraph.aio import Telegraph as TelegraphAPI
 4 | 
 5 | 
 6 | class Telegraph:
 7 |     """Telegraph API 封装"""
 8 | 
 9 |     def __init__(self, token: str = None, domain: str = "telegra.ph"):
10 |         self.token = token
11 |         self.domain = domain
12 |         self.telegraph = TelegraphAPI(access_token=token, domain=domain)
13 | 
14 |     async def create_account(
15 |         self, short_name: str, author_name: str = None, author_url: str = None
16 |     ) -> "TelegraphAccount":
17 |         """创建 Telegraph 账户"""
18 |         account = await self.telegraph.create_account(
19 |             short_name, author_name, author_url
20 |         )
21 |         acc_info = await self.get_account_info(account)
22 |         self.token = acc_info.access_token
23 |         return acc_info
24 | 
25 |     async def get_account_info(self, account_info: dict = None) -> "TelegraphAccount":
26 |         """获取 Telegraph 账户信息"""
27 |         account_info = account_info or await self.telegraph.get_account_info(
28 |             [
29 |                 "short_name",
30 |                 "author_name",
31 |                 "author_url",
32 |                 "auth_url",
33 |             ]
34 |         )
35 |         return TelegraphAccount(
36 |             self.telegraph.get_access_token(),
37 |             account_info["short_name"],
38 |             account_info["author_name"],
39 |             account_info["author_url"],
40 |             account_info["auth_url"],
41 |         )
42 | 
43 |     async def create_page(
44 |         self,
45 |         title,
46 |         content=None,
47 |         html_content=None,
48 |         author_name=None,
49 |         author_url=None,
50 |         return_content=False,
51 |         auto_create_account=True,
52 |     ) -> "TelegraphPage":
53 |         """创建 Telegraph 页面"""
54 |         if auto_create_account and not self.token:
55 |             # 随机用户名
56 |             short_name = "tg_" + str(
57 |                 int(100000 * (1 + 0.5 * (1 + 0.5 * (1 + 0.5 * 1))))
58 |             )
59 |             await self.create_account(short_name)
60 |         response = await self.telegraph.create_page(
61 |             title,
62 |             content,
63 |             html_content,
64 |             author_name,
65 |             author_url,
66 |             return_content,
67 |         )
68 |         return TelegraphPage(
69 |             response["path"],
70 |             response["url"],
71 |             response["title"],
72 |             response["description"],
73 |             response["views"],
74 |             response["can_edit"],
75 |             await self.get_account_info(),
76 |         )
77 | 
78 | 
79 | @dataclass
80 | class TelegraphAccount:
81 |     access_token: str
82 |     short_name: str
83 |     author_name: str
84 |     author_url: str
85 |     auth_url: str
86 | 
87 | 
88 | @dataclass
89 | class TelegraphPage:
90 |     path: str
91 |     url: str
92 |     title: str
93 |     description: str
94 |     views: int
95 |     can_edit: bool
96 |     account: TelegraphAccount
97 | 


--------------------------------------------------------------------------------
/utiles/utile.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import hashlib
 3 | 
 4 | from pyrogram import Client
 5 | 
 6 | 
 7 | async def schedule_delete_messages(
 8 |     client: Client, chat_id: int, message_ids: int | list, delay_seconds: int = 2
 9 | ):
10 |     """定时删除消息"""
11 | 
12 |     await asyncio.sleep(delay_seconds)
13 | 
14 |     try:
15 |         await client.delete_messages(chat_id, message_ids)
16 |     except Exception:
17 |         ...
18 | 
19 | 
20 | def progress(current, total, status):
21 |     if total == 0:
22 |         return status
23 | 
24 |     text = None
25 |     if total >= 100:
26 |         if round(current * 100 / total, 1) % 25 == 0:
27 |             text = f"下 载 中...|{status}"
28 |     else:
29 |         if (current + 1) % 3 == 0 or (current + 1) == total:
30 |             text = f"下 载 中...|{status}"
31 |     return text
32 | 
33 | 
34 | def encrypt(text: str):
35 |     """hash加密"""
36 |     md5 = hashlib.md5()
37 |     md5.update(text.encode("utf-8"))
38 |     return md5.hexdigest()
39 | 


--------------------------------------------------------------------------------