├── README.md
├── img
│   ├── img1.jpg
│   └── img2.jpg
├── nonebot_plugin_chatpdf
│   ├── __init__.py
│   ├── config.py
│   └── core.py
└── pyproject.toml

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
<div align="center">

<!-- NoneBotPluginLogo / NoneBotPluginText banner images -->

# nonebot-plugin-chatpdf

</div>

# Introduction

- This plugin is inspired by the recently popular [chatpdf](https://www.chatpdf.com).
- Upload the paper or reading material you want analyzed to the group files; the bot stores and analyzes it, and you can then ask it about the article's content, its summary, your thoughts on the text, and so on.
- The core code is adapted from the [Document_QA](https://github.com/fierceX/Document_QA) project.
- The plugin can use either the official OpenAI API or a OneAPI-format third-party relay; under strict rate limits it may fail to respond.

# Demo

For usage, go by the latest instructions in this README; the screenshot below is for illustration.

![Alt](./img/img2.jpg)

# Installation

* Manual installation
```
git clone https://github.com/Alpaca4610/nonebot_plugin_chatpdf.git
```

After downloading, register the plugin manually in the bot project's pyproject.toml:

```
plugin_dirs = ["xxxxxx","xxxxxx",......,"path-of-the-downloaded-plugin/nonebot-plugin-chatpdf"]
```
* Via pip
```
pip install nonebot-plugin-chatpdf
```
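
If the bot project is managed with nb-cli, a plugin installed via pip also has to be registered before NoneBot will load it. The snippet below is a minimal sketch of the corresponding entry in the bot's pyproject.toml; the `[tool.nonebot]` table and the module name follow the standard NoneBot project layout, so adjust them to your own setup:

```
[tool.nonebot]
plugins = ["nonebot_plugin_chatpdf"]
```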

# Configuration

Append the following to the .env file in the bot's root directory:

```
oneapi_key = "sk-xxxxxxxxxx"  # (required) key from OpenAI or from a relay provider that supports the OneAPI format
oneapi_url = "https://xxxxxxxxx"  # (optional) base URL of the relay provider; not needed when using the official OpenAI service
oneapi_model = "gpt-4"  # (optional) language model to use
```


# Usage

If a global command prefix is configured for NoneBot, prepend it to the commands below.
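
For reference, that prefix comes from NoneBot's `COMMAND_START` setting; the lines below are a minimal .env sketch assuming stock NoneBot2 behaviour:

```
# with this setting the commands must be sent as /分析pdf, /askpdf, ...
COMMAND_START=["/"]
# an empty entry allows the bare commands listed below
# COMMAND_START=[""]
```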

### Usage: upload the PDF to be analyzed to the group files

- 分析pdf (start the chatpdf article-analysis flow; the bot then waits for a PDF upload)
- Within one minute, upload the PDF to be analyzed to the group files; a success message is sent once the analysis finishes
- askpdf (after the analysis is done, follow this command with a question about the article and the bot will answer it)
- 删除所有pdf (delete all caches)
- 删除我的pdf (delete the caller's cache in this group)


# Notes

- Every call to the `分析pdf` command clears the caller's previous analysis cache

--------------------------------------------------------------------------------
/img/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alpaca4610/nonebot_plugin_chatpdf/bb317be062b434fa185af38c426f7fe4d9b71362/img/img1.jpg
--------------------------------------------------------------------------------
/img/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alpaca4610/nonebot_plugin_chatpdf/bb317be062b434fa185af38c426f7fe4d9b71362/img/img2.jpg
--------------------------------------------------------------------------------
/nonebot_plugin_chatpdf/__init__.py:
--------------------------------------------------------------------------------
import os
import shutil
from datetime import timedelta

from nonebot import on_command, on_notice
from nonebot.adapters.onebot.v11 import Message, MessageSegment
from nonebot.adapters.onebot.v11 import (
    MessageEvent,
    GroupMessageEvent,
    GroupUploadNoticeEvent,
)
from nonebot.params import CommandArg


from .core import QA


# session_id -> QA instance ("" while no finished analysis is available)
data = {}


def delete_file(dir):
    if os.path.exists(dir):
        shutil.rmtree(dir)


pdf_request = on_command("分析pdf", block=True, priority=1)


@pdf_request.handle()
async def _(event: GroupMessageEvent):
    # Temporary notice matcher that waits up to one minute for a group file upload
    pdf_file_request = on_notice(temp=True, expire_time=timedelta(minutes=1))
    session_id = event.get_session_id()

    data[session_id] = ""

    @pdf_file_request.handle()
    async def _(event: GroupUploadNoticeEvent):
        await pdf_request.send(
            MessageSegment.text("分析中,请稍等......"), at_sender=True
        )

        data[session_id] = await QA.create(session_id, event.file.url)

        await pdf_request.finish(
            MessageSegment.text("现在,你可以向我提问有关于该文章的问题了"),
            at_sender=True,
        )

    await pdf_request.finish(
        MessageSegment.text("请上传需要分析的pdf文件"), at_sender=True
    )


pdf_chat_request = on_command("askpdf", block=True, priority=1)


@pdf_chat_request.handle()
async def _(event: GroupMessageEvent, msg: Message = CommandArg()):
    # An empty value means the analysis has not finished yet or was deleted
    if not data.get(event.get_session_id()):
        await pdf_chat_request.finish(MessageSegment.text("请先使用分析pdf命令开始!"))
    question = msg.extract_plain_text()
    res = await data[event.get_session_id()].get_ans(question)
    await pdf_chat_request.finish(
        MessageSegment.text(res),
        at_sender=True,
    )


delete_request = on_command("删除所有pdf", block=True, priority=1)


@delete_request.handle()
async def _():
    # Clear the shared cache in place instead of rebinding a local name
    data.clear()
    await delete_request.finish(
        MessageSegment.text("全部删除文件缓存成功!"), at_sender=True
    )


delete_user_request = on_command("删除我的pdf", block=True, priority=1)


@delete_user_request.handle()
async def _(event: MessageEvent):
    # Drop only this session's cached analysis
    data.pop(event.get_session_id(), None)
    await delete_user_request.finish(
        MessageSegment.text("成功删除你在该群的缓存文件!"), at_sender=True
    )
--------------------------------------------------------------------------------
/nonebot_plugin_chatpdf/config.py:
--------------------------------------------------------------------------------
from pydantic import Extra, BaseModel
from typing import Optional


class Config(BaseModel, extra=Extra.ignore):
    oneapi_url: Optional[str] = ""
    oneapi_key: Optional[str] = ""
    oneapi_model: Optional[str] = "gpt-4o"


class ConfigError(Exception):
    pass
--------------------------------------------------------------------------------
/nonebot_plugin_chatpdf/core.py:
--------------------------------------------------------------------------------
import io
# import pickle
import nonebot
import faiss
import numpy as np
import requests
import fitz

from tqdm import tqdm
from openai import AsyncOpenAI
from .config import Config

plugin_config = Config.parse_obj(nonebot.get_driver().config.dict())

if plugin_config.oneapi_url:
    client = AsyncOpenAI(
        api_key=plugin_config.oneapi_key, base_url=plugin_config.oneapi_url
    )
else:
    client = AsyncOpenAI(api_key=plugin_config.oneapi_key)

# if plugin_config.openai_http_proxy:
#     openai.proxy = {'http': plugin_config.openai_http_proxy, 'https': plugin_config.openai_http_proxy}


def read_pdf(url):
    response = requests.get(url)
    stream = io.BytesIO(response.content)

    # Read the PDF with PyMuPDF
    with fitz.open(stream=stream) as pdf:
        content = ""
        for page in pdf:
            # Extract the text of each page
            texts = page.get_text()
            # Append it to the overall text
            content += texts

    return content


async def create_embeddings(input):
    """Create embeddings for the provided input."""
    result = []
    # Batch the inputs so each request stays below roughly 4096 characters
    lens = [len(text) for text in input]
    query_len = 0
    start_index = 0
    tokens = 0

    async def get_embedding(input_slice):
        embedding = await client.embeddings.create(
            model="text-embedding-ada-002", input=input_slice
        )
        return [
            (text, data.embedding) for text, data in zip(input_slice, embedding.data)
        ], embedding.usage.total_tokens

    for index, l in tqdm(enumerate(lens)):
        query_len += l
        if query_len > 4096:
            ebd, tk = await get_embedding(input[start_index : index + 1])
            query_len = 0
            start_index = index + 1
            tokens += tk
            result.extend(ebd)

    if query_len > 0:
        ebd, tk = await get_embedding(input[start_index:])
        tokens += tk
        result.extend(ebd)
    return result, tokens


async def create_embedding(text):
    """Create an embedding for the provided text."""
    embedding = await client.embeddings.create(
        model="text-embedding-ada-002", input=text
    )
    return text, embedding.data[0].embedding


class QA:
    def __init__(self, index, data, tokens) -> None:
        self.index = index
        self.data = data
        self.tokens = tokens

    @classmethod
    async def create(cls, event_id, url):
        # read_pdf is a synchronous helper
        texts = read_pdf(url)
        str_buf = io.StringIO(texts)
        lines = str_buf.readlines()
        texts = [line.strip() for line in lines if line.strip()]

        data_embe, tokens = await create_embeddings(texts)

        # # Persist the embeddings to disk
        # pickle.dump(data_embe, open(tmpfile, 'wb'))

        # print("文本消耗 {} tokens".format(tokens))

        # text-embedding-ada-002 returns 1536-dimensional vectors
        d = 1536
        index = faiss.IndexFlatL2(d)
        embe = np.array([emm[1] for emm in data_embe])
        data = [emm[0] for emm in data_embe]
        index.add(embe)
        return cls(index, data, tokens)

    async def generate_ans(self, query):
        embedding = await create_embedding(query)
        context = self.get_texts(embedding[1], limit=10)
        answer = await self.completion(query, context)
        return answer, context

    def get_texts(self, embeding, limit):
        # Nearest-neighbour search in the FAISS index; each hit also pulls in a few
        # following lines to give the model some surrounding context
        _, text_index = self.index.search(np.array([embeding]), limit)
        context = []
        for i in list(text_index[0]):
            context.extend(self.data[i : i + 5])
        # context = [self.data[i] for i in list(text_index[0])]
        return context

    async def completion(self, query, context):
        """Create a completion."""
        lens = [len(text) for text in context]

        # Truncate the retrieved context to roughly 3000 characters
        maximum = 3000
        for index, l in enumerate(lens):
            maximum -= l
            if maximum < 0:
                context = context[: index + 1]
                # print("超过最大长度,截断到前", index + 1, "个片段")
                break

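        # The numbered snippets below are joined into the system prompt, and the
        # model is instructed to answer only from this retrieved context.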
        text = "\n".join(f"{index}. {text}" for index, text in enumerate(context))
        response = await client.chat.completions.create(
            model=plugin_config.oneapi_model,
            messages=[
                {
                    "role": "system",
                    "content": f"你是一个有帮助的AI文章助手,从下文中提取有用的内容进行回答,不能回答不在下文提到的内容,相关性从高到低排序:\n\n{text}",
                },
                {"role": "user", "content": query},
            ],
        )
        # print("使用的tokens:", response.usage.total_tokens)
        return response.choices[0].message.content

    async def get_ans(self, query):
        answer, context = await self.generate_ans(query)
        # print("回答如下\n\n")
        # print(answer.strip())
        return answer
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[tool.poetry]
name = "nonebot-plugin-chatpdf"
version = "1.0.0"
description = "A nonebot plugin for chatpdf"
authors = ["Alpaca "]
license = "MIT"
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.8"
nonebot2 = "^2.0.0rc3"
nonebot-adapter-onebot = "^2.2.1"
openai = "^1.30.1"
numpy = "^1.24.2"
faiss-cpu = "^1.7.3"
pymupdf = "^1.21.1"
# used directly by core.py (PDF download and embedding progress bar)
requests = "*"
tqdm = "*"

[[tool.poetry.source]]
name = "tsinghua"
url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/"
default = true


[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
--------------------------------------------------------------------------------