├── README.md
├── img
├── img1.jpg
└── img2.jpg
├── nonebot_plugin_chatpdf
├── __init__.py
├── config.py
└── core.py
└── pyproject.toml
/README.md:
--------------------------------------------------------------------------------
1 |
2 |

3 |
4 |

5 |
6 |
7 |
8 |
9 | # nonebot-plugin-chatpdf
10 |
11 |
12 |
13 | # 介绍
14 |
15 | - 本插件灵感来源于最近很火的 [chatpdf](https://www.chatpdf.com)。
16 | - 将需要分析的论文/阅读材料上传到群文件,机器人可以对其进行存储分析,然后你可以向其提问有关文章内容、文章概要、对于文章的思考等问题
17 | - 本插件参考和使用了项目 [Document_QA](https://github.com/fierceX/Document_QA) 中的核心代码
18 | - 本插件可选使用OneAPI格式的第三方中转站也可以使用OpenAI官方接口,但是在速率限制的情况下本插件可能无法使用。
19 |
20 | # 效果
21 | 使用方法以最新说明为主
22 |
23 | 
24 |
25 | # 安装
26 |
27 | * 手动安装
28 | ```
29 | git clone https://github.com/Alpaca4610/nonebot_plugin_chatpdf.git
30 | ```
31 |
32 | 下载完成后在bot项目的pyproject.toml文件手动添加插件:
33 |
34 | ```
35 | plugin_dirs = ["xxxxxx","xxxxxx",......,"下载完成的插件路径/nonebot_plugin_chatpdf"]
36 | ```
37 | * 使用 pip
38 | ```
39 | pip install nonebot-plugin-chatpdf
40 | ```
41 |
42 | # 配置文件
43 |
44 | 在Bot根目录下的.env文件中追加如下内容:
45 |
46 | ```
47 | oneapi_key = "sk-xxxxxxxxxx" # (必填)OpenAI官方或者是支持OneAPI的大模型中转服务商提供的KEY
48 | oneapi_url = "https://xxxxxxxxx" # (可选)大模型中转服务商提供的中转地址,使用OpenAI官方服务不需要填写
49 | oneapi_model = "gpt-4" # (可选)使用的语言大模型
50 | ```
51 |
52 |
53 | # 使用方法
54 |
55 | 如果设置了nonebot全局触发前缀,需要在下面的命令前加上设置的前缀。
56 |
57 | ### 使用方式:上传需要分析的pdf文件到群文件中
58 |
59 | - 分析pdf (使用该命令以上传pdf文件的方式启动chatpdf文章分析功能)
60 | - 在一分钟内,上传需要分析的pdf文件到群文件中,分析完成后会返回成功信息
61 | - askpdf (文章分析完成后,使用该命令后面接需要提问的关于文章的问题,机器人会给出答案)
62 | - 删除所有pdf (删除所有缓存)
63 | - 删除我的pdf (删除用户在本群的缓存)
64 |
65 |
66 | # 注意事项
67 |
68 | - 每次调用```分析pdf```命令时,都会清除调用者以前的分析缓存
69 |
70 |
--------------------------------------------------------------------------------
/img/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alpaca4610/nonebot_plugin_chatpdf/bb317be062b434fa185af38c426f7fe4d9b71362/img/img1.jpg
--------------------------------------------------------------------------------
/img/img2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Alpaca4610/nonebot_plugin_chatpdf/bb317be062b434fa185af38c426f7fe4d9b71362/img/img2.jpg
--------------------------------------------------------------------------------
/nonebot_plugin_chatpdf/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | from datetime import timedelta
4 |
5 | from nonebot import on_command, on_notice
6 | from nonebot.adapters.onebot.v11 import Message, MessageSegment
7 | from nonebot.adapters.onebot.v11 import (
8 | MessageEvent,
9 | GroupMessageEvent,
10 | GroupUploadNoticeEvent,
11 | )
12 | from nonebot.params import CommandArg
13 |
14 |
15 | from .core import QA
16 |
17 |
# Module-level session cache: maps a session id ("group_<gid>_<uid>") to a
# QA instance, or to the "" placeholder while no analysis has finished yet.
data = {}
19 |
20 |
def delete_file(dir):
    """Recursively remove the directory *dir* and all of its contents.

    Silently does nothing when the path does not exist.
    """
    if not os.path.exists(dir):
        return
    shutil.rmtree(dir)
24 |
25 |
# Entry command: opens a one-minute window in which the next group file
# upload is captured and analysed.
pdf_request = on_command("分析pdf", block=True, priority=1)


@pdf_request.handle()
async def _(event: GroupMessageEvent):
    """Begin a chatpdf session for the calling user.

    Registers a temporary notice matcher (expires after one minute) that
    waits for the next group upload notice, builds a QA index from the
    uploaded file, and stores it in the module-level ``data`` cache.
    """
    # NOTE(review): a fresh temp matcher is registered on every invocation
    # and it reacts to ANY group upload notice, not only uploads from this
    # user/group — concurrent sessions may cross-wire. TODO confirm intent.
    pdf_file_request = on_notice(temp=True, expire_time=timedelta(minutes=1))
    # `id` shadows the builtin of the same name; it is the per-user,
    # per-group session key used for the cache.
    id = event.get_session_id()

    # Clear any previous analysis for this session ("" = pending placeholder).
    data[id] = ""

    @pdf_file_request.handle()
    async def _(event: GroupUploadNoticeEvent):
        await pdf_request.send(
            MessageSegment.text("分析中,请稍等......"), at_sender=True
        )

        # Download the uploaded file and build the embedding index for it.
        data[id] = await QA.create(id, event.file.url)

        await pdf_request.finish(
            MessageSegment.text("现在,你可以向我提问有关于该文章的问题了"),
            at_sender=True,
        )

    await pdf_request.finish(
        MessageSegment.text("请上传需要分析的pdf文件"), at_sender=True
    )
52 |
53 |
# Command: ask a question about the previously analysed PDF.
pdf_chat_request = on_command("askpdf", block=True, priority=1)


@pdf_chat_request.handle()
async def _(event: GroupMessageEvent, msg: Message = CommandArg()):
    """Answer a question about the caller's analysed PDF.

    Looks up the caller's QA instance in the module-level ``data`` cache
    and forwards the question to it.
    """
    session_id = event.get_session_id()
    # BUG FIX: the original checked only membership, so the "" placeholder
    # (set before analysis finishes, or by 删除我的pdf) crashed on .get_ans;
    # it also referenced a non-existent "/start" command in the prompt.
    if not data.get(session_id):
        await pdf_chat_request.finish(
            MessageSegment.text("请先使用 分析pdf 命令上传并分析文件!")
        )
    question = msg.extract_plain_text()
    res = await data[session_id].get_ans(question)
    # BUG FIX: the original called finish() on the wrong matcher
    # (pdf_request instead of pdf_chat_request).
    await pdf_chat_request.finish(
        MessageSegment.text(res),
        at_sender=True,
    )
67 |
68 |
# Command: drop every cached QA instance for all users and groups.
delete_request = on_command("删除所有pdf", block=True, priority=1)


@delete_request.handle()
async def _():
    """Clear the entire analysis cache.

    BUG FIX: the original ``data = {}`` only rebound a function-local name
    and left the module-level cache untouched; clear the shared dict
    in place instead.
    """
    data.clear()
    await delete_request.finish(
        MessageSegment.text("全部删除文件缓存成功!"), at_sender=True
    )
78 |
79 |
# Command: drop only the calling user's cached QA instance for this group.
delete_user_request = on_command("删除我的pdf", block=True, priority=1)


@delete_user_request.handle()
async def _(event: MessageEvent):
    """Remove the caller's cache entry for the current session.

    BUG FIX: the original stored an empty-string placeholder, which still
    passed the ``in data`` membership check in the askpdf handler and then
    crashed calling ``.get_ans`` on a str; remove the entry entirely.
    """
    data.pop(event.get_session_id(), None)
    await delete_user_request.finish(
        MessageSegment.text("成功删除你在该群的缓存文件!"), at_sender=True
    )
89 |
--------------------------------------------------------------------------------
/nonebot_plugin_chatpdf/config.py:
--------------------------------------------------------------------------------
1 | from pydantic import Extra, BaseModel
2 | from typing import Optional
3 |
4 |
class Config(BaseModel, extra=Extra.ignore):
    """Plugin settings read from NoneBot's global .env configuration.

    Unknown keys in the environment are ignored (``Extra.ignore``).
    """

    # Base URL of a OneAPI-compatible relay; empty means use the official
    # OpenAI endpoint.
    oneapi_url: Optional[str] = ""
    # API key for OpenAI or the relay (required in practice, despite the
    # "" default).
    oneapi_key: Optional[str] = ""
    # Chat model used when answering questions.
    oneapi_model: Optional[str] = "gpt-4o"
9 |
10 |
class ConfigError(Exception):
    """Raised when the plugin's configuration is missing or invalid."""
13 |
--------------------------------------------------------------------------------
/nonebot_plugin_chatpdf/core.py:
--------------------------------------------------------------------------------
1 | import io
2 | # import pickle
3 | import nonebot
4 | import faiss
5 | import numpy as np
6 | import requests
7 | import fitz
8 |
9 | from tqdm import tqdm
10 | from openai import AsyncOpenAI
11 | from .config import Config
12 |
# Build the plugin configuration from NoneBot's global settings (.env-driven).
plugin_config = Config.parse_obj(nonebot.get_driver().config.dict())

# Use a OneAPI-compatible relay endpoint when one is configured; otherwise
# talk to the official OpenAI API.
if plugin_config.oneapi_url:
    client = AsyncOpenAI(
        api_key=plugin_config.oneapi_key, base_url=plugin_config.oneapi_url
    )
else:
    client = AsyncOpenAI(api_key=plugin_config.oneapi_key)

# if plugin_config.openai_http_proxy:
#     openai.proxy = {'http': plugin_config.openai_http_proxy, 'https': plugin_config.openai_http_proxy}
24 |
25 |
def read_pdf(url):
    """Download the PDF at *url* and return its full plain-text content.

    Raises:
        requests.HTTPError: if the download fails with an HTTP error status.
        requests.Timeout: if the server does not respond within 60 seconds.
    """
    # BUG FIX: the original had no timeout (could hang the handler forever)
    # and no status check (an HTML error page would be fed to the parser).
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    stream = io.BytesIO(response.content)

    # Parse with PyMuPDF; filetype is stated explicitly because an in-memory
    # stream carries no filename extension to sniff from.
    with fitz.open(stream=stream, filetype="pdf") as pdf:
        content = ""
        for page in pdf:
            # Append each page's extracted text to the overall content.
            content += page.get_text()

    return content
40 |
41 |
async def create_embeddings(input):
    """Embed a list of text snippets, batching requests by total length.

    Returns a tuple ``(result, tokens)`` where ``result`` is a list of
    ``(text, embedding)`` pairs in input order and ``tokens`` is the total
    token count billed by the embeddings endpoint.
    """
    result = []
    # Batching uses character count as a cheap proxy for tokens — the 4096
    # threshold below is a heuristic, not a real tokenizer limit.
    lens = [len(text) for text in input]
    query_len = 0
    start_index = 0
    tokens = 0

    async def get_embedding(input_slice):
        # One embeddings API call for a slice of the input texts; returns
        # the (text, vector) pairs plus the tokens consumed by the call.
        embedding = await client.embeddings.create(
            model="text-embedding-ada-002", input=input_slice
        )
        return [
            (text, data.embedding) for text, data in zip(input_slice, embedding.data)
        ], embedding.usage.total_tokens

    for index, l in tqdm(enumerate(lens)):
        query_len += l
        if query_len > 4096:
            # Flush the batch including the element that crossed the
            # threshold, so a batch may slightly exceed 4096 characters.
            ebd, tk = await get_embedding(input[start_index : index + 1])
            query_len = 0
            start_index = index + 1
            tokens += tk
            result.extend(ebd)

    # Flush the final partial batch, if any.
    if query_len > 0:
        ebd, tk = await get_embedding(input[start_index:])
        tokens += tk
        result.extend(ebd)
    return result, tokens
73 |
74 |
async def create_embedding(text):
    """Embed a single piece of text.

    Returns a ``(text, embedding_vector)`` pair.
    """
    response = await client.embeddings.create(
        model="text-embedding-ada-002", input=text
    )
    vector = response.data[0].embedding
    return text, vector
81 |
82 |
class QA:
    """Question answering over one analysed PDF.

    Holds a FAISS L2 index of snippet embeddings plus the aligned snippet
    texts, and answers queries by retrieving nearby snippets and feeding
    them as context to a chat-completion call.
    """

    def __init__(self, index, data, tokens) -> None:
        self.index = index    # faiss.IndexFlatL2 over the snippet embeddings
        self.data = data      # snippet texts, row-aligned with the index
        self.tokens = tokens  # embedding tokens spent building the index

    @classmethod
    async def create(cls, event_id, url):
        """Build a QA instance from the PDF at *url*.

        NOTE(review): ``event_id`` is unused — presumably intended as a
        cache key; confirm before removing.
        """
        # read_pdf is a synchronous function (blocks on download + parse)
        texts = read_pdf(url)
        str_buf = io.StringIO(texts)
        lines = str_buf.readlines()
        # Drop blank lines; each remaining stripped line becomes a snippet.
        texts = [line.strip() for line in lines if line.strip()]

        data_embe, tokens = await create_embeddings(texts)

        # # persist embeddings to disk
        # pickle.dump(data_embe, open(tmpfile, 'wb'))

        # print("text consumed {} tokens".format(tokens))

        # 1536 is the output dimension of text-embedding-ada-002.
        d = 1536
        index = faiss.IndexFlatL2(d)
        embe = np.array([emm[1] for emm in data_embe])
        data = [emm[0] for emm in data_embe]
        index.add(embe)
        return cls(index, data, tokens)

    async def generate_ans(self, query):
        """Return ``(answer, context)`` for *query*."""
        embedding = await create_embedding(query)
        context = self.get_texts(embedding[1], limit=10)
        answer = await self.completion(query, context)
        return answer, context

    def get_texts(self, embeding, limit):
        """Return snippets near *embeding*: each of the *limit* nearest
        hits plus its four following snippets.

        NOTE(review): the 5-snippet windows may overlap, so the returned
        context can contain duplicates.
        """
        _, text_index = self.index.search(np.array([embeding]), limit)
        context = []
        for i in list(text_index[0]):
            context.extend(self.data[i : i + 5])
        # context = [self.data[i] for i in list(text_index[0])]
        return context

    async def completion(self, query, context):
        """Create a completion for *query* grounded in *context* snippets."""
        lens = [len(text) for text in context]

        # Truncate context to roughly 3000 characters. NOTE(review): the
        # slice keeps the segment that crossed the budget, so the result can
        # exceed 3000 characters — confirm whether that is intended.
        maximum = 3000
        for index, l in enumerate(lens):
            maximum -= l
            if maximum < 0:
                context = context[: index + 1]
                # print("over budget, truncated to first", index + 1, "segments")
                break

        # Number the snippets and embed them in the system prompt.
        text = "\n".join(f"{index}. {text}" for index, text in enumerate(context))
        response = await client.chat.completions.create(
            model=plugin_config.oneapi_model,
            messages=[
                {
                    "role": "system",
                    "content": f"你是一个有帮助的AI文章助手,从下文中提取有用的内容进行回答,不能回答不在下文提到的内容,相关性从高到底排序:\n\n{text}",
                },
                {"role": "user", "content": query},
            ],
        )
        # print("tokens used:", response.usage.total_tokens)
        return response.choices[0].message.content

    async def get_ans(self, query):
        """Return just the answer text for *query* (context is discarded)."""
        answer, context = await self.generate_ans(query)
        # print("answer:\n\n")
        # print(answer.strip())
        return answer
156 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "nonebot-plugin-chatpdf"
3 | version = "1.0.0"
4 | description = "A nonebot plugin for chatpdf"
5 | authors = ["Alpaca "]
6 | license = "MIT"
7 | readme = "README.md"
8 |
9 | [tool.poetry.dependencies]
10 | python = "^3.8"
11 | nonebot2 = "^2.0.0rc3"
12 | nonebot-adapter-onebot = "^2.2.1"
13 | openai = "^1.30.1"
14 | numpy = "^1.24.2"
15 | faiss-cpu = "^1.7.3"
16 | pymupdf = "^1.21.1"
17 |
18 | [[tool.poetry.source]]
19 | name = "tsinghua"
20 | url = "https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple/"
21 | default = true
22 |
23 |
24 | [build-system]
25 | requires = ["poetry-core>=1.0.0"]
26 | build-backend = "poetry.core.masonry.api"
27 |
--------------------------------------------------------------------------------