├── .gitignore ├── base ├── __init__.py └── config.py ├── embedding ├── __init__.py ├── llm │ ├── __init__.py │ ├── base.py │ └── openapi.py └── vectordb │ ├── __init__.py │ └── chromadb.py ├── images ├── 3a68a873.png ├── 53d81b7e.png ├── 7889b23b.png └── d19a83a6.png ├── main.py ├── readme.md ├── requirements.txt └── web ├── __init__.py ├── static ├── avatar.jpg └── momo.jpg └── templates └── content.html /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | venv 3 | test 4 | db -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | from base.config import Config 2 | 3 | EMBEDDING = "embedding" 4 | -------------------------------------------------------------------------------- /base/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | def __init__(self): 3 | self.openapi_base = "xxxx" 4 | self.openapi_key = "xxxx" 5 | -------------------------------------------------------------------------------- /embedding/__init__.py: -------------------------------------------------------------------------------- 1 | from embedding.llm.openapi import Openapi 2 | from embedding.vectordb.chromadb import ChromadbDb 3 | from base.config import Config 4 | 5 | 6 | class Embedding: 7 | def __init__(self): 8 | self.__config = Config() 9 | self._llm = Openapi(self.__config) 10 | self.__db = ChromadbDb() 11 | 12 | # 添加新文本 13 | def add_text(self, content: str, url: str): 14 | embedding = self._llm.embedding(content) 15 | self.__db.add_text(content, embedding, {"url": url}) 16 | 17 | # 文本查询 18 | def query_text(self, query: str, size: int): 19 | return self.__db.query_text(query, size) 20 | 21 | # 获取数据 22 | def get_data(self, no: int, size: int): 23 | total, data = self.__db.get_data(no, size) 24 | data_list = [] 25 | for i in range(len(data["ids"])): 26 | data_list.append({ 27 | "id": data["ids"][i], 28 | "content": data["documents"][i], 29 | "url": data["metadatas"][i]["url"], 30 | }) 31 | return { 32 | "count": total, 33 | "data": data_list 34 | } 35 | 36 | # 删除数据 37 | def delete_data(self, ids: str): 38 | self.__db.delete_data(ids.split(",")) 39 | 40 | # 询问问题 41 | def ask_question(self, text: str): 42 | content = self.__db.query_text(text, 1) 43 | documents = content.get('documents') 44 | if len(documents) > 0: 45 | return self._llm.ask(query=text, context=documents[0]) 46 | return "" 47 | # if content.get() 48 | # self._llm.ask(text) 49 | 50 | # 清除上下文 51 | def clear_question(self): 52 | self._llm.clear() 53 | -------------------------------------------------------------------------------- /embedding/llm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/embedding/llm/__init__.py -------------------------------------------------------------------------------- /embedding/llm/base.py: -------------------------------------------------------------------------------- 1 | # 默认大模型 2 | class BaseLLM: 3 | # 把文本编码为特征向量数据 4 | def embedding(self, text: str) -> dict[dict]: 5 | pass 6 | 7 | # 询问ai问题并获得解答 8 | def ask(self, query: str, context: str) -> str: 9 | pass 10 | 11 | # 清除上下文 12 | def clear(self): 13 | pass -------------------------------------------------------------------------------- /embedding/llm/openapi.py: -------------------------------------------------------------------------------- 1 | from embedding.llm.base import BaseLLM 2 | from base import Config 3 | import openai 4 | 5 | 6 | class Openapi(BaseLLM): 7 | def __init__(self, config: 'Config'): 8 | self.__config = config 9 | openai.api_base = config.openapi_base 10 | openai.api_key = config.openapi_key 11 | self.__context = [] 12 | 13 | def embedding(self, text: str) -> dict[dict]: 14 | embedding = openai.Embedding.create(model="text-embedding-ada-002", input=text) 15 | return embedding.data[0].embedding 16 | 17 | def ask(self, query: str, context: str) -> str: 18 | messages = [ 19 | {"role": "system", "content": f'你是一个乐于助人的作者,你需要从下文中提取有用的内容来解答用户提出的问题,不能回答不在下文提到的内容,回答请以我的视角回答:\n\n{context}'} 20 | ] 21 | self.__context.append({"role": "user", "content": query}) 22 | messages.extend(self.__context) 23 | print(messages) 24 | response = openai.ChatCompletion.create( 25 | model="gpt-3.5-turbo-16k", 26 | messages=messages 27 | ) 28 | answer = response.choices[0].message.content 29 | print("使用的tokens:", response.usage.total_tokens) 30 | self.__context.append({"role": "assistant", "content": answer}) 31 | return answer 32 | 33 | def clear(self): 34 | self.__context = [] -------------------------------------------------------------------------------- /embedding/vectordb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/embedding/vectordb/__init__.py -------------------------------------------------------------------------------- /embedding/vectordb/chromadb.py: -------------------------------------------------------------------------------- 1 | import chromadb 2 | import uuid 3 | from chromadb.utils import embedding_functions 4 | from base import Config 5 | 6 | 7 | class ChromadbDb: 8 | def __init__(self): 9 | self.__config = Config() 10 | self.__chroma_client = chromadb.PersistentClient(path="db") 11 | self.__collection = self.__chroma_client.get_or_create_collection( 12 | name="embedding", 13 | embedding_function=embedding_functions.OpenAIEmbeddingFunction( 14 | api_base=self.__config.openapi_base, 15 | api_key=self.__config.openapi_key, 16 | model_name="text-embedding-ada-002", 17 | )) 18 | 19 | # 添加文本 20 | def add_text(self, text: str, embedding: dict[dict], meta: dict): 21 | self.__collection.add( 22 | documents=[text], 23 | embeddings=[embedding], 24 | metadatas=[meta], 25 | ids=[uuid.uuid4().hex] 26 | ) 27 | 28 | def query_text(self, query: str, result: int): 29 | results = self.__collection.query( 30 | query_texts=[query], 31 | n_results=result 32 | ) 33 | return results 34 | 35 | def get_data(self, no: int, size: int) -> [int, dict]: 36 | count = self.__collection.count() 37 | result = self.__collection.get(limit=size, offset=(no - 1) * size) 38 | return count, result 39 | 40 | def delete_data(self, ids: list[str]): 41 | self.__collection.delete(ids=ids) 42 | -------------------------------------------------------------------------------- /images/3a68a873.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/images/3a68a873.png -------------------------------------------------------------------------------- /images/53d81b7e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/images/53d81b7e.png -------------------------------------------------------------------------------- /images/7889b23b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/images/7889b23b.png -------------------------------------------------------------------------------- /images/d19a83a6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/images/d19a83a6.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from web import app 2 | from embedding import Embedding 3 | from base import EMBEDDING 4 | 5 | if __name__ == '__main__': 6 | # 配置相关服务 7 | app.config[EMBEDDING] = Embedding() 8 | # 运行flask 9 | app.run(host='0.0.0.0', port=7001) 10 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # 基于大语言模型的个人知识库 2 | 3 | ## 项目功能 4 | 5 | - 基于embedding的文档搜索,每次只会只搜索最相关的文档,不会把所有的文档都喂给gpt 6 | - 提供网页爬取和文本导入功能,可以导入自己想要的内容 7 | - 提供数据管理界面,可以看到自己的数据,以及对数据删除的功能 8 | - 提供聊天界面,支持保存上下文 9 | - 支持日常聊天,文章总结,问题询问等功能 10 | 11 | ## 项目展示 12 | 13 | 网页爬取功能 14 | 15 | ![](images/d19a83a6.png) 16 | 17 | 知识内容管理 18 | 19 | ![](images/7889b23b.png) 20 | 21 | 可以问一些文档里面的问题 22 | 23 | ![](images/3a68a873.png) 24 | 25 | 还可以对文章进行总结 26 | 27 | ![](images/53d81b7e.png) 28 | 29 | ## 项目运行 30 | 31 | 先到 `base/config.py` 修改openapi的key信息,然后使用`pip install -r requirements.txt`安装一下依赖,最后使用`pyton main.py`就可以启动了 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai~=0.28.0 2 | chromadb~=0.4.10 3 | requests~=2.31.0 4 | beautifulsoup4~=4.12.2 5 | flask~=2.3.3 -------------------------------------------------------------------------------- /web/__init__.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | from flask import Flask, request, Response, render_template 4 | import json 5 | from embedding import Embedding 6 | from base import EMBEDDING 7 | 8 | # 初始化flaskAPP 9 | app = Flask(__name__) 10 | 11 | 12 | # 返回JSON字符串 13 | def return_json(data): 14 | return Response(json.dumps(data, ensure_ascii=False), mimetype='application/json') 15 | 16 | 17 | # 网页爬取 18 | @app.route('/content/web', methods=['POST']) 19 | def content_web(): 20 | data = request.get_json() 21 | # 获取网页内容 22 | response = requests.get(data['url']) 23 | # 创建BeautifulSoup对象 24 | content = BeautifulSoup(response.text, 'html.parser') 25 | if data["id"] != "": 26 | content = content.find(id=data['id']) 27 | # 返回json类型字符串 28 | return return_json({"content": content.get_text()}) 29 | 30 | 31 | # 普通文本 32 | @app.route('/content/text', methods=['POST']) 33 | def content_text(): 34 | embedding: Embedding = app.config[EMBEDDING] 35 | data = request.get_json() 36 | embedding.add_text(data["content"], data["url"]) 37 | # 返回json类型字符串 38 | return return_json(data) 39 | 40 | 41 | # 查询数据 42 | @app.route('/db/query', methods=['GET']) 43 | def db_query(): 44 | embedding: Embedding = app.config[EMBEDDING] 45 | query = request.args.get('query') 46 | try: 47 | size = int(str(request.args.get('size'))) 48 | except (ValueError, TypeError): 49 | size = 1 50 | if query is None or query == "": 51 | return return_json({ 52 | "msg": "请输入query" 53 | }) 54 | # 返回json类型字符串 55 | return return_json(embedding.query_text(query, size)) 56 | 57 | 58 | # 查询数据 59 | @app.route('/db/get', methods=['GET']) 60 | def db_get(): 61 | embedding: Embedding = app.config[EMBEDDING] 62 | try: 63 | no = int(str(request.args.get('page'))) 64 | size = int(str(request.args.get('limit'))) 65 | except (ValueError, TypeError): 66 | no = 1 67 | size = 20 68 | data = embedding.get_data(no, size) 69 | data["code"] = 0 70 | # 返回json类型字符串 71 | return return_json(data) 72 | 73 | 74 | # 查询数据 75 | @app.route('/db/delete', methods=['GET']) 76 | def db_delete(): 77 | embedding: Embedding = app.config[EMBEDDING] 78 | embedding.delete_data(str(request.args.get('id'))) 79 | # 返回json类型字符串 80 | return return_json({}) 81 | 82 | 83 | # 询问问题 84 | @app.route('/chat/ask', methods=['POST']) 85 | def ask_question(): 86 | embedding: Embedding = app.config[EMBEDDING] 87 | data = request.get_json() 88 | # 返回json类型字符串 89 | return return_json({ 90 | "answer": embedding.ask_question(data["question"]) 91 | }) 92 | 93 | 94 | # 清除上下文 95 | @app.route('/chat/clear', methods=['POST']) 96 | def clear_ask(): 97 | embedding: Embedding = app.config[EMBEDDING] 98 | embedding.clear_question() 99 | return return_json({}) 100 | 101 | 102 | # 主页显示HTML 103 | @app.route('/', methods=['GET']) 104 | def index(): 105 | return render_template('content.html') 106 | -------------------------------------------------------------------------------- /web/static/avatar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/web/static/avatar.jpg -------------------------------------------------------------------------------- /web/static/momo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/web/static/momo.jpg -------------------------------------------------------------------------------- /web/templates/content.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 个人知识库 6 | 7 | 8 | 9 | 10 | 11 | 29 | 30 | 31 |
32 | 37 |
38 |
39 | 40 |
41 | 42 |
43 | 44 |
45 | 46 |
47 | 48 | 49 | 50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 | 62 | 63 | 64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 | 74 | 75 | 76 | 78 | 79 | 80 | 81 | 82 | 98 | 101 | 160 | 161 | --------------------------------------------------------------------------------