├── .gitignore
├── base
    ├── __init__.py
    └── config.py
├── embedding
    ├── __init__.py
    ├── llm
    │   ├── __init__.py
    │   ├── base.py
    │   └── openapi.py
    └── vectordb
    │   ├── __init__.py
    │   └── chromadb.py
├── images
    ├── 3a68a873.png
    ├── 53d81b7e.png
    ├── 7889b23b.png
    └── d19a83a6.png
├── main.py
├── readme.md
├── requirements.txt
└── web
    ├── __init__.py
    ├── static
        ├── avatar.jpg
        └── momo.jpg
    └── templates
        └── content.html


/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | venv
3 | test
4 | db


--------------------------------------------------------------------------------
/base/__init__.py:
--------------------------------------------------------------------------------
1 | from base.config import Config
2 | 
3 | EMBEDDING = "embedding"
4 | 


--------------------------------------------------------------------------------
/base/config.py:
--------------------------------------------------------------------------------
class Config:
    """Connection settings for the OpenAI-compatible API used by the project.

    Attributes:
        openapi_base: Base URL of the API endpoint.
        openapi_key: API key sent with every request.
    """

    def __init__(self, openapi_base: str = "xxxx", openapi_key: str = "xxxx"):
        """Store the API settings.

        Both arguments default to the ``"xxxx"`` placeholders, so ``Config()``
        behaves as before; passing values lets callers and tests supply real
        settings without editing this file.
        """
        self.openapi_base = openapi_base
        self.openapi_key = openapi_key
5 | 


--------------------------------------------------------------------------------
/embedding/__init__.py:
--------------------------------------------------------------------------------
 1 | from embedding.llm.openapi import Openapi
 2 | from embedding.vectordb.chromadb import ChromadbDb
 3 | from base.config import Config
 4 | 
 5 | 
 6 | class Embedding:
 7 |     def __init__(self):
 8 |         self.__config = Config()
 9 |         self._llm = Openapi(self.__config)
10 |         self.__db = ChromadbDb()
11 | 
12 |     # 添加新文本
13 |     def add_text(self, content: str, url: str):
14 |         embedding = self._llm.embedding(content)
15 |         self.__db.add_text(content, embedding, {"url": url})
16 | 
17 |     # 文本查询
18 |     def query_text(self, query: str, size: int):
19 |         return self.__db.query_text(query, size)
20 | 
21 |     # 获取数据
22 |     def get_data(self, no: int, size: int):
23 |         total, data = self.__db.get_data(no, size)
24 |         data_list = []
25 |         for i in range(len(data["ids"])):
26 |             data_list.append({
27 |                 "id": data["ids"][i],
28 |                 "content": data["documents"][i],
29 |                 "url": data["metadatas"][i]["url"],
30 |             })
31 |         return {
32 |             "count": total,
33 |             "data": data_list
34 |         }
35 | 
36 |     # 删除数据
37 |     def delete_data(self, ids: str):
38 |         self.__db.delete_data(ids.split(","))
39 | 
40 |     # 询问问题
41 |     def ask_question(self, text: str):
42 |         content = self.__db.query_text(text, 1)
43 |         documents = content.get('documents')
44 |         if len(documents) > 0:
45 |             return self._llm.ask(query=text, context=documents[0])
46 |         return ""
47 |         # if content.get()
48 |         # self._llm.ask(text)
49 | 
50 |     # 清除上下文
51 |     def clear_question(self):
52 |         self._llm.clear()
53 | 


--------------------------------------------------------------------------------
/embedding/llm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/embedding/llm/__init__.py


--------------------------------------------------------------------------------
/embedding/llm/base.py:
--------------------------------------------------------------------------------
 1 | # 默认大模型
 2 | class BaseLLM:
 3 |     # 把文本编码为特征向量数据
 4 |     def embedding(self, text: str) -> dict[dict]:
 5 |         pass
 6 | 
 7 |     # 询问ai问题并获得解答
 8 |     def ask(self, query: str, context: str) -> str:
 9 |         pass
10 | 
11 |     # 清除上下文
12 |     def clear(self):
13 |         pass


--------------------------------------------------------------------------------
/embedding/llm/openapi.py:
--------------------------------------------------------------------------------
 1 | from embedding.llm.base import BaseLLM
 2 | from base import Config
 3 | import openai
 4 | 
 5 | 
 6 | class Openapi(BaseLLM):
 7 |     def __init__(self, config: 'Config'):
 8 |         self.__config = config
 9 |         openai.api_base = config.openapi_base
10 |         openai.api_key = config.openapi_key
11 |         self.__context = []
12 | 
13 |     def embedding(self, text: str) -> dict[dict]:
14 |         embedding = openai.Embedding.create(model="text-embedding-ada-002", input=text)
15 |         return embedding.data[0].embedding
16 | 
17 |     def ask(self, query: str, context: str) -> str:
18 |         messages = [
19 |             {"role": "system", "content": f'你是一个乐于助人的作者，你需要从下文中提取有用的内容来解答用户提出的问题，不能回答不在下文提到的内容，回答请以我的视角回答：\n\n{context}'}
20 |         ]
21 |         self.__context.append({"role": "user", "content": query})
22 |         messages.extend(self.__context)
23 |         print(messages)
24 |         response = openai.ChatCompletion.create(
25 |             model="gpt-3.5-turbo-16k",
26 |             messages=messages
27 |         )
28 |         answer = response.choices[0].message.content
29 |         print("使用的tokens：", response.usage.total_tokens)
30 |         self.__context.append({"role": "assistant", "content": answer})
31 |         return answer
32 | 
33 |     def clear(self):
34 |         self.__context = []


--------------------------------------------------------------------------------
/embedding/vectordb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/embedding/vectordb/__init__.py


--------------------------------------------------------------------------------
/embedding/vectordb/chromadb.py:
--------------------------------------------------------------------------------
 1 | import chromadb
 2 | import uuid
 3 | from chromadb.utils import embedding_functions
 4 | from base import Config
 5 | 
 6 | 
 7 | class ChromadbDb:
 8 |     def __init__(self):
 9 |         self.__config = Config()
10 |         self.__chroma_client = chromadb.PersistentClient(path="db")
11 |         self.__collection = self.__chroma_client.get_or_create_collection(
12 |             name="embedding",
13 |             embedding_function=embedding_functions.OpenAIEmbeddingFunction(
14 |                 api_base=self.__config.openapi_base,
15 |                 api_key=self.__config.openapi_key,
16 |                 model_name="text-embedding-ada-002",
17 |             ))
18 | 
19 |     # 添加文本
20 |     def add_text(self, text: str, embedding: dict[dict], meta: dict):
21 |         self.__collection.add(
22 |             documents=[text],
23 |             embeddings=[embedding],
24 |             metadatas=[meta],
25 |             ids=[uuid.uuid4().hex]
26 |         )
27 | 
28 |     def query_text(self, query: str, result: int):
29 |         results = self.__collection.query(
30 |             query_texts=[query],
31 |             n_results=result
32 |         )
33 |         return results
34 | 
35 |     def get_data(self, no: int, size: int) -> [int, dict]:
36 |         count = self.__collection.count()
37 |         result = self.__collection.get(limit=size, offset=(no - 1) * size)
38 |         return count, result
39 | 
40 |     def delete_data(self, ids: list[str]):
41 |         self.__collection.delete(ids=ids)
42 | 


--------------------------------------------------------------------------------
/images/3a68a873.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/images/3a68a873.png


--------------------------------------------------------------------------------
/images/53d81b7e.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/images/53d81b7e.png


--------------------------------------------------------------------------------
/images/7889b23b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/images/7889b23b.png


--------------------------------------------------------------------------------
/images/d19a83a6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/images/d19a83a6.png


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
 1 | from web import app
 2 | from embedding import Embedding
 3 | from base import EMBEDDING
 4 | 
 5 | if __name__ == '__main__':
 6 |     # 配置相关服务
 7 |     app.config[EMBEDDING] = Embedding()
 8 |     # 运行flask
 9 |     app.run(host='0.0.0.0', port=7001)
10 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # 基于大语言模型的个人知识库
 2 | 
 3 | ## 项目功能
 4 | 
 5 | - 基于embedding的文档搜索，每次只会搜索最相关的文档，不会把所有的文档都喂给gpt
 6 | - 提供网页爬取和文本导入功能，可以导入自己想要的内容
 7 | - 提供数据管理界面，可以看到自己的数据，以及对数据删除的功能
 8 | - 提供聊天界面，支持保存上下文
 9 | - 支持日常聊天，文章总结，问题询问等功能
10 | 
11 | ## 项目展示
12 | 
13 | 网页爬取功能
14 | 
15 | ![](images/d19a83a6.png)
16 | 
17 | 知识内容管理
18 | 
19 | ![](images/7889b23b.png)
20 | 
21 | 可以问一些文档里面的问题
22 | 
23 | ![](images/3a68a873.png)
24 | 
25 | 还可以对文章进行总结
26 | 
27 | ![](images/53d81b7e.png)
28 | 
29 | ## 项目运行
30 | 
31 | 先到 `base/config.py` 修改openapi的key信息，然后使用`pip install -r requirements.txt`安装一下依赖，最后使用`python main.py`就可以启动了
32 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | openai~=0.28.0
2 | chromadb~=0.4.10
3 | requests~=2.31.0
4 | beautifulsoup4~=4.12.2
5 | flask~=2.3.3


--------------------------------------------------------------------------------
/web/__init__.py:
--------------------------------------------------------------------------------
  1 | from bs4 import BeautifulSoup
  2 | import requests
  3 | from flask import Flask, request, Response, render_template
  4 | import json
  5 | from embedding import Embedding
  6 | from base import EMBEDDING
  7 | 
  8 | # Create the Flask application
  9 | app = Flask(__name__)
 10 | 
 11 | 
 12 | # 返回JSON字符串
 13 | def return_json(data):
 14 |     return Response(json.dumps(data, ensure_ascii=False), mimetype='application/json')
 15 | 
 16 | 
 17 | # 网页爬取
 18 | @app.route('/content/web', methods=['POST'])
 19 | def content_web():
 20 |     data = request.get_json()
 21 |     # 获取网页内容
 22 |     response = requests.get(data['url'])
 23 |     # 创建BeautifulSoup对象
 24 |     content = BeautifulSoup(response.text, 'html.parser')
 25 |     if data["id"] != "":
 26 |         content = content.find(id=data['id'])
 27 |     # 返回json类型字符串
 28 |     return return_json({"content": content.get_text()})
 29 | 
 30 | 
 31 | # 普通文本
 32 | @app.route('/content/text', methods=['POST'])
 33 | def content_text():
 34 |     embedding: Embedding = app.config[EMBEDDING]
 35 |     data = request.get_json()
 36 |     embedding.add_text(data["content"], data["url"])
 37 |     # 返回json类型字符串
 38 |     return return_json(data)
 39 | 
 40 | 
 41 | # 查询数据
 42 | @app.route('/db/query', methods=['GET'])
 43 | def db_query():
 44 |     embedding: Embedding = app.config[EMBEDDING]
 45 |     query = request.args.get('query')
 46 |     try:
 47 |         size = int(str(request.args.get('size')))
 48 |     except (ValueError, TypeError):
 49 |         size = 1
 50 |     if query is None or query == "":
 51 |         return return_json({
 52 |             "msg": "请输入query"
 53 |         })
 54 |     # 返回json类型字符串
 55 |     return return_json(embedding.query_text(query, size))
 56 | 
 57 | 
 58 | # 查询数据
 59 | @app.route('/db/get', methods=['GET'])
 60 | def db_get():
 61 |     embedding: Embedding = app.config[EMBEDDING]
 62 |     try:
 63 |         no = int(str(request.args.get('page')))
 64 |         size = int(str(request.args.get('limit')))
 65 |     except (ValueError, TypeError):
 66 |         no = 1
 67 |         size = 20
 68 |     data = embedding.get_data(no, size)
 69 |     data["code"] = 0
 70 |     # 返回json类型字符串
 71 |     return return_json(data)
 72 | 
 73 | 
 74 | # 查询数据
 75 | @app.route('/db/delete', methods=['GET'])
 76 | def db_delete():
 77 |     embedding: Embedding = app.config[EMBEDDING]
 78 |     embedding.delete_data(str(request.args.get('id')))
 79 |     # 返回json类型字符串
 80 |     return return_json({})
 81 | 
 82 | 
 83 | # 询问问题
 84 | @app.route('/chat/ask', methods=['POST'])
 85 | def ask_question():
 86 |     embedding: Embedding = app.config[EMBEDDING]
 87 |     data = request.get_json()
 88 |     # 返回json类型字符串
 89 |     return return_json({
 90 |         "answer": embedding.ask_question(data["question"])
 91 |     })
 92 | 
 93 | 
 94 | # 清除上下文
 95 | @app.route('/chat/clear', methods=['POST'])
 96 | def clear_ask():
 97 |     embedding: Embedding = app.config[EMBEDDING]
 98 |     embedding.clear_question()
 99 |     return return_json({})
100 | 
101 | 
# Home page
@app.route('/', methods=['GET'])
def index():
    """Render the single-page UI from the ``content.html`` template."""
    page = render_template('content.html')
    return page
106 | 


--------------------------------------------------------------------------------
/web/static/avatar.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/web/static/avatar.jpg


--------------------------------------------------------------------------------
/web/static/momo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoyou-bilibili/llm_knowledge/d05f7771e988eaa835a66646a8612e9e36d63684/web/static/momo.jpg


--------------------------------------------------------------------------------
/web/templates/content.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>个人知识库</title>
  6 |     <!-- 两个任选一个 github pages -->
  7 |     <link type="text/css" href="https://cdn.jsdelivr.net/gh/MorFansLab/LiteWebChat_Frame/dist/css/litewebchat.min.css" rel="stylesheet"/>
  8 |     <link type="text/css" href="https://cdn.jsdelivr.net/gh/MorFansLab/LiteWebChat_Frame/dist/css/litewebchat_input.min.css" rel="stylesheet"/>
  9 |     <!-- 引入 layui.css -->
 10 |     <link rel="stylesheet" href="//unpkg.com/layui@2.6.8/dist/css/layui.css">
 11 |     <style>
 12 |         html,
 13 |         body {
 14 |           height: 100%;
 15 |           margin: 0;
 16 |           padding: 0;
 17 |           overflow: hidden;
 18 |         }
 19 | 
 20 |         /* 手动指定其父容器大小 */
 21 |         .lite-chatmaster {
 22 |           height: 100%;
 23 |           width: 100%;
 24 |         }
 25 |         .layui-tab-item {
 26 |             height: 100%;
 27 |         }
 28 |     </style>
 29 | </head>
 30 | <body>
 31 | <div class="layui-tab layui-tab-brief" lay-filter="docDemoTabBrief" style="height: 100%">
 32 |   <ul class="layui-tab-title">
 33 |     <li class="layui-this">聊天</li>
 34 |     <li>文本添加</li>
 35 |     <li>数据管理</li>
 36 |   </ul>
 37 |   <div class="layui-tab-content" style="height: 90%">
 38 |       <div class="layui-tab-item layui-show">
 39 |         <!-- 父容器 -->
 40 |         <div class="lite-chatmaster">
 41 |             <!-- 聊天栏 -->
 42 |             <div class="lite-chatbox"></div>
 43 |             <!-- 输入框 -->
 44 |             <div class="lite-chatinput">
 45 |                 <!-- 分界线 -->
 46 |                 <hr class="boundary" />
 47 |                 <!-- 文字输入框 -->
 48 |                 <button class="send" id="send_message">发送</button>
 49 |                 <button class="send" id="clear_context">清除上下文</button>
 50 |                 <div aria-label="input area" id="chatinput" class="editor chatinput" contenteditable="true" ref="editor"></div>
 51 |             </div>
 52 |         </div>
 53 |       </div>
 54 |       <div class="layui-tab-item">
 55 |         <form class="layui-form" action="javascript:void(0)">
 56 |           <div class="layui-form-item"><label class="layui-form-label">网址</label><div class="layui-input-block"><input type="text" name="url" placeholder="输入爬取的网址" autocomplete="off" class="layui-input"></div></div>
 57 |           <div class="layui-form-item"><label class="layui-form-label">爬取元素id</label><div class="layui-input-block"><input type="text" name="id" placeholder="HTML标签的id值" autocomplete="off" class="layui-input"></div></div>
 58 |           <div class="layui-form-item"><label class="layui-form-label">文本</label><div class="layui-input-block"><textarea style="height: 400px" id="text-content" name="content" placeholder="请输入内容" class="layui-textarea"></textarea></div></div>
 59 |           <div class="layui-form-item">
 60 |             <div class="layui-input-block">
 61 |               <button class="layui-btn" lay-submit lay-filter="web">爬取网页</button>
 62 |               <button class="layui-btn" lay-submit lay-filter="text">添加文本</button>
 63 |               <button type="reset" class="layui-btn layui-btn-primary">重置</button>
 64 |             </div>
 65 |           </div>
 66 |         </form>
 67 |       </div>
 68 |       <div class="layui-tab-item">
 69 |           <table id="data-list" lay-filter="test"></table>
 70 |       </div>
 71 |   </div>
 72 | </div>
 73 | 
 74 | <!-- Load jQuery and layui.js. NOTE(review): the layui script tag below has no closing tag, so the litewebchat_input script tag after it appears to be consumed as its text and never loaded; confirm whether that is intended -->
 75 | <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.6.0/jquery.min.js"></script>
 76 | <script src="//unpkg.com/layui@2.6.8/dist/layui.js">
 77 | <script src="https://cdn.jsdelivr.net/gh/MorFansLab/LiteWebChat_Frame/dist/js/litewebchat_input.min.js"></script>
 78 | <!-- Chat message renderer -->
 79 | <script src="https://cdn.jsdelivr.net/gh/MorFansLab/LiteWebChat_Frame/dist/js/litewebchat_render.min.js"></script>
 80 | <!-- Standalone chat message component (disabled) -->
 81 | <!--<script src="https://cdn.jsdelivr.net/gh/MorFansLab/LiteWebChat_Frame/lite-chatbox.min.js"></script>-->
 82 | <script>
 83 |   const htmls = [{messageType: "text", headIcon: "/static/avatar.jpg", name: "小游", position: "left", html: "你好，请问你有啥问题？"}];
 84 |   function addMessage(message) {
 85 |       htmls.push(message)
 86 |       beforeRenderingHTML(htmls, ".lite-chatbox");
 87 |   }
 88 |   function sendPostRequest(url, data, success) {
 89 |       $.ajax({ type: "POST", url: url,
 90 |         data: JSON.stringify(data), // 将 JSON 数据转换为字符串
 91 |         contentType: "application/json", // 指定请求的内容类型为 JSON
 92 |         dataType: "json", // 指定响应的内容类型为 JSON
 93 |         success: success
 94 |       });
 95 |   }
 96 |   beforeRenderingHTML(htmls, ".lite-chatbox");
 97 | </script>
 98 | <script type="text/html" id="content-bar">
 99 |   <a class="layui-btn layui-btn-danger layui-btn-xs" lay-event="del">删除</a>
100 | </script>
101 | <script>
102 |     // 发送消息
103 |     $("#send_message").on('click', function() {
104 |         let input = $("#chatinput")
105 |         question = input.text()
106 |         sendPostRequest("/chat/ask", {question}, function(response) {
107 |             addMessage({messageType: "text", headIcon: "/static/avatar.jpg", name: "小游", position: "left", html: response.answer})
108 |         })
109 |         addMessage({messageType: "text", headIcon: "/static/momo.jpg", name: "用户", position: "right", html: question});
110 |         input.text("")
111 |     })
112 |     // 清除上下文
113 |     $("#clear_context").on('click', function() {
114 |         sendPostRequest("/chat/clear",{},function (respose) {
115 |             $(".lite-chatbox").empty()
116 |             addMessage({messageType: "text", headIcon: "/static/avatar.jpg", name: "小游", position: "left", html: "已清除上下文，请继续提问吧！"})
117 |         })
118 |     })
119 |     // 爬取网页
120 |     layui.use('form', function() {
121 |         var form = layui.form;
122 |         // 网页数据
123 |         form.on('submit(web)', function (data) {
124 |             sendPostRequest("/content/web", data.field, function(response) {
125 |                 layer.msg("爬取成功");
126 |                 $('#text-content').val(response.content);
127 |             })
128 |             return true
129 |         })
130 |         // 普通文本
131 |         form.on('submit(text)', function (data) {
132 |             sendPostRequest("/content/text", data.field, function(response) {
133 |                 layer.msg("添加成功");
134 |                 $('#text-content').val("");
135 |             })
136 |             return true
137 |         })
138 |     })
139 |     layui.use('table', function(){
140 |         var table = layui.table;
141 |         // 表格渲染
142 |         table.render({elem: '#data-list', id: "data", url: '/db/get',page: true ,cols: [[
143 |             {field: 'id', title: 'ID', width:80},
144 |             {field: 'content', title: '内容'},
145 |             {field: 'url', title: 'url', width:80},
146 |             {fixed: 'right', width:150, align:'center', toolbar: '#content-bar'}
147 |         ]]});
148 |         //工具条事件
149 |         table.on('tool(test)', function(obj) {
150 |             console.log(obj)
151 |             let data = obj.data;
152 |             if (obj.event === 'del') {
153 |                 $.ajax({ type: "GET", url: `db/delete?id=${data.id}`, success: () => {
154 |                     table.reload('data', {})
155 |                 }});
156 |             }
157 |         })
158 |     });
159 | </script>
160 | </body>
161 | </html>


--------------------------------------------------------------------------------