├── .gitignore
├── README.md
├── app.py
├── app_modules
├── __pycache__
│ ├── overwrites.cpython-310.pyc
│ ├── presets.cpython-310.pyc
│ ├── presets.cpython-39.pyc
│ └── utils.cpython-310.pyc
├── overwrites.py
├── presets.py
└── utils.py
├── assets
├── Kelpy-Codos.js
├── custom.css
├── custom.js
└── favicon.ico
├── clc
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-310.pyc
│ ├── __init__.cpython-39.pyc
│ ├── config.cpython-310.pyc
│ ├── gpt_service.cpython-310.pyc
│ ├── gpt_service.cpython-39.pyc
│ ├── langchain_application.cpython-310.pyc
│ ├── langchain_application.cpython-39.pyc
│ ├── source_service.cpython-310.pyc
│ └── source_service.cpython-39.pyc
├── config.py
├── gpt_service.py
├── langchain_application.py
└── source_service.py
├── corpus
└── zh_wikipedia
│ ├── v1
│ ├── README.md
│ ├── chinese_t2s.py
│ ├── clean_corpus.py
│ └── wiki_process.py
│ └── v2
│ ├── make_corpus.py
│ └── wiki_extract.sh
├── create_knowledge.py
├── docs
├── added
│ └── 马保国.txt
├── 姚明.txt
├── 王治郅.txt
└── 科比.txt
├── images
├── ch.jpg
├── chatgroup.jpg
├── computing.png
├── personal.jpg
├── web_demos
│ ├── v1.png
│ ├── v2.png
│ └── v3.png
└── wiki_process.png
├── main.py
├── requirements.txt
├── resources
└── OpenCC-1.1.6-cp310-cp310-manylinux1_x86_64.whl
└── tests
├── test_duckduckgo_search.py
├── test_duckpy.py
├── test_gradio_slient.py
├── test_langchain.py
└── test_vector_store.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | cache
3 | docs/zh_wikipedia
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | license: openrail
3 | title: 'Chinese-LangChain'
4 | sdk: gradio
5 | emoji: 🚀
6 | colorFrom: yellow
7 | colorTo: yellow
8 | pinned: true
9 | app_file: app.py
10 | ---
11 |
12 | # Chinese-LangChain
13 |
14 | > Chinese-LangChain:中文langchain项目,基于ChatGLM-6b+langchain实现本地化知识库检索与智能答案生成
15 |
16 | https://github.com/yanqiangmiffy/Chinese-LangChain
17 |
18 | 俗称:小必应,Q.Talk,强聊,QiangTalk
19 |
20 | ## 🔥 效果演示
21 |
22 | 
23 | 
24 |
25 | ## 🚋 使用教程
26 |
27 | - 选择知识库询问相关领域的问题
28 |
29 | ## 🏗️ 部署教程
30 |
31 | ### 运行配置
32 |
33 | - 显存:12g,实际运行9g够了
34 | - 运行内存:32g
35 |
36 | ### 运行环境
37 |
38 | ```text
39 | langchain
40 | gradio
41 | transformers
42 | sentence_transformers
43 | faiss-cpu
44 | unstructured
45 | duckduckgo_search
46 | mdtex2html
47 | chardet
48 | cchardet
49 | ```
50 |
51 | ### 启动Gradio
52 |
53 | ```shell
54 | python main.py
55 | ```
56 |
57 | ## 🚀 特性
58 | - 🚀 2023/05/19 [yanlijun573](https://github.com/yanlijun573)提供[streamlit](https://github.com/yanqiangmiffy/Chinese-LangChain/tree/streamlit)分支
59 | - 🚀 2023/04/22 支持模型多机多卡推理
60 | - 🔭 2023/04/20 支持模型问答与检索问答模式切换
61 | - 💻 2023/04/20 感谢HF官方提供免费算力,添加HuggingFace
62 | Spaces在线体验[🤗 DEMO](https://huggingface.co/spaces/ChallengeHub/Chinese-LangChain)
63 | - 🧫 2023/04/19 发布45万Wikipedia的文本预处理语料以及FAISS索引向量
64 | - 🐯 2023/04/19 引入ChuanhuChatGPT皮肤
65 | - 📱 2023/04/19 增加web search功能,需要确保网络畅通!(感谢[@wanghao07456](https://github.com/wanghao07456),提供的idea)
66 | - 📚 2023/04/18 webui增加知识库选择功能
67 | - 🚀 2023/04/18 修复推理预测超时5s报错问题
68 | - 🎉 2023/04/17 支持多种文档上传与内容解析:pdf、docx,ppt等
69 | - 🎉 2023/04/17 支持知识增量更新
70 |
71 | [//]: # (- 支持检索结果与LLM生成结果对比)
72 |
73 | ## 🧰 知识库
74 |
75 | ### 构建知识库
76 |
77 | - Wikipedia-zh
78 |
79 | > 详情见:corpus/zh_wikipedia/README.md
80 |
81 | ### 知识库向量索引
82 |
83 | | 知识库数据 | FAISS向量 |
84 | |-------------------------------------------------------------------------------|----------------------------------------------------------------------|
85 | | 中文维基百科截止4月份数据,45万 | 链接:https://pan.baidu.com/s/1VQeA_dq92fxKOtLL3u3Zpg?pwd=l3pn 提取码:l3pn |
86 | | 截止去年九月的130w条中文维基百科处理结果和对应faiss向量文件 @[yubuyuabc](https://github.com/yubuyuabc) | 链接:https://pan.baidu.com/s/1Yls_Qtg15W1gneNuFP9O_w?pwd=exij 提取码:exij |
87 | | 💹 [大规模金融研报知识图谱](http://openkg.cn/dataset/fr2kg) | 链接:https://pan.baidu.com/s/1FcIH5Fi3EfpS346DnDu51Q?pwd=ujjv 提取码:ujjv |
88 |
89 | ## 🔨 TODO
90 |
91 | * [x] 支持上下文
92 | * [x] 支持知识增量更新
93 | * [x] 支持加载不同知识库
94 | * [x] 支持检索结果与LLM生成结果对比
95 | * [ ] 支持检索生成结果与原始LLM生成结果对比
96 | * [ ] 支持模型问答与检索问答
97 | * [ ] 检索结果过滤与排序
98 | * [x] 互联网检索结果接入
99 | * [ ] 模型初始化有问题
100 | * [ ] 增加非LangChain策略
101 | * [ ] 显示当前对话策略
102 | * [ ] 构建一个垂直业务场景知识库,非通用性
103 |
104 | ## 交流
105 |
106 | 欢迎多提建议、Bad cases,目前尚不完善,欢迎进群及时交流,也欢迎大家多提PR
107 |
108 |
110 |
111 |
112 |
117 |
118 | ## ❤️引用
119 |
120 | - webui参考:https://github.com/thomas-yanxin/LangChain-ChatGLM-Webui
121 | - knowledge问答参考:https://github.com/imClumsyPanda/langchain-ChatGLM
122 | - LLM模型:https://github.com/THUDM/ChatGLM-6B
123 | - CSS:https://huggingface.co/spaces/JohnSmith9982/ChuanhuChatGPT
124 |
125 |
126 |
127 | ## ⭐️ Star History
128 |
129 | [](https://star-history.com/#yanqiangmiffy/Chinese-LangChain&Date)
130 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 |
4 | from app_modules.presets import *
5 | from clc.langchain_application import LangChainApplication
6 |
7 |
# NOTE: adjust these values to match your own deployment!
class LangChainCFG:
    """Static configuration for the app: model identifiers, cache paths,
    the selectable knowledge-base vector stores, and the QA modes."""
    llm_model_name = 'THUDM/chatglm-6b-int4-qe'  # local model path or HuggingFace repo id
    embedding_model_name = 'GanymedeNil/text2vec-large-chinese'  # retrieval/embedding model path or HuggingFace repo id
    vector_store_path = './cache'
    docs_path = './docs'
    kg_vector_stores = {
        '中文维基百科': './cache/zh_wikipedia',
        '大规模金融研报': './cache/financial_research_reports',
        '初始化': './cache',
    }  # replace with your own knowledge bases; set to None if you have none
    # kg_vector_stores=None
    patterns = ['模型问答', '知识库问答']  # the two QA modes offered in the UI
21 |
22 |
# Instantiate the configuration and the LangChain application at import time
# (model/service initialisation happens inside LangChainApplication).
config = LangChainCFG()
application = LangChainApplication(config)
25 |
26 |
def get_file_list():
    """Return the names of all entries in the local ``docs`` directory.

    Returns:
        list[str]: Entry names in ``docs``; an empty list when the
        directory does not exist yet (e.g. on first run).
    """
    if not os.path.exists("docs"):
        return []
    # os.listdir already returns a list; the identity comprehension was redundant.
    return os.listdir("docs")
31 |
32 |
33 | file_list = get_file_list()
34 |
35 |
def upload_file(file):
    """Persist an uploaded file into ``docs/`` and index it for retrieval.

    Moves the temp file Gradio hands us into the docs directory, registers
    the document with the source service, and refreshes the file dropdown.

    Args:
        file: Gradio file object; ``file.name`` is its temp path on disk.

    Returns:
        A Dropdown update whose choices include (and select) the new file.
    """
    # makedirs(exist_ok=True) avoids the exists-check/mkdir race.
    os.makedirs("docs", exist_ok=True)
    filename = os.path.basename(file.name)
    target = os.path.join("docs", filename)
    shutil.move(file.name, target)
    # Newest upload goes first so it shows at the top of the dropdown.
    file_list.insert(0, filename)
    application.source_service.add_document(target)
    return gr.Dropdown.update(choices=file_list, value=filename)
45 |
46 |
def set_knowledge(kg_name, history):
    """Load the vector store for *kg_name* and append a status message to the chat."""
    try:
        # Lookup stays inside the try: an unknown kg_name is reported, not raised.
        application.source_service.load_vector_store(config.kg_vector_stores[kg_name])
    except Exception as e:
        print(e)
        msg_status = f'{kg_name}知识库未成功加载'
    else:
        msg_status = f'{kg_name}知识库已成功加载'
    return history + [[None, msg_status]]
55 |
56 |
def clear_session():
    """Reset the UI state: an empty textbox value and a cleared chatbot."""
    empty_textbox, empty_state = '', None
    return empty_textbox, empty_state
59 |
60 |
def predict(input,
            large_language_model,
            embedding_model,
            top_k,
            use_web,
            use_pattern,
            history=None):
    """Answer one user turn, either straight from the LLM or via retrieval.

    Args:
        input: The user's question.
        large_language_model: Selected LLM name (not used in this handler).
        embedding_model: Selected embedding model name (not used in this handler).
        top_k: Number of knowledge-base passages to retrieve.
        use_web: '使用' enables augmenting the prompt with a web search.
        use_pattern: '模型问答' for plain LLM chat; anything else runs
            knowledge-base QA.
        history: Chat history as (question, answer) pairs; created when None.

    Returns:
        Tuple of (cleared textbox, chatbot history, state history, search text).
    """
    print(input)
    if history is None:  # fixed: identity comparison to None, not `==`
        history = []

    if use_web == '使用':
        web_content = application.source_service.search_web(query=input)
    else:
        web_content = ''
    search_text = ''
    if use_pattern == '模型问答':
        result = application.get_llm_answer(query=input, web_content=web_content)
        history.append((input, result))
        search_text += web_content
        return '', history, history, search_text

    else:
        resp = application.get_knowledge_based_answer(
            query=input,
            history_len=1,
            temperature=0.1,
            top_p=0.9,
            top_k=top_k,
            web_content=web_content,
            chat_history=history
        )
        history.append((input, resp['result']))
        # Surface up to the top-4 retrieved passages alongside the answer.
        for idx, source in enumerate(resp['source_documents'][:4]):
            sep = f'----------【搜索结果{idx + 1}:】---------------\n'
            search_text += f'{sep}\n{source.page_content}\n\n'
        print(search_text)
        search_text += "----------【网络检索内容】-----------\n"
        search_text += web_content
        return '', history, history, search_text
102 |
103 |
# Read the custom stylesheet once so it can be injected into the Blocks UI below.
with open("assets/custom.css", "r", encoding="utf-8") as f:
    customCSS = f.read()
106 | with gr.Blocks(css=customCSS, theme=small_and_beautiful_theme) as demo:
107 | gr.Markdown("""