├── README.md
├── __init__.py
├── chainKBQA.py
├── config.py
├── document.py
├── llm.py
├── resource
    └── txt
    │   ├── dilireba.txt
    │   └── yami.txt
└── utils
    ├── AliTextSplitter.py
    └── __init__.py


/README.md:
--------------------------------------------------------------------------------
 1 | # LongChainKBQA 
 2 | #### 项目技术
 3 | +  ChatYuan-large-v2 大语言模型进行基于知识库的问答 
 4 | +  nlp_bert_document-segmentation_chinese-base 语义分割模型对文本进行拆分
 5 | +  text2vec-large-chinese 模型 对文本向量化  
 6 | +  faiss进行向量检索
 7 | +  langchain 进行各个模块的组合，并完成基于知识库的问答
 8 | #### 项目结构
 9 | + config.py：配置文件,配置llm模型和文本向量化模型
10 | + document.py：文本拆分和文本向量化
11 | + llm.py：大语言模型加载
12 | + chainKBQA.py：利用文本向量化搜索和大语言模型进行知识库问答
13 | 
14 | #### 项目使用
15 | + 运行document.py 主函数对文本进行拆分以及向量化
16 | + 运行chainKBQA.py 加载文本向量和llm模型进行知识问答
17 | #### 结果如下：
18 | > load llm model 
19 | > load documents
20 | > 
21 | > 大模型自己回答的结果
22 | > 
23 | > 迪丽热巴的作品包括：
24 | > 《三生三世十里桃花》、
25 | > 《三生三世枕上书》、《三生三世枕上书之长生诀》、
26 | > 《三生三世枕上书之长生诀之长生诀之长生诀之长生决》、
27 | > 《三生三世枕上书之长生诀》等。
28 | > 
29 | > 大模型+知识库后回答的结果
30 | > 
31 | > {
32 | > 	 'query': '迪丽热巴的作品有什么',
33 | > 	 'result': '迪丽热巴的作品包括：《阿娜尔罕》、《逆光之恋》、《克拉恋人》、《漂亮的李慧珍》、《三生三世十里桃花》、《金鹰女神》等。', 
34 | >  
35 | >  
36 | > 	'source_documents': [Document(page_content='2013年，迪丽热巴因主演个人首部电视剧《阿娜尔罕》而出道 [1] 。
37 | > 	2014年，主演奇幻剧《逆光之恋》。
38 | > 	2015年，凭借爱情剧《克拉恋人》赢得高人气，并获得国剧盛典最受欢迎新人女演员奖 [168] 。
39 | > 	2016年，主演现代剧《麻辣变形计》 [2] ；同年，主演都市爱情励志喜剧《漂亮的李慧珍》，还凭借喜剧片《傲娇与偏见》获得中英电影节最佳新人奖 [3] 。
40 | > 	2017年，凭借玄幻剧《三生三世十里桃花》获得白玉兰奖最佳女配角提名 [4] 。2018年，迪丽热巴成为了金鹰电视艺术节的第七位“金鹰女神” [5] ，
41 | > 	并获得了第29届中国电视金鹰奖观众喜爱的女演员、第12届中国金鹰电视艺术节最具人气女演员两项殊荣 [6] 。', metadata={'source': 'source/txt/dilireba.txt'})]
42 | > }
43 | #### TODO
44 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wp931120/LongChainKBQA/72ccb921e4bba18ef0e6edfcf89ce8c78c78aa90/__init__.py


--------------------------------------------------------------------------------
/chainKBQA.py:
--------------------------------------------------------------------------------
 1 | from langchain.chains import RetrievalQA
 2 | from langchain.prompts.prompt import PromptTemplate
 3 | 
 4 | from config import Config
 5 | from document import DocumentService
 6 | from llm import LLMService
 7 | 
 8 | 
 9 | class LangChainApplication(object):
10 | 
11 |     def __init__(self):
12 |         self.config = Config
13 |         self.llm_service = LLMService()
14 |         ###加载llm和知识库向量
15 |         print("load llm model ")
16 |         self.llm_service.load_model(model_name_or_path=self.config.llm_model_name)
17 |         self.doc_service = DocumentService()
18 |         print("load documents")
19 |         self.doc_service.load_vector_store()
20 | 
21 |     def get_knowledge_based_answer(self, query,
22 |                                    history_len=5,
23 |                                    temperature=0.1,
24 |                                    top_p=0.9,
25 |                                    top_k=1,
26 |                                    chat_history=[]):
27 |         #定义prompt
28 |         prompt_template = """基于以下已知信息，简洁和专业的来回答用户的问题。
29 |                                         如果无法从中得到答案，请说 "根据已知信息无法回答该问题" 或 "没有提供足够的相关信息"，不允许在答案中添加编造成分，答案请使用中文。
30 |                                         已知内容:
31 |                                         {context}
32 |                                         问题:
33 |                                         {question}"""
34 |         prompt = PromptTemplate(template=prompt_template,
35 |                                 input_variables=["context", "question"])
36 |         self.llm_service.history = chat_history[-history_len:] if history_len > 0 else []
37 | 
38 |         self.llm_service.temperature = temperature
39 |         self.llm_service.top_p = top_p
40 |         # 声明一个知识库问答llm,传入之前初始化好的llm和向量知识搜索服务
41 |         knowledge_chain = RetrievalQA.from_llm(
42 |             llm=self.llm_service,
43 |             retriever=self.doc_service.vector_store.as_retriever(
44 |                 search_kwargs={"k": top_k}),
45 |             prompt=prompt)
46 | 
47 |         knowledge_chain.combine_documents_chain.document_prompt = PromptTemplate(
48 |             input_variables=["page_content"], template="{page_content}")
49 |         knowledge_chain.return_source_documents = True
50 | 
51 |         ### 基于知识库的问答
52 |         result = knowledge_chain({"query": query})
53 |         return result
54 | 
55 |     def get_llm_answer(self, query=''):
56 |         prompt_template = """请回答下列问题:
57 |                             {}""".format(query)
58 |         ### 基于大模型的问答
59 |         result = self.llm_service._call(prompt_template)
60 |         return result
61 | 
62 | 
if __name__ == '__main__':
    # Demo: ask the same question to the bare LLM and to the
    # knowledge-base-backed chain, printing both answers.
    question = '迪丽热巴的作品有什么'
    app = LangChainApplication()
    print("大模型自己回答的结果")
    print(app.get_llm_answer(question))
    print("大模型+知识库后回答的结果")
    print(app.get_knowledge_based_answer(question))
71 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
class Config:
    """Static settings read by the LLM, embedding and vector-store code."""

    # LLM: a local model directory or a Hugging Face remote repository id.
    llm_model_name: str = 'ClueAI/ChatYuan-large-v2'
    # Retrieval (embedding) model: a local model directory or a Hugging Face
    # remote repository id.
    embedding_model_name: str = 'GanymedeNil/text2vec-large-chinese'
    # Vector store location.
    vector_store_path: str = 'resource/faiss/'
    # Source documents location.
    docs_path: str = 'resource/txt/'


--------------------------------------------------------------------------------
/document.py:
--------------------------------------------------------------------------------
 1 | from langchain.document_loaders import UnstructuredFileLoader, TextLoader, DirectoryLoader
 2 | from langchain.embeddings.huggingface import HuggingFaceEmbeddings
 3 | from langchain.vectorstores import FAISS
 4 | from config import Config
 5 | from utils.AliTextSplitter import AliTextSplitter
 6 | 
 7 | 
 8 | class DocumentService(object):
 9 |     def __init__(self):
10 | 
11 |         self.config = Config.vector_store_path
12 |         self.embeddings = HuggingFaceEmbeddings(model_name=Config.embedding_model_name)
13 |         self.docs_path = Config.docs_path
14 |         self.vector_store_path = Config.vector_store_path
15 |         self.vector_store = None
16 | 
17 |     def init_source_vector(self):
18 |         """
19 |         初始化本地知识库向量
20 |         :return:
21 |         """
22 |         loader = DirectoryLoader(self.docs_path, glob="**/*.txt", loader_cls=TextLoader)
23 |         # 读取文本文件
24 |         documents = loader.load()
25 |         text_splitter = AliTextSplitter()
26 |         # 使用阿里的分段模型对文本进行分段
27 |         split_text = text_splitter.split_documents(documents)
28 |         # 采用embeding模型对文本进行向量化
29 |         self.vector_store = FAISS.from_documents(split_text, self.embeddings)
30 |         # 把结果存到faiss索引里面
31 |         self.vector_store.save_local(self.vector_store_path)
32 | 
33 |     def load_vector_store(self):
34 |         self.vector_store = FAISS.load_local(self.vector_store_path, self.embeddings)
35 | 
36 | 
37 | 
if __name__ == '__main__':
    # Split the source texts into chunks, vectorize them and store the index.
    DocumentService().init_source_vector()
42 | 


--------------------------------------------------------------------------------
/llm.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional
 2 | from langchain.llms.base import LLM
 3 | from langchain.llms.utils import enforce_stop_tokens
 4 | from transformers import AutoModel, AutoTokenizer
 5 | from config import Config
 6 | 
 7 | 
 8 | class LLMService(LLM):
 9 | 
10 |     """ 
11 |     模型服务 
12 |     """
13 |     max_token: int = 10000
14 |     temperature: float = 0.1
15 |     top_p = 0.9
16 |     history = []
17 |     tokenizer: object = None
18 |     model: object = None
19 | 
20 |     def __init__(self):
21 |         super().__init__()
22 | 
23 |     @property
24 |     def _llm_type(self) -> str:
25 |         return "LLM"
26 | 
27 |     def _call(self,
28 |               prompt: str,
29 |               stop: Optional[List[str]] = None) -> str:
30 |         response, _ = self.model.chat(
31 |             self.tokenizer,
32 |             prompt,
33 |             history=self.history,
34 |             max_length=self.max_token,
35 |             temperature=self.temperature,
36 |         )
37 |         if stop is not None:
38 |             response = enforce_stop_tokens(response, stop)
39 |         self.history = self.history + [[None, response]]
40 |         return response
41 | 
42 |     def load_model(self, model_name_or_path: str = "ClueAI/ChatYuan-large-v2"):
43 |         """
44 |         加载大模型LLM
45 |         :return:
46 |         """
47 |         self.tokenizer = AutoTokenizer.from_pretrained(
48 |             Config.llm_model_name,
49 |             trust_remote_code=True
50 |         )
51 |         self.model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)
52 |         self.model = self.model.eval()
53 | 
if __name__ == '__main__':
    # Smoke run: construct the service and load the default model.
    service = LLMService()
    service.load_model()


--------------------------------------------------------------------------------
/resource/txt/dilireba.txt:
--------------------------------------------------------------------------------
1 | 迪丽热巴（Dilraba），1992年6月3日出生于中国新疆乌鲁木齐市，毕业于上海戏剧学院，中国内地影视女演员。
2 | 2013年，迪丽热巴因主演个人首部电视剧《阿娜尔罕》而出道 [1] 。2014年，主演奇幻剧《逆光之恋》。2015年，凭借爱情剧《克拉恋人》赢得高人气，并获得国剧盛典最受欢迎新人女演员奖 [168] 。2016年，主演现代剧《麻辣变形计》 [2] ；同年，主演都市爱情励志喜剧《漂亮的李慧珍》，还凭借喜剧片《傲娇与偏见》获得中英电影节最佳新人奖 [3] 。2017年，凭借玄幻剧《三生三世十里桃花》获得白玉兰奖最佳女配角提名 [4] 。
3 | 2018年，迪丽热巴成为了金鹰电视艺术节的第七位“金鹰女神” [5] ，并获得了第29届中国电视金鹰奖观众喜爱的女演员、第12届中国金鹰电视艺术节最具人气女演员两项殊荣 [6] 。2020年，主演的玄幻剧《三生三世枕上书》播出。2021年3月，主演古装传奇剧《长歌行》 [136] 。2022年，主演的古装玄幻剧《与君初相识》 [156] 、《恰似故人归》相继播出 [157] 。


--------------------------------------------------------------------------------
/resource/txt/yami.txt:
--------------------------------------------------------------------------------
1 | 杨幂，1986年9月12日出生于北京市，中国内地影视女演员、流行乐歌手、影视制片人。
2 | 2005年，杨幂进入北京电影学院表演系本科班就读。2006年，因出演金庸武侠剧《神雕侠侣》崭露头角。2008年，凭借古装剧《王昭君》获得第24届中国电视金鹰奖观众喜爱的电视剧女演员奖提名 [1] 。2009年，在“80后新生代娱乐大明星”评选中被评为“四小花旦” [2] 。2011年，凭借穿越剧《宫锁心玉》赢得广泛关注 [3] ，并获得了第17届上海电视节白玉兰奖观众票选最具人气女演员奖 [4] 。2012年，不仅成立杨幂工作室，还凭借都市剧《北京爱情故事》获得了多项荣誉 [5-6]。
3 | 2015年，主演的《小时代》系列电影票房突破18亿人民币 [7] 。2016年，其主演的职场剧《亲爱的翻译官》取得全国年度电视剧收视冠军 [8] 。2017年，杨幂主演的神话剧《三生三世十里桃花》获得颇高关注；同年，她还凭借科幻片《逆时营救》获得休斯顿国际电影节最佳女主角奖 [9] 。2018年，凭借古装片《绣春刀Ⅱ：修罗战场》获得北京大学生电影节最受大学生欢迎女演员奖 [10] ；同年，她还获得了第五届中国电视好演员奖绿宝石女演员奖 [11] 。2021年，首次参加央视春节联欢晚会 [12] 。


--------------------------------------------------------------------------------
/utils/AliTextSplitter.py:
--------------------------------------------------------------------------------
 1 | from langchain.text_splitter import CharacterTextSplitter
 2 | import re
 3 | from typing import List
 4 | 
 5 | 
 6 | class AliTextSplitter(CharacterTextSplitter):
 7 |     def __init__(self, pdf: bool = False, **kwargs):
 8 |         super().__init__(**kwargs)
 9 |         self.pdf = pdf
10 | 
11 |     # 此处采取的文档语义分割模型为达摩院开源的nlp_bert_document-segmentation_chinese-base，论文见https://arxiv.org/abs/2107.09278
12 |     def split_text(self, text: str) -> List[str]:
13 |         if self.pdf:
14 |             text = re.sub(r"\n{3,}", r"\n", text)
15 |             text = re.sub('\s', " ", text)
16 |             text = re.sub("\n\n", "", text)
17 |         from modelscope.pipelines import pipeline
18 | 
19 |         p = pipeline(
20 |             task="document-segmentation",
21 |             model='damo/nlp_bert_document-segmentation_chinese-base',
22 |             device="cpu")
23 |         result = p(documents=text)
24 |         sent_list = [i for i in result["text"].split("\n\t") if i]
25 |         return sent_list
26 | 
27 | 
if __name__ == '__main__':
    # Construct a splitter for document segmentation.
    splitter = AliTextSplitter()
31 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wp931120/LongChainKBQA/72ccb921e4bba18ef0e6edfcf89ce8c78c78aa90/utils/__init__.py


--------------------------------------------------------------------------------