├── .gitignore ├── README.md ├── llm_classification ├── config │ └── toutiao_config.py ├── data │ └── toutiao_cat_data │ │ └── class_def.tsv ├── script │ ├── build_vec_index.py │ └── run_toutiao_cases.py └── src │ ├── classifier.py │ ├── models │ ├── llm │ │ ├── llm_model.py │ │ └── test_qwen.py │ └── vec_model │ │ ├── simcse_model.py │ │ └── vec_model.py │ ├── searcher │ ├── searcher.py │ └── vec_searcher │ │ ├── vec_index.py │ │ └── vec_searcher.py │ └── utils │ └── data_processing.py ├── mt5_summary ├── arg_config.py ├── data.py ├── mt5_summary_main.py ├── run.sh └── tools.py └── vec_searcher ├── script └── build_vec_index.py ├── searcher.py ├── utils └── data_processing.py ├── vec_model ├── simcse_model.py └── vec_model.py └── vec_searcher ├── vec_index.py └── vec_searcher.py /.gitignore: -------------------------------------------------------------------------------- 1 | # ---> Python 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | **/__pycache__/ 5 | 6 | # Distribution / packaging 7 | .Python 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | pip-wheel-metadata/ 21 | share/python-wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .nox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | *.py,cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 58 | __pypackages__/ 59 | 60 | 61 | # ---> VisualStudioCode 62 | .vscode/* 63 | .vscode/settings.json 64 | *.code-workspace 65 | 66 | # local config 67 | local_config.py 68 | 69 | # log 70 | *.log.* 71 | 72 | # .env 73 | ! default.env 74 | 75 | # .idea 配置文件 76 | .idea/ 77 | **/data/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # poc_project 2 | 3 | 通用简单工具项目,用于存放简单通用工具。 4 | 5 | - llm_classification:基于大模型的通用文本分类方案 6 | - vec_searcher:Faiss向量召回工具 -------------------------------------------------------------------------------- /llm_classification/config/toutiao_config.py: -------------------------------------------------------------------------------- 1 | VEC_MODEL_PATH = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext" 2 | # VEC_INDEX_DATA = "vec_index_toutiao_20240629" 3 | VEC_INDEX_DATA = "vec_index_toutiao_20240702_FEW" 4 | 5 | LLM_PATH = "C:/work/tool/qwen2-1.5b-instruct" 6 | LLM_CONFIG = {"max_length": 2048, 7 | "do_sample": False, 8 | "top_k": 1, 9 | "temperature": 0.8} 10 | 11 | CLASS_DEF_PATH = "data/toutiao_cat_data/class_def.tsv" 12 | 13 | PROMPT_TEMPLATE = """你是一个优秀的句子分类师,能把给定的用户query划分到正确的类目中。现在请你根据给定信息和要求,为给定用户query,从备选类目中选择最合适的类目。 14 | 15 | 下面是“参考案例”即被标注的正确结果,可供参考: 16 | 17 | 18 | 备选类目: 19 | 20 | 21 | 类目概念: 22 | 23 | 24 | 用户query: 25 | 26 | 27 | 请注意: 28 | 1. 用户query所选类目,仅能在【备选类目】中进行选择,用户query仅属于一个类目。 29 | 2. “参考案例”中的内容可供推理分析,可以仿照案例来分析用户query的所选类目。 30 | 3. 请仔细比对【备选类目】的概念和用户query的差异。 31 | 4. 如果用户quer也不属于【备选类目】中给定的类目,或者比较模糊,请选择“拒识”。 32 | 5. 
if __name__ == "__main__":
    # Build a vector index from the Toutiao headline data set and write the
    # matching test set next to the source data.

    # 0. Configuration
    # MODE selects how much data goes into the index:
    #   "DEBUG" - a shuffled 10000-sample subset, then a train/test split
    #   "FEW"   - FEW_SHOT_PER_CLASS samples per class go into the index
    #   other   - (e.g. "PRO") the full data set, then a train/test split
    MODE = "FEW"

    VERSION = "20240702"
    VEC_MODEL_PATH = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext"
    # Data source: https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset
    SOURCE_INDEX_DATA_PATH = "./data/toutiao_cat_data/toutiao_cat_data.txt"
    VEC_INDEX_DATA = "vec_index_toutiao_{}_{}".format(VERSION, MODE)
    TEST_DATA_PATH = "./data/toutiao_cat_data/test_set_{}_{}.txt".format(VERSION, MODE)
    RANDOM_SEED = 100

    DEVICE = torch.device('cuda' if torch.cuda.is_available() else "cpu")
    TEST_SIZE = 0.1
    FEW_SHOT_PER_CLASS = 10   # samples per class kept for the index in FEW mode
    FEW_TEST_SIZE = 1000      # number of samples written to the FEW-mode test set

    # Class taxonomy: [class code, Chinese display name, English name]
    CLASS_INFO = [
        ["100", '民生-故事', 'news_story'],
        ["101", '文化-文化', 'news_culture'],
        ["102", '娱乐-娱乐', 'news_entertainment'],
        ["103", '体育-体育', 'news_sports'],
        ["104", '财经-财经', 'news_finance'],
        # ["105", '时政 新时代', 'nineteenth'],
        ["106", '房产-房产', 'news_house'],
        ["107", '汽车-汽车', 'news_car'],
        ["108", '教育-教育', 'news_edu'],
        ["109", '科技-科技', 'news_tech'],
        ["110", '军事-军事', 'news_military'],
        # "111" (religion) is intentionally left out of the taxonomy
        ["112", '旅游-旅游', 'news_travel'],
        ["113", '国际-国际', 'news_world'],
        ["114", '证券-股票', 'stock'],
        ["115", '农业-三农', 'news_agriculture'],
        ["116", '电竞-游戏', 'news_game']
    ]
    ID2CN_MAPPING = {info[0]: info[1] for info in CLASS_INFO}

    # 1. Load model and data
    # 1.1 Model; the index dimension is read off one sample embedding.
    vec_model = VectorizeModel(VEC_MODEL_PATH, DEVICE)
    index_dim = len(vec_model.predict_vec("你好啊")[0])

    # 1.2 Data. Work on a deep copy so the untouched originals can still be
    # sliced for the FEW-mode test set below.
    toutiao_index_data = load_toutiao_data(SOURCE_INDEX_DATA_PATH)
    source_index_data = copy.deepcopy(toutiao_index_data)
    logger.info("load data done: {}".format(len(source_index_data)))

    few_shot_end = 0  # index of the first sample NOT scanned for the few-shot set
    if MODE == "DEBUG":
        random.shuffle(source_index_data)
        source_index_data = source_index_data[:10000]
    elif MODE == "FEW":
        class_counts = {class_id: 0 for class_id in ID2CN_MAPPING}
        target_total = len(class_counts) * FEW_SHOT_PER_CLASS
        few_shot_data = []
        for position, sample in enumerate(source_index_data):
            few_shot_end = position + 1
            class_id = sample[1][1]  # second raw field is used as the class code
            if class_counts[class_id] < FEW_SHOT_PER_CLASS:
                class_counts[class_id] += 1
                few_shot_data.append(sample)
            if len(few_shot_data) >= target_total:
                break
        source_index_data = few_shot_data

    # Append the Chinese class name as an extra trailing field of each record.
    for item in source_index_data:
        item[1].append(ID2CN_MAPPING[item[1][1]])

    # 1.3 Train / test split
    if MODE != "FEW":
        train_list, test_list = train_test_split(source_index_data, test_size=TEST_SIZE, random_state=66)
    else:
        train_list = source_index_data
        # Start AFTER the last scanned sample: the previous slice began at the
        # last sample added to the index, leaking it into the test set.
        test_list = toutiao_index_data[few_shot_end:few_shot_end + FEW_TEST_SIZE]
        for item in test_list:
            item[1].append(ID2CN_MAPPING[item[1][1]])

    # 2. Create the index and fill it
    # 2.1 Build an empty index
    vec_searcher = VecSearcher()
    vec_searcher.build(index_dim, VEC_INDEX_DATA)

    # 2.2 Embed every training sample: [text, raw fields, vector]
    vectorize_result = []
    for sample in tqdm(train_list, desc="VEC MODEL RUNNING"):
        vec = vec_model.predict_vec(sample[0]).cpu().numpy()
        record = copy.deepcopy(sample)
        record.append(vec)
        vectorize_result.append(record)

    # 2.3 Insert: the vector goes to the index, [text, raw fields] is the document
    for record in tqdm(vectorize_result, desc="INSERT INTO INDEX"):
        vec_searcher.insert(record[2], record[:2])

    # 3. Persist
    # 3.1 Index
    vec_searcher.save()
    # 3.2 Test set, one "_!_"-joined record per line
    with open(TEST_DATA_PATH, "w", encoding="utf8") as f:
        for item in test_list:
            f.write("_!_".join(item[1]) + "\n")
26 | 27 | with open(OUTPUT_DATA_PATH, "w", encoding="utf8") as fout: 28 | for idx in range(len(test_data)): 29 | fout.write("{}\t{}\t{}\t{}\n".format(test_data[idx][0], 30 | test_list[idx], 31 | pred_list[idx], 32 | test_list[idx]==pred_list[idx])) -------------------------------------------------------------------------------- /llm_classification/src/classifier.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Filename: classifier.py 3 | # Author: ZENGGUANRONG 4 | # Date: 2024-06-25 5 | # description: 分类器主函数 6 | 7 | import copy 8 | import torch 9 | from loguru import logger 10 | 11 | from config.toutiao_config import (VEC_INDEX_DATA, VEC_MODEL_PATH, 12 | LLM_CONFIG, LLM_PATH, PROMPT_TEMPLATE,CLASS_DEF_PATH) 13 | from src.searcher.searcher import Searcher 14 | from src.models.llm.llm_model import QWen2Model 15 | from src.utils.data_processing import load_class_def 16 | 17 | class VecLlmClassifier: 18 | def __init__(self) -> None: 19 | self.searcher = Searcher(VEC_MODEL_PATH, VEC_INDEX_DATA) 20 | self.device = torch.device('cuda' if torch.cuda.is_available() else "cpu") 21 | self.llm = QWen2Model(LLM_PATH, LLM_CONFIG, self.device) 22 | self.PROMPT_TEMPLATE = PROMPT_TEMPLATE 23 | self.class_def = load_class_def(CLASS_DEF_PATH) 24 | 25 | def predict(self, query): 26 | # 1. query预处理 27 | logger.info("request: {}".format(query)) 28 | # 2. query向量召回 29 | recall_result = self.searcher.search(query, nums=5) 30 | # logger.debug(recall_result) 31 | 32 | # 3. 
请求大模型 33 | # 3.1 PROMPT拼接 34 | request_prompt= copy.deepcopy(self.PROMPT_TEMPLATE) 35 | # 3.1.1 子模块拼接 36 | examples = [] 37 | options = [] 38 | options_detail = [] 39 | for item in recall_result: 40 | tmp_examples = "——".join([item[1][0], item[1][1][5]]) 41 | if tmp_examples not in examples: 42 | examples.append(tmp_examples) 43 | opt_detail_str = ":".join(["【" + item[1][1][5] + "】",self.class_def[item[1][1][5]]]) 44 | opt = item[1][1][5] 45 | if opt not in options: 46 | options.append(opt) 47 | options_detail.append(opt_detail_str) 48 | # options.append("拒识:含义不明或用户query所属类目不在列举内时,分为此类") 49 | examples_str = "\n".join(examples) 50 | options_str = ",".join(options) 51 | options_detail_str = "\n".join(options_detail) 52 | 53 | # 3.1.2 整体组装 54 | request_prompt = request_prompt.replace("", examples_str) 55 | request_prompt = request_prompt.replace("", options_str) 56 | request_prompt = request_prompt.replace("", options_detail_str) 57 | request_prompt = request_prompt.replace("", query) 58 | # logger.info(request_prompt) 59 | 60 | # 3.2 请求大模型 61 | llm_response = self.llm.predict(request_prompt) 62 | # logger.info("llm response: {}".format(llm_response)) 63 | 64 | # 3.3 大模型结果解析 65 | result = "拒识" 66 | for option in options: 67 | if option in llm_response: 68 | result = option 69 | break 70 | # logger.info("parse result: {}".format(result)) 71 | 72 | # 4. 
class QWen2Model:
    """Wrapper around a Hugging Face causal LM driven through its chat template.

    The model is loaded with ``device_map="auto"``, so its placement is decided
    at load time. Inputs are therefore moved to ``self.model.device`` rather
    than to the ``device`` argument, which is only stored on ``self.device``.
    """

    def __init__(self, model_path, config=None, device="cuda"):
        """Load model and tokenizer from ``model_path``.

        Args:
            model_path: Path or identifier passed to ``from_pretrained``.
            config: Optional dict of generation settings, see ``_read_config_``.
                ``None`` (the default) means "use the built-in defaults".
            device: Kept for backward compatibility and stored on ``self.device``.
        """
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype="auto",
            device_map="auto"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = self.model.eval()
        self.device = device

        self.generate_config = self._read_config_(config)
        logger.info("load LLM Model done")

    def _read_config_(self, config):
        """Return the keyword arguments forwarded to ``generate``.

        Only the keys listed below are read; anything else in ``config``
        (for example ``max_length``) is ignored. ``config`` is not modified.
        """
        config = {} if config is None else config
        return {
            "num_beams": config.get("num_beams", 1),
            "do_sample": config.get("do_sample", False),
            "top_k": config.get("top_k", 1),
            "temperature": config.get("temperature", 0.8),
            # Previously hard-coded in predict(); 512 keeps the old behaviour.
            "max_new_tokens": config.get("max_new_tokens", 512),
        }

    def predict(self, query):
        """Send ``query`` as a single user turn and return the decoded reply text."""
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": query}
        ]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        # Follow the model's actual placement instead of the caller-supplied
        # device, so a CPU-only host does not fail on the "cuda" default.
        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.model.device)

        generated_ids = self.model.generate(
            model_inputs.input_ids,
            attention_mask=model_inputs.attention_mask,
            pad_token_id=self.tokenizer.eos_token_id,
            **self.generate_config
        )
        # Slice the prompt tokens off the front of each returned sequence,
        # keeping only the newly generated part.
        generated_ids = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]

        return self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
class VectorizeModel:
    """Sentence encoder built from a BERT tokenizer and a ``SimcseModel``."""

    def __init__(self, ptm_model_path, device = "cpu") -> None:
        """Load tokenizer and model from ``ptm_model_path``.

        NOTE: ``device`` is accepted but not used; the model always goes to
        CUDA when it is available and to the CPU otherwise.
        """
        self.tokenizer = BertTokenizer.from_pretrained(ptm_model_path)

        encoder = SimcseModel(pretrained_bert_path=ptm_model_path, pooling="cls")
        encoder.eval()
        self.model = encoder

        target = 'cuda' if torch.cuda.is_available() else "cpu"
        self.DEVICE = torch.device(target)
        logger.info(self.DEVICE)
        self.model.to(self.DEVICE)

        # Created here; not used by any method of this class.
        self.pdist = nn.PairwiseDistance(2)

    def predict_vec(self,query):
        """Tokenize ``query`` (padded/truncated to 200 tokens) and return the model output."""
        encoded = self.tokenizer(
            query,
            max_length = 200,
            truncation=True,
            padding="max_length",
            return_tensors='pt',
        )
        with torch.no_grad():
            model_args = [
                encoded[field].squeeze(1).to(self.DEVICE)
                for field in ("input_ids", "attention_mask", "token_type_ids")
            ]
            embedding = self.model(*model_args)

        return embedding

    def predict_vec_request(self, query):
        """Same as ``predict_vec`` but returned as nested Python lists."""
        return self.predict_vec(query).cpu().numpy().tolist()

    def predict_sim(self, q1, q2):
        """Cosine similarity between the first output row for ``q1`` and for ``q2``."""
        left = self.predict_vec(q1)[0]
        right = self.predict_vec(q2)[0]
        score = F.cosine_similarity(left, right, dim=-1)
        return score.cpu().numpy().tolist()
63 | } 64 | return torch.tensor(self.model.run(None, input_feed=input_feed)[0]) 65 | 66 | def predict_sim(self, q1, q2): 67 | q1_v = self.predict_vec(q1) 68 | q2_v = self.predict_vec(q2) 69 | sim = F.cosine_similarity(q1_v[0], q2_v[0], dim=-1) 70 | return sim.numpy().tolist() 71 | 72 | if __name__ == "__main__": 73 | import time,random 74 | from tqdm import tqdm 75 | device = torch.device('cuda' if torch.cuda.is_available() else "cpu") 76 | # device = "" 77 | # vec_model = VectorizeModel('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext', device=device) 78 | vec_model = VectorizeModel_v2('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext', 79 | "./data/model_simcse_roberta_output_20240211.onnx",providers=['CUDAExecutionProvider']) 80 | # vec_model = VectorizeModel_v2('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext', 81 | # "./data/model_simcse_roberta_output_20240211.onnx",providers=['TensorrtExecutionProvider']) 82 | # 单测 83 | # q = ["你好啊"] 84 | # print(vec_model.predict_vec(q)) 85 | # print(vec_model.predict_sim("你好呀","你好啊")) 86 | tmp_queries = ["你好啊", "今天天气怎么样", "我要暴富"] 87 | # 开始批跑 88 | batch_sizes = [1,2,4,8,16] 89 | for b in batch_sizes: 90 | for i in tqdm(range(100),desc="warmup"): 91 | tmp_q = [] 92 | for i in range(b): 93 | tmp_q.append(random.choice(tmp_queries)) 94 | vec_model.predict_vec(tmp_q) 95 | for i in tqdm(range(1000),desc="batch_size={}".format(b)): 96 | tmp_q = [] 97 | for i in range(b): 98 | tmp_q.append(random.choice(tmp_queries)) 99 | vec_model.predict_vec(tmp_q) 100 | -------------------------------------------------------------------------------- /llm_classification/src/searcher/searcher.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Filename: searcher.py 3 | # Author: ZENGGUANRONG 4 | # Date: 2023-12-12 5 | # description: 核心检索器 6 | 7 | import json,requests,copy 8 | import numpy as np 9 | from loguru import logger 10 | from 
class Searcher:
    """Vector recall followed by a cosine-similarity re-rank."""

    def __init__(self, model_path, vec_search_path):
        """Load the embedding model and the stored index named ``vec_search_path``."""
        self.vec_model = VectorizeModel(model_path)
        logger.info("load vec_model done")

        searcher = VecSearcher()
        searcher.load(vec_search_path)
        self.vec_searcher = searcher
        logger.info("load vec_searcher done")

    def rank(self, query, recall_result):
        """Return copies of the recalled items, each with a similarity appended.

        The similarity between ``query`` and ``item[1][0]`` becomes element 3
        of the copy; the result is sorted by it, highest first. The input
        list and its items are left untouched.
        """
        scored_items = []
        for hit in recall_result:
            similarity = self.vec_model.predict_sim(query, hit[1][0])
            scored = copy.deepcopy(hit)
            scored.append(similarity)
            scored_items.append(scored)
        return sorted(scored_items, key=lambda entry: entry[3], reverse=True)

    def search(self, query, nums=3):
        """Embed ``query``, recall ``nums`` neighbours and return them re-ranked."""
        query_vec = self.vec_model.predict_vec(query).cpu().numpy()
        candidates = self.vec_searcher.search(query_vec, nums)
        return self.rank(query, candidates)
class VecIndex:
    """Small wrapper around a FAISS index object."""

    def __init__(self) -> None:
        # Placeholder until build() or load() installs a real index.
        self.index = ""

    def build(self, index_dim):
        """Create a new "HNSW64" index of dimension ``index_dim`` with the L2 metric."""
        self.index = faiss.index_factory(index_dim, "HNSW64", faiss.METRIC_L2)

    def insert(self, vec):
        """Add ``vec`` to the index."""
        self.index.add(vec)

    def batch_insert(self, vecs):
        """Add ``vecs`` to the index (same underlying call as ``insert``)."""
        self.index.add(vecs)

    def load(self, read_path):
        """Replace the current index with the one stored at ``read_path`` (XXX.index)."""
        self.index = faiss.read_index(read_path)

    def save(self, save_path):
        """Write the current index to ``save_path`` (XXX.index)."""
        faiss.write_index(self.index, save_path)

    def search(self, vec, num):
        """Return the raw result of the index's ``search(vec, num)`` call."""
        result = self.index.search(vec, num)
        return result
def parse_args(argv=None):
    """Parse the command-line options of the summarisation script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before.

    Returns:
        The ``argparse.Namespace`` holding all options.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )
    parser.add_argument("--train_file", default=None, type=str, required=True, help="The input training file.")
    parser.add_argument("--dev_file", default=None, type=str, required=True, help="The input evaluation file.")
    parser.add_argument("--test_file", default=None, type=str, required=True, help="The input testing file.")

    parser.add_argument("--model_type",
        default="bert", type=str, required=True
    )
    parser.add_argument("--model_checkpoint",
        default="bert-large-cased/", type=str, required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument("--max_input_length", default=256, type=int, required=True)
    parser.add_argument("--max_target_length", default=256, type=int, required=True)

    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_test", action="store_true", help="Whether to run eval on the test set.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to save predicted labels.")

    # Other parameters
    parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.")
    parser.add_argument("--batch_size", default=4, type=int)
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--beam_search_size", default=4, type=int)
    parser.add_argument("--no_repeat_ngram_size", default=2, type=int)

    parser.add_argument("--adam_beta1", default=0.9, type=float,
        help="Beta1 for Adam optimizer."
    )
    parser.add_argument("--adam_beta2", default=0.98, type=float,
        help="Beta2 for Adam optimizer."
    )
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
        help="Epsilon for Adam optimizer."
    )
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; the unescaped form made --help raise.
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for,E.g., 0.1 = 10%% of training."
    )
    parser.add_argument("--weight_decay", default=0.01, type=float,
        help="Weight decay if we apply some."
    )
    args = parser.parse_args(argv)
    return args
return len(self.data) 33 | 34 | def __getitem__(self, idx): 35 | return self.data[idx] 36 | 37 | def get_dataLoader(args, dataset, model, tokenizer, batch_size=None, shuffle=False): 38 | 39 | def collote_fn(batch_samples): 40 | batch_inputs, batch_targets = [], [] 41 | for sample in batch_samples: 42 | batch_inputs.append(sample['content']) 43 | batch_targets.append(sample['title']) 44 | batch_data = tokenizer( 45 | batch_inputs, 46 | padding=True, 47 | max_length=args.max_input_length, 48 | truncation=True, 49 | return_tensors="pt" 50 | ) 51 | with tokenizer.as_target_tokenizer(): 52 | labels = tokenizer( 53 | batch_targets, 54 | padding=True, 55 | max_length=args.max_target_length, 56 | truncation=True, 57 | return_tensors="pt" 58 | )["input_ids"] 59 | batch_data['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(labels) 60 | end_token_index = torch.where(labels == tokenizer.eos_token_id)[1] 61 | for idx, end_idx in enumerate(end_token_index): 62 | labels[idx][end_idx+1:] = -100 63 | batch_data['labels'] = labels 64 | return batch_data 65 | 66 | return DataLoader(dataset, batch_size=(batch_size if batch_size else args.batch_size), shuffle=shuffle, 67 | collate_fn=collote_fn) -------------------------------------------------------------------------------- /mt5_summary/mt5_summary_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import json 4 | from tqdm.auto import tqdm 5 | import numpy as np 6 | import torch 7 | from transformers import AdamW, get_scheduler 8 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 9 | from rouge import Rouge 10 | import sys 11 | 12 | from tools import seed_everything 13 | from arg_config import parse_args 14 | from data import LCSTS, get_dataLoader 15 | 16 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 17 | datefmt='%Y/%m/%d %H:%M:%S', 18 | level=logging.INFO) 19 | logger = 
logging.getLogger("Model") 20 | 21 | def train_loop(args, dataloader, model, optimizer, lr_scheduler, epoch, total_loss): 22 | progress_bar = tqdm(range(len(dataloader))) 23 | progress_bar.set_description(f'loss: {0:>7f}') 24 | finish_batch_num = epoch * len(dataloader) 25 | 26 | model.train() 27 | for batch, batch_data in enumerate(dataloader, start=1): 28 | batch_data = batch_data.to(args.device) 29 | outputs = model(**batch_data) 30 | loss = outputs.loss 31 | 32 | optimizer.zero_grad() 33 | loss.backward() 34 | optimizer.step() 35 | lr_scheduler.step() 36 | 37 | total_loss += loss.item() 38 | progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}') 39 | progress_bar.update(1) 40 | return total_loss 41 | 42 | def test_loop(args, dataloader, model, tokenizer): 43 | preds, labels = [], [] 44 | rouge = Rouge() 45 | 46 | model.eval() 47 | with torch.no_grad(): 48 | for batch_data in tqdm(dataloader): 49 | batch_data = batch_data.to(args.device) 50 | generated_tokens = model.generate( 51 | batch_data["input_ids"], 52 | attention_mask=batch_data["attention_mask"], 53 | max_length=args.max_target_length, 54 | num_beams=args.beam_search_size, 55 | no_repeat_ngram_size=args.no_repeat_ngram_size, 56 | ).cpu().numpy() 57 | if isinstance(generated_tokens, tuple): 58 | generated_tokens = generated_tokens[0] 59 | label_tokens = batch_data["labels"].cpu().numpy() 60 | 61 | decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False) 62 | label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id) 63 | decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False) 64 | 65 | preds += [' '.join(pred.strip()) for pred in decoded_preds] 66 | labels += [' '.join(label.strip()) for label in decoded_labels] 67 | scores = rouge.get_scores(hyps=preds, refs=labels, avg=True) 68 | result = {key: value['f'] * 100 for key, value in 
scores.items()} 69 | result['avg'] = np.mean(list(result.values())) 70 | return result 71 | 72 | def train(args, train_dataset, dev_dataset, model, tokenizer): 73 | """ Train the model """ 74 | train_dataloader = get_dataLoader(args, train_dataset, model, tokenizer, shuffle=True) 75 | dev_dataloader = get_dataLoader(args, dev_dataset, model, tokenizer, shuffle=False) 76 | t_total = len(train_dataloader) * args.num_train_epochs 77 | # Prepare optimizer and schedule (linear warmup and decay) 78 | no_decay = ["bias", "LayerNorm.weight"] 79 | optimizer_grouped_parameters = [ 80 | {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay}, 81 | {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0} 82 | ] 83 | args.warmup_steps = int(t_total * args.warmup_proportion) 84 | optimizer = AdamW( 85 | optimizer_grouped_parameters, 86 | lr=args.learning_rate, 87 | betas=(args.adam_beta1, args.adam_beta2), 88 | eps=args.adam_epsilon 89 | ) 90 | lr_scheduler = get_scheduler( 91 | 'linear', 92 | optimizer, 93 | num_warmup_steps=args.warmup_steps, 94 | num_training_steps=t_total 95 | ) 96 | # Train! 97 | logger.info("***** Running training *****") 98 | logger.info(f"Num examples - {len(train_dataset)}") 99 | logger.info(f"Num Epochs - {args.num_train_epochs}") 100 | logger.info(f"Total optimization steps - {t_total}") 101 | with open(os.path.join(args.output_dir, 'args.txt'), 'wt') as f: 102 | f.write(str(args)) 103 | 104 | total_loss = 0. 105 | best_avg_rouge = 0. 
106 | for epoch in range(args.num_train_epochs): 107 | print(f"Epoch {epoch+1}/{args.num_train_epochs}\n" + 30 * "-") 108 | total_loss = train_loop(args, train_dataloader, model, optimizer, lr_scheduler, epoch, total_loss) 109 | dev_rouges = test_loop(args, dev_dataloader, model, tokenizer) 110 | logger.info(f"Dev Rouge1: {dev_rouges['rouge-1']:>0.2f} Rouge2: {dev_rouges['rouge-2']:>0.2f} RougeL: {dev_rouges['rouge-l']:>0.2f}") 111 | rouge_avg = dev_rouges['avg'] 112 | if rouge_avg > best_avg_rouge: 113 | best_avg_rouge = rouge_avg 114 | logger.info(f'saving new weights to {args.output_dir}...\n') 115 | save_weight = f'epoch_{epoch+1}_dev_rouge_avg_{rouge_avg:0.4f}_weights.bin' 116 | torch.save(model.state_dict(), os.path.join(args.output_dir, save_weight)) 117 | logger.info("Done!") 118 | 119 | def test(args, test_dataset, model, tokenizer, save_weights:list): 120 | test_dataloader = get_dataLoader(args, test_dataset, model, tokenizer, shuffle=False) 121 | logger.info('***** Running testing *****') 122 | for save_weight in save_weights: 123 | logger.info(f'loading weights from {save_weight}...') 124 | model.load_state_dict(torch.load(os.path.join(args.output_dir, save_weight))) 125 | test_rouges = test_loop(args, test_dataloader, model, tokenizer) 126 | logger.info(f"Test Rouge1: {test_rouges['rouge-1']:>0.2f} Rouge2: {test_rouges['rouge-2']:>0.2f} RougeL: {test_rouges['rouge-l']:>0.2f}") 127 | 128 | def predict(args, document:str, model, tokenizer): 129 | inputs = tokenizer( 130 | document, 131 | max_length=args.max_input_length, 132 | truncation=True, 133 | return_tensors="pt" 134 | ) 135 | inputs = inputs.to(args.device) 136 | with torch.no_grad(): 137 | generated_tokens = model.generate( 138 | inputs["input_ids"], 139 | attention_mask=inputs["attention_mask"], 140 | max_length=args.max_target_length, 141 | num_beams=args.beam_search_size, 142 | no_repeat_ngram_size=args.no_repeat_ngram_size, 143 | ).cpu().numpy() 144 | if isinstance(generated_tokens, tuple): 
145 | generated_tokens = generated_tokens[0] 146 | decoded_preds = tokenizer.decode( 147 | generated_tokens[0], 148 | skip_special_tokens=True, 149 | clean_up_tokenization_spaces=False 150 | ) 151 | return decoded_preds 152 | 153 | if __name__ == '__main__': 154 | args = parse_args() 155 | if args.do_train and os.path.exists(args.output_dir) and os.listdir(args.output_dir): 156 | raise ValueError(f'Output directory ({args.output_dir}) already exists and is not empty.') 157 | if not os.path.exists(args.output_dir): 158 | os.mkdir(args.output_dir) 159 | args.device = 'cuda' if torch.cuda.is_available() else 'cpu' 160 | args.n_gpu = torch.cuda.device_count() 161 | logger.warning(f'Using {args.device} device, n_gpu: {args.n_gpu}') 162 | # Set seed 163 | seed_everything(args.seed) 164 | # Load pretrained model and tokenizer 165 | logger.info(f'loading pretrained model and tokenizer of {args.model_type} ...') 166 | tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint) 167 | model = AutoModelForSeq2SeqLM.from_pretrained(args.model_checkpoint).to(args.device) 168 | # Training 169 | if args.do_train: 170 | # Set seed 171 | seed_everything(args.seed) 172 | train_dataset = LCSTS(args.train_file) 173 | dev_dataset = LCSTS(args.dev_file) 174 | train(args, train_dataset, dev_dataset, model, tokenizer) 175 | # Testing 176 | save_weights = [file for file in os.listdir(args.output_dir) if file.endswith('.bin')] 177 | if args.do_test: 178 | test_dataset = LCSTS(args.test_file) 179 | test(args, test_dataset, model, tokenizer, save_weights) 180 | # Predicting 181 | if args.do_predict: 182 | test_dataset = LCSTS(args.test_file) 183 | for save_weight in save_weights: 184 | logger.info(f'loading weights from {save_weight}...') 185 | model.load_state_dict(torch.load(os.path.join(args.output_dir, save_weight))) 186 | logger.info(f'predicting labels of {save_weight}...') 187 | 188 | results = [] 189 | model.eval() 190 | for s_idx in tqdm(range(len(test_dataset))): 191 | sample = 
test_dataset[s_idx] 192 | pred_summ = predict(args, sample['content'], model, tokenizer) 193 | results.append({ 194 | "sentence": sample['content'], 195 | "prediction": pred_summ, 196 | "summarization": sample['title'] 197 | }) 198 | with open(os.path.join(args.output_dir, save_weight + '_test_data_pred.json'), 'wt', encoding='utf-8') as f: 199 | for exapmle_result in results: 200 | f.write(json.dumps(exapmle_result, ensure_ascii=False) + '\n') -------------------------------------------------------------------------------- /mt5_summary/run.sh: -------------------------------------------------------------------------------- 1 | export OUTPUT_DIR=./summ_mt5_results/ 2 | 3 | python3 run_summarization_mt5.py \ 4 | --output_dir=$OUTPUT_DIR \ 5 | --model_type=mT5 \ 6 | --model_checkpoint=csebuetnlp/mT5_multilingual_XLSum \ 7 | --train_file=../../data/lcsts_tsv/data1.tsv \ 8 | --dev_file=../../data/lcsts_tsv/data2.tsv \ 9 | --test_file=../../data/lcsts_tsv/data3.tsv \ 10 | --max_input_length=512 \ 11 | --max_target_length=32 \ 12 | --learning_rate=1e-5 \ 13 | --num_train_epochs=3 \ 14 | --batch_size=32 \ 15 | --beam_search_size=4 \ 16 | --no_repeat_ngram_size=2 \ 17 | --do_train \ 18 | --warmup_proportion=0. 
\ 19 | --seed=42 -------------------------------------------------------------------------------- /mt5_summary/tools.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Filename: tools.py 3 | # Author: ZENGGUANRONG 4 | # Date: 2024-10-06 5 | # description: 关键工具 6 | # reference: https://github.com/jsksxs360/How-to-use-Transformers 7 | import random 8 | import os 9 | import numpy as np 10 | import torch 11 | 12 | def seed_everything(seed=1029): 13 | random.seed(seed) 14 | os.environ['PYTHONHASHSEED'] = str(seed) 15 | np.random.seed(seed) 16 | torch.manual_seed(seed) 17 | torch.cuda.manual_seed(seed) 18 | torch.cuda.manual_seed_all(seed) 19 | # some cudnn methods can be random even after fixing the seed 20 | # unless you tell it to be deterministic 21 | torch.backends.cudnn.deterministic = True -------------------------------------------------------------------------------- /vec_searcher/script/build_vec_index.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Filename: build_vec_index.py 3 | # Author: ZENGGUANRONG 4 | # Date: 2024-09-07 5 | # description: 构造向量索引脚本 6 | 7 | import json,torch,copy,random 8 | from tqdm import tqdm 9 | from loguru import logger 10 | 11 | from utils.data_processing import load_toutiao_data 12 | from vec_model.vec_model import VectorizeModel 13 | from vec_searcher.vec_searcher import VecSearcher 14 | 15 | if __name__ == "__main__": 16 | # 0. 
必要配置 17 | MODE = "DEBUG" 18 | 19 | VERSION = "20240907" 20 | VEC_MODEL_PATH = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext" 21 | SOURCE_INDEX_DATA_PATH = "./data/toutiao_cat_data/toutiao_cat_data.txt" # 数据来源:https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset 22 | VEC_INDEX_DATA = "vec_index_toutiao_{}_{}".format(VERSION,MODE) 23 | # TESE_DATA_PATH = "./data/toutiao_cat_data/test_set_{}_{}.txt".format(VERSION,MODE) 24 | RANDOM_SEED = 100 25 | 26 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else "cpu") 27 | # TEST_SIZE = 0.1 28 | # 类目体系 29 | CLASS_INFO = [ 30 | ["100", '民生-故事', 'news_story'], 31 | ["101", '文化-文化', 'news_culture'], 32 | ["102", '娱乐-娱乐', 'news_entertainment'], 33 | ["103", '体育-体育', 'news_sports'], 34 | ["104", '财经-财经', 'news_finance'], 35 | # ["105", '时政 新时代', 'nineteenth'], 36 | ["106", '房产-房产', 'news_house'], 37 | ["107", '汽车-汽车', 'news_car'], 38 | ["108", '教育-教育', 'news_edu' ], 39 | ["109", '科技-科技', 'news_tech'], 40 | ["110", '军事-军事', 'news_military'], 41 | # ["111" 宗教 无,凤凰佛教等来源], 42 | ["112", '旅游-旅游', 'news_travel'], 43 | ["113", '国际-国际', 'news_world'], 44 | ["114", '证券-股票', 'stock'], 45 | ["115", '农业-三农', 'news_agriculture'], 46 | ["116", '电竞-游戏', 'news_game'] 47 | ] 48 | ID2CN_MAPPING = {} 49 | for idx in range(len(CLASS_INFO)): 50 | ID2CN_MAPPING[CLASS_INFO[idx][0]] = CLASS_INFO[idx][1] 51 | 52 | # 1. 加载数据、模型 53 | # 1.1 加载模型 54 | vec_model = VectorizeModel(VEC_MODEL_PATH, DEVICE) 55 | index_dim = len(vec_model.predict_vec("你好啊")[0]) 56 | # 1.2 加载数据 57 | toutiao_index_data = load_toutiao_data(SOURCE_INDEX_DATA_PATH) 58 | source_index_data = copy.deepcopy(toutiao_index_data) 59 | logger.info("load data done: {}".format(len(source_index_data))) 60 | if MODE == "DEBUG": 61 | random.shuffle(source_index_data) 62 | source_index_data = source_index_data[:10000] 63 | 64 | # 2. 
创建索引并灌入数据 65 | # 2.1 构造索引 66 | vec_searcher = VecSearcher() 67 | vec_searcher.build(index_dim, VEC_INDEX_DATA) 68 | 69 | # 2.2 推理向量 70 | vectorize_result = [] 71 | for q in tqdm(source_index_data, desc="VEC MODEL RUNNING"): 72 | vec = vec_model.predict_vec(q[0]).cpu().numpy() 73 | tmp_result = copy.deepcopy(q) 74 | tmp_result.append(vec) 75 | vectorize_result.append(copy.deepcopy(tmp_result)) 76 | 77 | # 2.3 开始存入 78 | for idx in tqdm(range(len(vectorize_result)), desc="INSERT INTO INDEX"): 79 | vec_searcher.insert(vectorize_result[idx][2], vectorize_result[idx][:2]) 80 | 81 | # 3. 保存 82 | # 3.1 索引保存 83 | vec_searcher.save() 84 | logger.info("build done: {}".format(VEC_INDEX_DATA)) 85 | -------------------------------------------------------------------------------- /vec_searcher/searcher.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Filename: searcher.py 3 | # Author: ZENGGUANRONG 4 | # Date: 2023-12-12 5 | # description: 核心检索器 6 | 7 | import json,requests,copy 8 | import numpy as np 9 | from loguru import logger 10 | from vec_searcher.vec_searcher import VecSearcher 11 | from vec_model.vec_model import VectorizeModel 12 | 13 | class Searcher: 14 | def __init__(self, model_path, vec_search_path): 15 | self.vec_model = VectorizeModel(model_path) 16 | logger.info("load vec_model done") 17 | 18 | self.vec_searcher = VecSearcher() 19 | self.vec_searcher.load(vec_search_path) 20 | logger.info("load vec_searcher done") 21 | 22 | def rank(self, query, recall_result): 23 | rank_result = [] 24 | for idx in range(len(recall_result)): 25 | new_sim = self.vec_model.predict_sim(query, recall_result[idx][1][0]) 26 | rank_item = copy.deepcopy(recall_result[idx]) 27 | rank_item.append(new_sim) 28 | rank_result.append(copy.deepcopy(rank_item)) 29 | rank_result.sort(key=lambda x: x[3], reverse=True) 30 | return rank_result 31 | 32 | def search(self, query, nums=3): 33 | logger.info("request: {}".format(query)) 34 | 35 | 
q_vec = self.vec_model.predict_vec(query).cpu().numpy() 36 | 37 | recall_result = self.vec_searcher.search(q_vec, nums) 38 | 39 | rank_result = self.rank(query, recall_result) 40 | # rank_result = list(filter(lambda x:x[4] > 0.8, rank_result)) 41 | 42 | logger.info("response: {}".format(rank_result)) 43 | return rank_result 44 | 45 | if __name__ == "__main__": 46 | VEC_MODEL_PATH = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext" 47 | VEC_INDEX_DATA = "vec_index_toutiao_20240702_DEBUG" 48 | searcher = Searcher(VEC_MODEL_PATH, VEC_INDEX_DATA) 49 | q = "小产权房" 50 | print(searcher.search(q)) -------------------------------------------------------------------------------- /vec_searcher/utils/data_processing.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Filename: data_processing.py 3 | # Author: ZENGGUANRONG 4 | # Date: 2024-06-25 5 | # description: 数据处理函数 6 | 7 | def load_toutiao_data(path): 8 | source_data = [] 9 | with open(path, encoding="utf8") as f: 10 | for line in f: 11 | ll = line.strip().split("_!_") # 新闻ID,分类code,分类名称,新闻字符串(仅含标题),新闻关键词 12 | source_data.append([ll[3], ll]) 13 | return source_data 14 | 15 | def load_class_def(path): 16 | source_data = {} 17 | with open(path, encoding="utf8") as f: 18 | for line in f: 19 | ll = line.strip().split("\t") 20 | source_data[ll[0]] = ll[1] 21 | return source_data -------------------------------------------------------------------------------- /vec_searcher/vec_model/simcse_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from loguru import logger 4 | from tqdm import tqdm 5 | from transformers import BertConfig, BertModel, BertTokenizer 6 | 7 | class SimcseModel(nn.Module): 8 | # https://blog.csdn.net/qq_44193969/article/details/126981581 9 | def __init__(self, pretrained_bert_path, pooling="cls") -> None: 10 | super(SimcseModel, self).__init__() 11 | 
12 | self.pretrained_bert_path = pretrained_bert_path 13 | self.config = BertConfig.from_pretrained(self.pretrained_bert_path) 14 | 15 | self.model = BertModel.from_pretrained(self.pretrained_bert_path, config=self.config) 16 | self.model.eval() 17 | 18 | # self.model = None 19 | self.pooling = pooling 20 | 21 | def forward(self, input_ids, attention_mask, token_type_ids): 22 | out = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) 23 | 24 | return out.last_hidden_state[:, 0] -------------------------------------------------------------------------------- /vec_searcher/vec_model/vec_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from loguru import logger 5 | 6 | from transformers import BertTokenizer 7 | 8 | from vec_model.simcse_model import SimcseModel 9 | 10 | import onnxruntime as ort 11 | 12 | class VectorizeModel: 13 | def __init__(self, ptm_model_path, device = "cpu") -> None: 14 | self.tokenizer = BertTokenizer.from_pretrained(ptm_model_path) 15 | self.model = SimcseModel(pretrained_bert_path=ptm_model_path, pooling="cls") 16 | # print(self.model) 17 | self.model.eval() 18 | 19 | self.DEVICE = torch.device('cuda' if torch.cuda.is_available() else "cpu") 20 | # self.DEVICE = device 21 | logger.info(self.DEVICE) 22 | self.model.to(self.DEVICE) 23 | 24 | self.pdist = nn.PairwiseDistance(2) 25 | 26 | def predict_vec(self,query): 27 | q_id = self.tokenizer(query, max_length = 200, truncation=True, padding="max_length", return_tensors='pt') 28 | with torch.no_grad(): 29 | q_id_input_ids = q_id["input_ids"].squeeze(1).to(self.DEVICE) 30 | q_id_attention_mask = q_id["attention_mask"].squeeze(1).to(self.DEVICE) 31 | q_id_token_type_ids = q_id["token_type_ids"].squeeze(1).to(self.DEVICE) 32 | q_id_pred = self.model(q_id_input_ids, q_id_attention_mask, q_id_token_type_ids) 33 | 34 | return q_id_pred 35 | 36 | 
def predict_vec_request(self, query): 37 | q_id_pred = self.predict_vec(query) 38 | return q_id_pred.cpu().numpy().tolist() 39 | 40 | def predict_sim(self, q1, q2): 41 | q1_v = self.predict_vec(q1) 42 | q2_v = self.predict_vec(q2) 43 | sim = F.cosine_similarity(q1_v[0], q2_v[0], dim=-1) 44 | return sim.cpu().numpy().tolist() 45 | 46 | class VectorizeModel_v2(VectorizeModel): 47 | def __init__(self, ptm_model_path, onnx_path, providers=['CUDAExecutionProvider']) -> None: 48 | # ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider'] 49 | self.tokenizer = BertTokenizer.from_pretrained(ptm_model_path) 50 | self.model = ort.InferenceSession(onnx_path, providers=providers) 51 | 52 | self.pdist = nn.PairwiseDistance(2) 53 | 54 | def _to_numpy(self, tensor): 55 | return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy() 56 | 57 | def predict_vec(self,query): 58 | q_id = self.tokenizer(query, max_length = 200, truncation=True, padding="max_length", return_tensors='pt') 59 | input_feed = { 60 | self.model.get_inputs()[0].name: self._to_numpy(q_id["input_ids"]), 61 | self.model.get_inputs()[1].name: self._to_numpy(q_id["attention_mask"]), 62 | self.model.get_inputs()[2].name: self._to_numpy(q_id["token_type_ids"]), 63 | } 64 | return torch.tensor(self.model.run(None, input_feed=input_feed)[0]) 65 | 66 | def predict_sim(self, q1, q2): 67 | q1_v = self.predict_vec(q1) 68 | q2_v = self.predict_vec(q2) 69 | sim = F.cosine_similarity(q1_v[0], q2_v[0], dim=-1) 70 | return sim.numpy().tolist() 71 | 72 | if __name__ == "__main__": 73 | import time,random 74 | from tqdm import tqdm 75 | device = torch.device('cuda' if torch.cuda.is_available() else "cpu") 76 | # device = "" 77 | # vec_model = VectorizeModel('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext', device=device) 78 | vec_model = VectorizeModel_v2('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext', 79 | 
"./data/model_simcse_roberta_output_20240211.onnx",providers=['CUDAExecutionProvider']) 80 | # vec_model = VectorizeModel_v2('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext', 81 | # "./data/model_simcse_roberta_output_20240211.onnx",providers=['TensorrtExecutionProvider']) 82 | # 单测 83 | # q = ["你好啊"] 84 | # print(vec_model.predict_vec(q)) 85 | # print(vec_model.predict_sim("你好呀","你好啊")) 86 | tmp_queries = ["你好啊", "今天天气怎么样", "我要暴富"] 87 | # 开始批跑 88 | batch_sizes = [1,2,4,8,16] 89 | for b in batch_sizes: 90 | for i in tqdm(range(100),desc="warmup"): 91 | tmp_q = [] 92 | for i in range(b): 93 | tmp_q.append(random.choice(tmp_queries)) 94 | vec_model.predict_vec(tmp_q) 95 | for i in tqdm(range(1000),desc="batch_size={}".format(b)): 96 | tmp_q = [] 97 | for i in range(b): 98 | tmp_q.append(random.choice(tmp_queries)) 99 | vec_model.predict_vec(tmp_q) 100 | -------------------------------------------------------------------------------- /vec_searcher/vec_searcher/vec_index.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Filename: vec_index.py 3 | # Author: ZENGGUANRONG 4 | # Date: 2023-12-12 5 | # description: 向量召回索引-FAISS 6 | 7 | import faiss 8 | from loguru import logger 9 | 10 | class VecIndex: 11 | def __init__(self) -> None: 12 | self.index = "" 13 | 14 | def build(self, index_dim): 15 | description = "HNSW64" 16 | measure = faiss.METRIC_L2 17 | self.index = faiss.index_factory(index_dim, description, measure) 18 | 19 | def insert(self, vec): 20 | self.index.add(vec) 21 | 22 | def batch_insert(self, vecs): 23 | self.index.add(vecs) 24 | 25 | def load(self, read_path): 26 | # read_path: XXX.index 27 | self.index = faiss.read_index(read_path) 28 | 29 | def save(self, save_path): 30 | # save_path: XXX.index 31 | faiss.write_index(self.index, save_path) 32 | 33 | def search(self, vec, num): 34 | # id, distance 35 | return self.index.search(vec, num) 
# --------------------------------------------------------------------------------
# /vec_searcher/vec_searcher/vec_searcher.py:
# --------------------------------------------------------------------------------
import json
import os


def _new_vec_index():
    """Create an empty ``VecIndex``.

    The import is done lazily so that this module can be imported without
    the index backend being loaded until an index is actually needed.
    """
    from vec_searcher.vec_index import VecIndex
    return VecIndex()


class VecSearcher:
    """Vector searcher made of an inverted index and a forward index.

    The inverted index (``VecIndex``) maps a query vector to the positions
    of its nearest stored vectors; the forward index is a plain list that
    maps such a position back to the stored document.
    """

    def __init__(self):
        self.invert_index = _new_vec_index()  # inverted index used for recall
        self.forward_index = []  # forward index: list position -> document
        self.INDEX_FOLDER_PATH_TEMPLATE = "data/index/{}"

    def build(self, index_dim, index_name):
        """Start a new, empty index called ``index_name``.

        Args:
            index_dim: dimension of the vectors that will be inserted.
            index_name: name of the folder (below ``data/index``) used by
                ``save``.
        """
        self.index_name = index_name
        self.index_folder_path = self.INDEX_FOLDER_PATH_TEMPLATE.format(index_name)
        # makedirs also creates the missing "data/index" parents, which
        # os.mkdir cannot do.
        os.makedirs(self.index_folder_path, exist_ok=True)

        self.invert_index = _new_vec_index()
        self.invert_index.build(index_dim)

        self.forward_index = []

    def insert(self, vec, doc):
        """Add ``vec`` to the inverted index and ``doc`` to the forward index."""
        self.invert_index.insert(vec)
        # self.invert_index.batch_insert(vecs)

        self.forward_index.append(doc)

    def save(self):
        """Write the forward index (one JSON document per line) and the
        inverted index into the index folder."""
        with open(self.index_folder_path + "/forward_index.txt", "w", encoding="utf8") as f:
            for data in self.forward_index:
                f.write("{}\n".format(json.dumps(data, ensure_ascii=False)))

        self.invert_index.save(self.index_folder_path + "/invert_index.faiss")

    def load(self, index_name):
        """Load the index previously saved under ``index_name``."""
        self.index_name = index_name
        self.index_folder_path = self.INDEX_FOLDER_PATH_TEMPLATE.format(index_name)

        self.invert_index = _new_vec_index()
        self.invert_index.load(self.index_folder_path + "/invert_index.faiss")

        self.forward_index = []
        with open(self.index_folder_path + "/forward_index.txt", encoding="utf8") as f:
            for line in f:
                self.forward_index.append(json.loads(line.strip()))

    def search(self, vecs, nums = 5):
        """Return the nearest stored documents for the first query in ``vecs``.

        Args:
            vecs: query vector(s) handed to the inverted index; only the
                result row of the first query is used.
            nums: maximum number of hits.

        Returns:
            list: ``[doc_position, doc, distance]`` per hit, in the order
            returned by the inverted index. Fewer than ``nums`` items are
            returned when the index has fewer matches.
        """
        distances, doc_ids = self.invert_index.search(vecs, nums)
        recall_list = []
        for doc_id, distance in zip(doc_ids[0], distances[0]):
            # A negative id is the padding used when fewer than ``nums``
            # neighbours exist; indexing the list with it would silently
            # return a document from the end of the list.
            if doc_id < 0:
                continue
            # recall_list_idx, recall_list_detail, distance
            recall_list.append([doc_id, self.forward_index[doc_id], distance])
        # recall_list = list(filter(lambda x: x[2] < 100, result))

        return recall_list