├── .gitignore
├── README.md
├── llm_classification
    ├── config
    │   └── toutiao_config.py
    ├── data
    │   └── toutiao_cat_data
    │   │   └── class_def.tsv
    ├── script
    │   ├── build_vec_index.py
    │   └── run_toutiao_cases.py
    └── src
    │   ├── classifier.py
    │   ├── models
    │       ├── llm
    │       │   ├── llm_model.py
    │       │   └── test_qwen.py
    │       └── vec_model
    │       │   ├── simcse_model.py
    │       │   └── vec_model.py
    │   ├── searcher
    │       ├── searcher.py
    │       └── vec_searcher
    │       │   ├── vec_index.py
    │       │   └── vec_searcher.py
    │   └── utils
    │       └── data_processing.py
├── mt5_summary
    ├── arg_config.py
    ├── data.py
    ├── mt5_summary_main.py
    ├── run.sh
    └── tools.py
└── vec_searcher
    ├── script
        └── build_vec_index.py
    ├── searcher.py
    ├── utils
        └── data_processing.py
    ├── vec_model
        ├── simcse_model.py
        └── vec_model.py
    └── vec_searcher
        ├── vec_index.py
        └── vec_searcher.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # ---> Python
 2 | # Byte-compiled / optimized / DLL files
 3 | __pycache__/
 4 | **/__pycache__/
 5 | 
 6 | # Distribution / packaging
 7 | .Python
 8 | build/
 9 | develop-eggs/
10 | dist/
11 | downloads/
12 | eggs/
13 | .eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | wheels/
20 | pip-wheel-metadata/
21 | share/python-wheels/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | MANIFEST
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .nox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | *.py,cover
48 | .hypothesis/
49 | .pytest_cache/
50 | 
51 | # Sphinx documentation
52 | docs/_build/
53 | 
54 | # PyBuilder
55 | target/
56 | 
57 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
58 | __pypackages__/
59 | 
60 | 
61 | # ---> VisualStudioCode
62 | .vscode/*
63 | .vscode/settings.json
64 | *.code-workspace
65 | 
66 | # local config
67 | local_config.py
68 | 
69 | # log
70 | *.log.*
71 | 
72 | # .env
73 | !default.env
74 | 
75 | # .idea 配置文件
76 | .idea/
77 | **/data/*


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # poc_project
2 | 
3 | 通用简单工具项目，用于存放简单通用工具。
4 | 
5 | - llm_classification：基于大模型的通用文本分类方案
6 | - vec_searcher：Faiss向量召回工具


--------------------------------------------------------------------------------
/llm_classification/config/toutiao_config.py:
--------------------------------------------------------------------------------
 1 | VEC_MODEL_PATH = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext"
 2 | # VEC_INDEX_DATA = "vec_index_toutiao_20240629"
 3 | VEC_INDEX_DATA = "vec_index_toutiao_20240702_FEW"
 4 | 
 5 | LLM_PATH = "C:/work/tool/qwen2-1.5b-instruct"
 6 | LLM_CONFIG = {"max_length": 2048,
 7 |               "do_sample": False,
 8 |               "top_k": 1,
 9 |               "temperature": 0.8}
10 | 
11 | CLASS_DEF_PATH = "data/toutiao_cat_data/class_def.tsv"
12 | 
13 | PROMPT_TEMPLATE = """你是一个优秀的句子分类师，能把给定的用户query划分到正确的类目中。现在请你根据给定信息和要求，为给定用户query，从备选类目中选择最合适的类目。
14 | 
15 | 下面是“参考案例”即被标注的正确结果，可供参考：
16 | <examples>
17 | 
18 | 备选类目：
19 | <options>
20 | 
21 | 类目概念：
22 | <options_detail>
23 | 
24 | 用户query：
25 | <query>
26 | 
27 | 请注意：
28 | 1. 用户query所选类目，仅能在【备选类目】中进行选择，用户query仅属于一个类目。
29 | 2. “参考案例”中的内容可供推理分析，可以仿照案例来分析用户query的所选类目。
30 | 3. 请仔细比对【备选类目】的概念和用户query的差异。
31 | 4. 如果用户quer也不属于【备选类目】中给定的类目，或者比较模糊，请选择“拒识”。
32 | 5. 请在“所选类目：”后回复结果，不需要说明理由。
33 | 
34 | 所选类目："""


--------------------------------------------------------------------------------
/llm_classification/data/toutiao_cat_data/class_def.tsv:
--------------------------------------------------------------------------------
 1 | 民生-故事	老百姓生活问题的新闻
 2 | 文化-文化	文学、艺术、教育、历史、哲学、宗教等多个方面的文化新闻
 3 | 娱乐-娱乐	明星 、电影、最新影讯/影评、电影院在线购票订座、电视剧、音乐、戏剧、演出等娱乐信息
 4 | 体育-体育	国内国际最热门体育赛事比分和赛果
 5 | 财经-财经	股票、债券、基金、期货、信托、理财、管理等服务新闻
 6 | 房产-房产	涵盖土地政策、房产金融、房产营销等信息
 7 | 汽车-汽车	海外、国内所有汽车品牌、高清车模、购车指南、车展报道
 8 | 教育-教育	高考、考研、自考、成人高考、教师招聘、就业、留学等权威的招考、招生、就业、招聘新闻
 9 | 科技-科技	通信、互联网、IT产业、IT产品和科普探索等前沿科技领域知识
10 | 军事-军事	权威军事资讯、追踪军事热点、反映军事动态、介绍国内外最新武器发展动态
11 | 旅游-旅游	文化和旅游部相关政策法规、跟踪报道全国各地旅游重点事件、大型旅游会议活动专题
12 | 国际-国际	世界新闻、国际博览、新闻人物、评论分析、媒体聚焦
13 | 证券-股票	证券、股票方面的新闻
14 | 农业-三农	种植业、林业、渔业、牧业等农业生产活动的新闻
15 | 电竞-游戏	各种电竞赛事的报道、游戏更新、行业趋势、选手动态、以及与电子竞技相关的文化和活动
16 | 


--------------------------------------------------------------------------------
/llm_classification/script/build_vec_index.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Filename:    build_vec_index.py
  3 | # Author:      ZENGGUANRONG
  4 | # Date:        2023-12-12
  5 | # description: 构造向量索引脚本
  6 | 
  7 | import json,torch,copy,random
  8 | from tqdm import tqdm
  9 | from loguru import logger
 10 | from sklearn.model_selection import train_test_split
 11 | 
 12 | from src.utils.data_processing import load_toutiao_data
 13 | from src.models.vec_model.vec_model import VectorizeModel
 14 | from src.searcher.vec_searcher.vec_searcher import VecSearcher 
 15 | 
 16 | if __name__ == "__main__":
 17 |     # 0. 必要配置
 18 |     MODE = "DEBUG"
 19 |     # MODE = "PRO"
 20 |     MODE = "FEW"
 21 | 
 22 |     VERSION = "20240702"
 23 |     VEC_MODEL_PATH = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext"
 24 |     SOURCE_INDEX_DATA_PATH = "./data/toutiao_cat_data/toutiao_cat_data.txt" # 数据来源：https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset
 25 |     VEC_INDEX_DATA = "vec_index_toutiao_{}_{}".format(VERSION,MODE)
 26 |     TESE_DATA_PATH = "./data/toutiao_cat_data/test_set_{}_{}.txt".format(VERSION,MODE)
 27 |     RANDOM_SEED = 100
 28 | 
 29 |     DEVICE = torch.device('cuda' if torch.cuda.is_available() else "cpu")
 30 |     TEST_SIZE = 0.1
 31 |     # 类目体系
 32 |     CLASS_INFO = [
 33 |         ["100", '民生-故事', 'news_story'],
 34 |         ["101", '文化-文化', 'news_culture'],
 35 |         ["102", '娱乐-娱乐', 'news_entertainment'],
 36 |         ["103", '体育-体育', 'news_sports'],
 37 |         ["104", '财经-财经', 'news_finance'],
 38 |         # ["105", '时政 新时代', 'nineteenth'],
 39 |         ["106", '房产-房产', 'news_house'],
 40 |         ["107", '汽车-汽车', 'news_car'],
 41 |         ["108", '教育-教育', 'news_edu' ],
 42 |         ["109", '科技-科技', 'news_tech'],
 43 |         ["110", '军事-军事', 'news_military'],
 44 |         # ["111" 宗教 无，凤凰佛教等来源],
 45 |         ["112", '旅游-旅游', 'news_travel'],
 46 |         ["113", '国际-国际', 'news_world'],
 47 |         ["114", '证券-股票', 'stock'],
 48 |         ["115", '农业-三农', 'news_agriculture'],
 49 |         ["116", '电竞-游戏', 'news_game']
 50 |     ]
 51 |     ID2CN_MAPPING = {}
 52 |     for idx in range(len(CLASS_INFO)):
 53 |         ID2CN_MAPPING[CLASS_INFO[idx][0]] = CLASS_INFO[idx][1]
 54 | 
 55 |     # 1. 加载数据、模型
 56 |     # 1.1 加载模型
 57 |     vec_model = VectorizeModel(VEC_MODEL_PATH, DEVICE)
 58 |     index_dim = len(vec_model.predict_vec("你好啊")[0])
 59 |     # 1.2 加载数据
 60 |     toutiao_index_data = load_toutiao_data(SOURCE_INDEX_DATA_PATH)
 61 |     source_index_data = copy.deepcopy(toutiao_index_data)
 62 |     logger.info("load data done: {}".format(len(source_index_data)))
 63 |     if MODE == "DEBUG":
 64 |         random.shuffle(source_index_data)
 65 |         source_index_data = source_index_data[:10000]
 66 |     elif MODE == "FEW":
 67 |         new_source_data = []
 68 |         class_dict_cal = {}
 69 |         test_list = []
 70 |         tmp_idx = 0
 71 | 
 72 |         for key in ID2CN_MAPPING:
 73 |             class_dict_cal[key] = 0
 74 |         for idx in range(len(source_index_data)):
 75 |             if class_dict_cal[source_index_data[idx][1][1]] < 10:
 76 |                 class_dict_cal[source_index_data[idx][1][1]] += 1
 77 |                 new_source_data.append(source_index_data[idx])
 78 |             if sum([class_dict_cal[i] for i in class_dict_cal]) >= len(class_dict_cal) * 10:
 79 |                 break
 80 |         source_index_data = new_source_data
 81 | 
 82 |     for item in source_index_data:
 83 |         item[1].append(ID2CN_MAPPING[item[1][1]])
 84 |     # 1.3 训练集测试集划分
 85 |     if MODE != "FEW":
 86 |         train_list, test_list = train_test_split(source_index_data, test_size=TEST_SIZE, random_state=66)
 87 |     else:
 88 |         train_list = source_index_data
 89 |         test_list = toutiao_index_data[idx:idx + 1000]
 90 |         for item in test_list:
 91 |             item[1].append(ID2CN_MAPPING[item[1][1]])
 92 | 
 93 |     # 2. 创建索引并灌入数据
 94 |     # 2.1 构造索引
 95 |     vec_searcher = VecSearcher()
 96 |     vec_searcher.build(index_dim, VEC_INDEX_DATA)
 97 | 
 98 |     # 2.2 推理向量
 99 |     vectorize_result = []
100 |     for q in tqdm(train_list, desc="VEC MODEL RUNNING"):
101 |         vec = vec_model.predict_vec(q[0]).cpu().numpy()
102 |         tmp_result = copy.deepcopy(q)
103 |         tmp_result.append(vec)
104 |         vectorize_result.append(copy.deepcopy(tmp_result))
105 | 
106 |     # 2.3 开始存入
107 |     for idx in tqdm(range(len(vectorize_result)), desc="INSERT INTO INDEX"):
108 |         vec_searcher.insert(vectorize_result[idx][2], vectorize_result[idx][:2])
109 | 
110 |     # 3. 保存
111 |     # 3.1 索引保存
112 |     vec_searcher.save()
113 |     # 3.2 测试集保存
114 |     with open(TESE_DATA_PATH, "w", encoding="utf8") as f:
115 |         for item in test_list:
116 |             f.write("_!_".join(item[1]) + "\n")
117 | 


--------------------------------------------------------------------------------
/llm_classification/script/run_toutiao_cases.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from tqdm import tqdm
 3 | from sklearn.metrics import classification_report, confusion_matrix
 4 | from loguru import logger
 5 | 
 6 | from src.classifier import VecLlmClassifier
 7 | from src.utils.data_processing import load_toutiao_data
 8 | 
 9 | VERSION = "20240702_FEW"
10 | TEST_DATA_PATH = "data/toutiao_cat_data/test_set_{}.txt".format(VERSION)
11 | OUTPUT_DATA_PATH = "data/toutiao_cat_data/test_set_{}_result.txt".format(VERSION)
12 | test_data = load_toutiao_data(TEST_DATA_PATH)
13 | 
14 | vlc = VecLlmClassifier()
15 | test_list = []
16 | pred_list = []
17 | labels = set()
18 | for i in tqdm(range(len(test_data)), desc="RUNNING TEST"):
19 |     test_list.append(test_data[i][1][5])
20 |     labels.add(test_data[i][1][5])
21 |     pred_list.append(vlc.predict(test_data[i][0]))
22 | labels = list(labels)
23 | 
24 | logger.info("\n{}".format(classification_report(test_list, pred_list, labels = labels)))
25 | logger.info("\n{}".format(confusion_matrix(test_list, pred_list, labels=labels)))
26 | 
27 | with open(OUTPUT_DATA_PATH, "w", encoding="utf8") as fout:
28 |     for idx in range(len(test_data)):
29 |         fout.write("{}\t{}\t{}\t{}\n".format(test_data[idx][0], 
30 |                                             test_list[idx],
31 |                                             pred_list[idx],
32 |                                             test_list[idx]==pred_list[idx]))


--------------------------------------------------------------------------------
/llm_classification/src/classifier.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    classifier.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2024-06-25
 5 | # description: 分类器主函数
 6 | 
 7 | import copy
 8 | import torch
 9 | from loguru import logger
10 | 
11 | from config.toutiao_config import (VEC_INDEX_DATA, VEC_MODEL_PATH,
12 |                                     LLM_CONFIG, LLM_PATH, PROMPT_TEMPLATE,CLASS_DEF_PATH)
13 | from src.searcher.searcher import Searcher
14 | from src.models.llm.llm_model import QWen2Model
15 | from src.utils.data_processing import load_class_def
16 | 
17 | class VecLlmClassifier:
18 |     def __init__(self) -> None:
19 |         self.searcher = Searcher(VEC_MODEL_PATH, VEC_INDEX_DATA)
20 |         self.device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
21 |         self.llm = QWen2Model(LLM_PATH, LLM_CONFIG, self.device)
22 |         self.PROMPT_TEMPLATE = PROMPT_TEMPLATE
23 |         self.class_def = load_class_def(CLASS_DEF_PATH)
24 | 
25 |     def predict(self, query):
26 |         # 1. query预处理
27 |         logger.info("request: {}".format(query))
28 |         # 2. query向量召回
29 |         recall_result = self.searcher.search(query, nums=5)
30 |         # logger.debug(recall_result)
31 | 
32 |         # 3. 请求大模型
33 |         # 3.1 PROMPT拼接
34 |         request_prompt= copy.deepcopy(self.PROMPT_TEMPLATE)
35 |         # 3.1.1 子模块拼接
36 |         examples = []
37 |         options = []
38 |         options_detail = []
39 |         for item in recall_result:
40 |             tmp_examples = "——".join([item[1][0], item[1][1][5]])
41 |             if tmp_examples not in examples:
42 |                 examples.append(tmp_examples)
43 |             opt_detail_str = "：".join(["【" + item[1][1][5] + "】",self.class_def[item[1][1][5]]])
44 |             opt = item[1][1][5]
45 |             if opt not in options:
46 |                 options.append(opt)
47 |                 options_detail.append(opt_detail_str)
48 |         # options.append("拒识：含义不明或用户query所属类目不在列举内时，分为此类")
49 |         examples_str = "\n".join(examples)
50 |         options_str = "，".join(options)
51 |         options_detail_str = "\n".join(options_detail)
52 | 
53 |         # 3.1.2 整体组装
54 |         request_prompt = request_prompt.replace("<examples>", examples_str)
55 |         request_prompt = request_prompt.replace("<options>", options_str)
56 |         request_prompt = request_prompt.replace("<options_detail>", options_detail_str)
57 |         request_prompt = request_prompt.replace("<query>", query)
58 |         # logger.info(request_prompt)
59 | 
60 |         # 3.2 请求大模型
61 |         llm_response = self.llm.predict(request_prompt)
62 |         # logger.info("llm response: {}".format(llm_response))
63 | 
64 |         # 3.3 大模型结果解析
65 |         result = "拒识"
66 |         for option in options:
67 |             if option in llm_response:
68 |                 result = option
69 |                 break
70 |         # logger.info("parse result: {}".format(result))
71 | 
72 |         # 4. 返回结果
73 |         logger.info("response: {}".format(result))
74 |         return result
75 | 
76 | if __name__ == "__main__":  # CLI: classify the text given as command-line arguments
77 |     import sys
78 |     vlc = VecLlmClassifier()
79 |     if len(sys.argv) > 1:
80 |         logger.info(vlc.predict("".join(sys.argv[1:])))  # arguments are joined without a separator
81 | 
82 |     # # Performance test
83 |     # from tqdm import tqdm
84 |     # for i in tqdm(range(20), desc="warm up"):
85 |     #     vlc.predict("感冒发烧怎么治疗")
86 |     # for i in tqdm(range(20), desc="running speed"):
87 |     #     vlc.predict("王阳明到底顿悟了什么？")


--------------------------------------------------------------------------------
/llm_classification/src/models/llm/llm_model.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    llm_model.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2023-12-17
 5 | # description: 大模型调用模块，这里默认用的Qwen2
 6 | 
 7 | # from transformers import AutoModel, AutoTokenizer
 8 | from transformers import AutoModelForCausalLM, AutoTokenizer
 9 | from typing import Tuple, List
10 | from loguru import logger
11 | 
12 | class QWen2Model:
13 |     def __init__(self, model_path, config = {}, device="cuda"):
14 |         self.model = AutoModelForCausalLM.from_pretrained(
15 |             model_path,
16 |             torch_dtype="auto",
17 |             device_map="auto"
18 |         )
19 |         self.tokenizer = AutoTokenizer.from_pretrained(model_path)
20 |         self.model = self.model.eval()
21 |         self.device = device
22 | 
23 |         self.generate_config = self._read_config_(config)
24 |         logger.info("load LLM Model done")
25 |     
26 |     def _read_config_(self, config):
27 |         tmp_config = {}
28 |         # tmp_config["max_length"] = config.get("max_length", 2048)
29 |         tmp_config["num_beams"] = config.get("num_beams", 1)
30 |         tmp_config["do_sample"] = config.get("do_sample", False)
31 |         tmp_config["top_k"] = config.get("top_k", 1)
32 |         tmp_config["temperature"] = config.get("temperature", 0.8)
33 |         return tmp_config
34 | 
35 |     def predict(self, query):
36 |         messages = [
37 |             {"role": "system", "content": "You are a helpful assistant."},
38 |             {"role": "user", "content": query}
39 |         ]
40 |         text = self.tokenizer.apply_chat_template(
41 |             messages,
42 |             tokenize=False,
43 |             add_generation_prompt=True
44 |         )
45 |         model_inputs = self.tokenizer([text], return_tensors="pt").to(self.device)
46 | 
47 |         # Directly use generate() and tokenizer.decode() to get the output.
48 |         # Use `max_new_tokens` to control the maximum output length.
49 |         generated_ids = self.model.generate(
50 |             model_inputs.input_ids,
51 |             attention_mask=model_inputs.attention_mask,
52 |             pad_token_id=self.tokenizer.eos_token_id,
53 |             max_new_tokens=512,
54 |             **self.generate_config
55 |         )
56 |         generated_ids = [
57 |             output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
58 |         ]
59 | 
60 |         response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
61 |         return response
62 | 
63 | if __name__ == "__main__":  # manual check: load the configured LLM and print one answer
64 |     from config.toutiao_config import LLM_CONFIG,LLM_PATH
65 |     print(LLM_CONFIG)
66 |     llm_model = QWen2Model(LLM_PATH, config = LLM_CONFIG, device="cuda")
67 |     print(llm_model.predict("如何做番茄炒蛋"))
68 | 


--------------------------------------------------------------------------------
/llm_classification/src/models/llm/test_qwen.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM
 3 | from typing import Tuple, List
 4 | from tqdm import tqdm
 5 | import random
 6 | 
 7 | model_path = "C:\\work\\tool\\huggingface\\models\\Qwen-1_8b"  # local checkpoint directory
 8 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 9 | # model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().quantize(8).cuda()
10 | 
11 | model = AutoModelForCausalLM.from_pretrained(
12 |     model_path,
13 |     device_map="auto",
14 |     trust_remote_code=True
15 | ).eval().cuda()  # NOTE(review): device_map="auto" plus .cuda() looks redundant - confirm
16 | response, history = model.chat(tokenizer, "你好", history=None)  # one chat call without prior history
17 | print(response)
18 | print(history)
19 | random_query = ["你好","你是谁","你是谁创造出来的"]
20 | for i in tqdm(range(100)):  # 100 chat calls with a random query, under a tqdm progress bar
21 |     model.chat(tokenizer, random.choice(random_query), history=None)


--------------------------------------------------------------------------------
/llm_classification/src/models/vec_model/simcse_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from loguru import logger
 4 | from tqdm import tqdm
 5 | from transformers import BertConfig, BertModel, BertTokenizer
 6 | 
 7 | class SimcseModel(nn.Module):
 8 |     # https://blog.csdn.net/qq_44193969/article/details/126981581
 9 |     def __init__(self, pretrained_bert_path, pooling="cls") -> None:
10 |         super(SimcseModel, self).__init__()
11 | 
12 |         self.pretrained_bert_path = pretrained_bert_path
13 |         self.config = BertConfig.from_pretrained(self.pretrained_bert_path)
14 |         
15 |         self.model = BertModel.from_pretrained(self.pretrained_bert_path, config=self.config)
16 |         self.model.eval()
17 |         
18 |         # self.model = None
19 |         self.pooling = pooling
20 |     
21 |     def forward(self, input_ids, attention_mask, token_type_ids):
22 |         out = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
23 | 
24 |         return out.last_hidden_state[:, 0]


--------------------------------------------------------------------------------
/llm_classification/src/models/vec_model/vec_model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from loguru import logger
  5 | 
  6 | from transformers import BertTokenizer
  7 | 
  8 | from src.models.vec_model.simcse_model import SimcseModel
  9 | 
 10 | import onnxruntime as ort
 11 | 
 12 | class VectorizeModel:
 13 |     def __init__(self, ptm_model_path, device = "cpu") -> None:
 14 |         self.tokenizer = BertTokenizer.from_pretrained(ptm_model_path)
 15 |         self.model = SimcseModel(pretrained_bert_path=ptm_model_path, pooling="cls")
 16 |         # print(self.model)
 17 |         self.model.eval()
 18 |         
 19 |         self.DEVICE = torch.device('cuda' if torch.cuda.is_available() else "cpu")
 20 |         # self.DEVICE = device
 21 |         logger.info(self.DEVICE)
 22 |         self.model.to(self.DEVICE)
 23 |         
 24 |         self.pdist = nn.PairwiseDistance(2)
 25 |     
 26 |     def predict_vec(self,query):
 27 |         q_id = self.tokenizer(query, max_length = 200, truncation=True, padding="max_length", return_tensors='pt')
 28 |         with torch.no_grad():
 29 |             q_id_input_ids = q_id["input_ids"].squeeze(1).to(self.DEVICE)
 30 |             q_id_attention_mask = q_id["attention_mask"].squeeze(1).to(self.DEVICE)
 31 |             q_id_token_type_ids = q_id["token_type_ids"].squeeze(1).to(self.DEVICE)
 32 |             q_id_pred = self.model(q_id_input_ids, q_id_attention_mask, q_id_token_type_ids)
 33 | 
 34 |         return q_id_pred
 35 | 
 36 |     def predict_vec_request(self, query):
 37 |         q_id_pred = self.predict_vec(query)
 38 |         return q_id_pred.cpu().numpy().tolist()
 39 |     
 40 |     def predict_sim(self, q1, q2):
 41 |         q1_v = self.predict_vec(q1)
 42 |         q2_v = self.predict_vec(q2)
 43 |         sim = F.cosine_similarity(q1_v[0], q2_v[0], dim=-1)
 44 |         return sim.cpu().numpy().tolist()
 45 | 
 46 | class VectorizeModel_v2(VectorizeModel):
 47 |     def __init__(self, ptm_model_path, onnx_path, providers=['CUDAExecutionProvider']) -> None:
 48 |         # ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
 49 |         self.tokenizer = BertTokenizer.from_pretrained(ptm_model_path)
 50 |         self.model = ort.InferenceSession(onnx_path, providers=providers)
 51 |         
 52 |         self.pdist = nn.PairwiseDistance(2)
 53 |     
 54 |     def _to_numpy(self, tensor):
 55 |         return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
 56 |     
 57 |     def predict_vec(self,query):
 58 |         q_id = self.tokenizer(query, max_length = 200, truncation=True, padding="max_length", return_tensors='pt')
 59 |         input_feed = {
 60 |             self.model.get_inputs()[0].name: self._to_numpy(q_id["input_ids"]),
 61 |             self.model.get_inputs()[1].name: self._to_numpy(q_id["attention_mask"]),
 62 |             self.model.get_inputs()[2].name: self._to_numpy(q_id["token_type_ids"]),
 63 |         }
 64 |         return torch.tensor(self.model.run(None, input_feed=input_feed)[0])
 65 |     
 66 |     def predict_sim(self, q1, q2):
 67 |         q1_v = self.predict_vec(q1)
 68 |         q2_v = self.predict_vec(q2)
 69 |         sim = F.cosine_similarity(q1_v[0], q2_v[0], dim=-1)
 70 |         return sim.numpy().tolist()
 71 | 
 72 | if __name__ == "__main__":  # throughput check of predict_vec at several batch sizes
 73 |     import time,random
 74 |     from tqdm import tqdm
 75 |     device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
 76 |     # device = ""
 77 |     # vec_model = VectorizeModel('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext', device=device)
 78 |     vec_model = VectorizeModel_v2('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext',
 79 |                                  "./data/model_simcse_roberta_output_20240211.onnx",providers=['CUDAExecutionProvider'])
 80 |     # vec_model = VectorizeModel_v2('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext',
 81 |     #                              "./data/model_simcse_roberta_output_20240211.onnx",providers=['TensorrtExecutionProvider'])
 82 |     # Unit test
 83 |     # q = ["你好啊"]
 84 |     # print(vec_model.predict_vec(q))
 85 |     # print(vec_model.predict_sim("你好呀","你好啊"))
 86 |     tmp_queries = ["你好啊", "今天天气怎么样", "我要暴富"]
 87 |     # Start the batch runs: 100 warm-up calls, then 1000 calls per batch size
 88 |     batch_sizes = [1,2,4,8,16]
 89 |     for b in batch_sizes:
 90 |         for i in tqdm(range(100),desc="warmup"):
 91 |             tmp_q = []
 92 |             for i in range(b):
 93 |                 tmp_q.append(random.choice(tmp_queries))
 94 |             vec_model.predict_vec(tmp_q)
 95 |         for i in tqdm(range(1000),desc="batch_size={}".format(b)):
 96 |             tmp_q = []
 97 |             for i in range(b):
 98 |                 tmp_q.append(random.choice(tmp_queries))
 99 |             vec_model.predict_vec(tmp_q)
100 | 


--------------------------------------------------------------------------------
/llm_classification/src/searcher/searcher.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    searcher.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2023-12-12
 5 | # description: 核心检索器
 6 | 
 7 | import json,requests,copy
 8 | import numpy as np
 9 | from loguru import logger
10 | from src.searcher.vec_searcher.vec_searcher import VecSearcher
11 | from src.models.vec_model.vec_model import VectorizeModel
12 | 
13 | class Searcher:
14 |     def __init__(self, model_path, vec_search_path):
15 |         self.vec_model = VectorizeModel(model_path)
16 |         logger.info("load vec_model done")
17 | 
18 |         self.vec_searcher = VecSearcher()
19 |         self.vec_searcher.load(vec_search_path)
20 |         logger.info("load vec_searcher done")
21 | 
22 |     def rank(self, query, recall_result):
23 |         rank_result = []
24 |         for idx in range(len(recall_result)):
25 |             new_sim = self.vec_model.predict_sim(query, recall_result[idx][1][0])
26 |             rank_item = copy.deepcopy(recall_result[idx])
27 |             rank_item.append(new_sim)
28 |             rank_result.append(copy.deepcopy(rank_item))
29 |         rank_result.sort(key=lambda x: x[3], reverse=True)
30 |         return rank_result
31 |     
32 |     def search(self, query, nums=3):
33 |         # logger.info("request: {}".format(query))
34 | 
35 |         q_vec = self.vec_model.predict_vec(query).cpu().numpy()
36 | 
37 |         recall_result = self.vec_searcher.search(q_vec, nums)
38 | 
39 |         rank_result = self.rank(query, recall_result)
40 |         # rank_result = list(filter(lambda x:x[4] > 0.8, rank_result))
41 | 
42 |         # logger.info("response: {}".format(rank_result))
43 |         return rank_result
44 | 
45 | if __name__ == "__main__":  # manual check: load a saved index and print the hits for one query
46 |     VEC_MODEL_PATH = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext"
47 |     VEC_INDEX_DATA = "vec_index_test2023121201"
48 |     searcher = Searcher(VEC_MODEL_PATH, VEC_INDEX_DATA)
49 |     q = "什么人不能吃花生"
50 |     print(searcher.search(q))


--------------------------------------------------------------------------------
/llm_classification/src/searcher/vec_searcher/vec_index.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    vec_index.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2023-12-12
 5 | # description: 向量召回索引-FAISS
 6 | 
 7 | import faiss
 8 | from loguru import logger
 9 | from src.models.vec_model.vec_model import VectorizeModel
10 | 
11 | class VecIndex:  # thin wrapper around a FAISS index
12 |     def __init__(self) -> None:
13 |         self.index = ""  # placeholder until build() or load() assigns a FAISS index
14 |     
15 |     def build(self, index_dim):
16 |         description = "HNSW64"  # description string passed to faiss.index_factory
17 |         measure = faiss.METRIC_L2  # L2 metric
18 |         self.index = faiss.index_factory(index_dim, description, measure)
19 |     
20 |     def insert(self, vec):
21 |         self.index.add(vec)
22 |     
23 |     def batch_insert(self, vecs):
24 |         self.index.add(vecs)  # same call as insert(); kept as a separate name for batches
25 |     
26 |     def load(self, read_path):
27 |         # read_path: path of a saved index file, e.g. XXX.index
28 |         self.index = faiss.read_index(read_path)
29 | 
30 |     def save(self, save_path):
31 |         # save_path: target file, e.g. XXX.index
32 |         faiss.write_index(self.index, save_path)
33 |     
34 |     def search(self, vec, num):
35 |         # returns (distances, ids): callers read ids from [1] and distances from [0]
36 |         return self.index.search(vec, num)


--------------------------------------------------------------------------------
/llm_classification/src/searcher/vec_searcher/vec_searcher.py:
--------------------------------------------------------------------------------
 1 | import os, json
 2 | from loguru import logger
 3 | from src.searcher.vec_searcher.vec_index import VecIndex
 4 | 
 5 | class VecSearcher:
 6 |     def __init__(self):
 7 |         self.invert_index = VecIndex() # 检索倒排，使用的是索引是VecIndex
 8 |         self.forward_index = [] # 检索正排，实质上只是个list，通过ID获取对应的内容
 9 |         self.INDEX_FOLDER_PATH_TEMPLATE = "data/index/{}"
10 | 
11 |     def build(self, index_dim, index_name):
12 |         self.index_name = index_name
13 |         self.index_folder_path = self.INDEX_FOLDER_PATH_TEMPLATE.format(index_name)
14 |         if not os.path.exists(self.index_folder_path) or not os.path.isdir(self.index_folder_path):
15 |             os.mkdir(self.index_folder_path)
16 | 
17 |         self.invert_index = VecIndex()
18 |         self.invert_index.build(index_dim)
19 | 
20 |         self.forward_index = []
21 |     
22 |     def insert(self, vec, doc):
23 |         self.invert_index.insert(vec)
24 |         # self.invert_index.batch_insert(vecs)
25 | 
26 |         self.forward_index.append(doc)
27 |     
28 |     def save(self):
29 |         with open(self.index_folder_path + "/forward_index.txt", "w", encoding="utf8") as f:
30 |             for data in self.forward_index:
31 |                 f.write("{}\n".format(json.dumps(data, ensure_ascii=False)))
32 | 
33 |         self.invert_index.save(self.index_folder_path + "/invert_index.faiss")
34 |     
35 |     def load(self, index_name):
36 |         self.index_name = index_name
37 |         self.index_folder_path = self.INDEX_FOLDER_PATH_TEMPLATE.format(index_name)
38 | 
39 |         self.invert_index = VecIndex()
40 |         self.invert_index.load(self.index_folder_path + "/invert_index.faiss")
41 | 
42 |         self.forward_index = []
43 |         with open(self.index_folder_path + "/forward_index.txt", encoding="utf8") as f:
44 |             for line in f:
45 |                 self.forward_index.append(json.loads(line.strip()))
46 |     
47 |     def search(self, vecs, nums = 5):
48 |         search_res = self.invert_index.search(vecs, nums)
49 |         recall_list = []
50 |         for idx in range(nums):
51 |             # recall_list_idx, recall_list_detail, distance
52 |             recall_list.append([search_res[1][0][idx], self.forward_index[search_res[1][0][idx]], search_res[0][0][idx]])
53 |         # recall_list = list(filter(lambda x: x[2] < 100, result))
54 | 
55 |         return recall_list


--------------------------------------------------------------------------------
/llm_classification/src/utils/data_processing.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    data_processing.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2024-06-25
 5 | # description: 数据处理函数
 6 | 
 7 | def load_toutiao_data(path):
 8 |     source_data = []
 9 |     with open(path, encoding="utf8") as f:
10 |         for line in f:
11 |             ll = line.strip().split("_!_") # 新闻ID，分类code，分类名称，新闻字符串（仅含标题），新闻关键词
12 |             source_data.append([ll[3], ll])
13 |     return source_data
14 | 
def load_class_def(path):
    """Read a tab-separated file into a dict.

    Every line is stripped and split on tabs; the first column is the key and
    the second column the value. A key that appears more than once keeps the
    value from its last line.
    """
    with open(path, encoding="utf8") as reader:
        columns = (raw.strip().split("\t") for raw in reader)
        return {cols[0]: cols[1] for cols in columns}


--------------------------------------------------------------------------------
/mt5_summary/arg_config.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    arg_config.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2024-10-06
 5 | # description: 执行参数
 6 | # reference:   https://github.com/jsksxs360/How-to-use-Transformers
 7 | 
 8 | import argparse
 9 | 
def parse_args(argv=None):
    """Parse the command-line options of the summarization script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, as before.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()

    # Required parameters (their ``default`` values are never used because
    # argparse insists on an explicit value for required options).
    parser.add_argument("--output_dir", default=None, type=str, required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )
    parser.add_argument("--train_file", default=None, type=str, required=True, help="The input training file.")
    parser.add_argument("--dev_file", default=None, type=str, required=True, help="The input evaluation file.")
    parser.add_argument("--test_file", default=None, type=str, required=True, help="The input testing file.")

    parser.add_argument("--model_type",
        default="bert", type=str, required=True
    )
    parser.add_argument("--model_checkpoint",
        default="bert-large-cased/", type=str, required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument("--max_input_length", default=256, type=int, required=True)
    parser.add_argument("--max_target_length", default=256, type=int, required=True)

    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_test", action="store_true", help="Whether to run eval on the test set.")
    parser.add_argument("--do_predict", action="store_true", help="Whether to save predicted labels.")

    # Other parameters
    parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.")
    parser.add_argument("--batch_size", default=4, type=int)
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    parser.add_argument("--beam_search_size", default=4, type=int)
    parser.add_argument("--no_repeat_ngram_size", default=2, type=int)

    parser.add_argument("--adam_beta1", default=0.9, type=float,
        help="Beta1 for Adam optimizer."
    )
    parser.add_argument("--adam_beta2", default=0.98, type=float,
        help="Beta2 for Adam optimizer."
    )
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
        help="Epsilon for Adam optimizer."
    )
    # argparse %-formats help strings, so a literal percent sign must be
    # written as "%%"; otherwise printing the help fails.
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for, e.g., 0.1 = 10%% of training."
    )
    parser.add_argument("--weight_decay", default=0.01, type=float,
        help="Weight decay if we apply some."
    )
    args = parser.parse_args(argv)
    return args


--------------------------------------------------------------------------------
/mt5_summary/data.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    data.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2024-10-06
 5 | # description: 数据处理函数
 6 | # reference:   https://github.com/jsksxs360/How-to-use-Transformers
 7 | from torch.utils.data import Dataset, DataLoader
 8 | import torch
 9 | 
MAX_DATASET_SIZE = 200000  # upper bound on the number of lines read per file


class LCSTS(Dataset):
    """Map-style dataset of title/content pairs read from a text file.

    Data reference: http://icrc.hitsz.edu.cn/Article/show/139.html
    """

    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse ``data_file`` into ``{line_index: {'title', 'content'}}``.

        Each line is stripped and split on ``!=!``; it must yield exactly two
        parts (title first, content second). At most ``MAX_DATASET_SIZE``
        lines are kept.
        """
        samples = {}
        with open(data_file, 'rt', encoding='utf-8') as reader:
            # zip() with the bounded range stops after MAX_DATASET_SIZE lines.
            for idx, line in zip(range(MAX_DATASET_SIZE), reader):
                parts = line.strip().split('!=!')
                assert len(parts) == 2
                title, content = parts
                samples[idx] = {'title': title, 'content': content}
        return samples

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
36 | 
def get_dataLoader(args, dataset, model, tokenizer, batch_size=None, shuffle=False):
    """Build a DataLoader that tokenizes content/title pairs batch by batch.

    Args:
        args: Namespace providing ``max_input_length``, ``max_target_length``
            and ``batch_size``.
        dataset: Samples exposing ``'content'`` (model input) and ``'title'``
            (target) keys.
        model: Used for ``prepare_decoder_input_ids_from_labels``.
        tokenizer: Tokenizer applied to inputs and, in target mode, to titles.
        batch_size: Overrides ``args.batch_size`` when truthy.
        shuffle: Forwarded to the DataLoader.

    Returns:
        A ``DataLoader`` whose batches carry the encoded inputs plus
        ``decoder_input_ids`` and ``labels``.
    """
    effective_batch_size = batch_size or args.batch_size

    def collate_batch(samples):
        contents = [item['content'] for item in samples]
        titles = [item['title'] for item in samples]

        encoded = tokenizer(
            contents,
            padding=True,
            max_length=args.max_input_length,
            truncation=True,
            return_tensors="pt"
        )
        with tokenizer.as_target_tokenizer():
            target_ids = tokenizer(
                titles,
                padding=True,
                max_length=args.max_target_length,
                truncation=True,
                return_tensors="pt"
            )["input_ids"]
            # Decoder inputs are derived before the labels are masked below.
            encoded['decoder_input_ids'] = model.prepare_decoder_input_ids_from_labels(target_ids)
            # Column of the EOS token in each row; everything after it is set
            # to -100 (presumably so the loss ignores padding; confirm).
            eos_columns = torch.where(target_ids == tokenizer.eos_token_id)[1]
            for row, eos_col in enumerate(eos_columns):
                target_ids[row][eos_col+1:] = -100
            encoded['labels'] = target_ids
        return encoded

    return DataLoader(
        dataset,
        batch_size=effective_batch_size,
        shuffle=shuffle,
        collate_fn=collate_batch,
    )


--------------------------------------------------------------------------------
/mt5_summary/mt5_summary_main.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import logging
  3 | import json
  4 | from tqdm.auto import tqdm
  5 | import numpy as np
  6 | import torch
  7 | from transformers import AdamW, get_scheduler
  8 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
  9 | from rouge import Rouge
 10 | import sys
 11 | 
 12 | from tools import seed_everything
 13 | from arg_config import parse_args
 14 | from data import LCSTS, get_dataLoader
 15 | 
 16 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
 17 |                     datefmt='%Y/%m/%d %H:%M:%S',
 18 |                     level=logging.INFO)
 19 | logger = logging.getLogger("Model")
 20 | 
 21 | def train_loop(args, dataloader, model, optimizer, lr_scheduler, epoch, total_loss):
 22 |     progress_bar = tqdm(range(len(dataloader)))
 23 |     progress_bar.set_description(f'loss: {0:>7f}')
 24 |     finish_batch_num = epoch * len(dataloader)
 25 |     
 26 |     model.train()
 27 |     for batch, batch_data in enumerate(dataloader, start=1):
 28 |         batch_data = batch_data.to(args.device)
 29 |         outputs = model(**batch_data)
 30 |         loss = outputs.loss
 31 | 
 32 |         optimizer.zero_grad()
 33 |         loss.backward()
 34 |         optimizer.step()
 35 |         lr_scheduler.step()
 36 | 
 37 |         total_loss += loss.item()
 38 |         progress_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')
 39 |         progress_bar.update(1)
 40 |     return total_loss
 41 | 
 42 | def test_loop(args, dataloader, model, tokenizer):
 43 |     preds, labels = [], []
 44 |     rouge = Rouge()
 45 | 
 46 |     model.eval()
 47 |     with torch.no_grad():
 48 |         for batch_data in tqdm(dataloader):
 49 |             batch_data = batch_data.to(args.device)
 50 |             generated_tokens = model.generate(
 51 |                 batch_data["input_ids"],
 52 |                 attention_mask=batch_data["attention_mask"],
 53 |                 max_length=args.max_target_length,
 54 |                 num_beams=args.beam_search_size,
 55 |                 no_repeat_ngram_size=args.no_repeat_ngram_size,
 56 |             ).cpu().numpy()
 57 |             if isinstance(generated_tokens, tuple):
 58 |                 generated_tokens = generated_tokens[0]
 59 |             label_tokens = batch_data["labels"].cpu().numpy()
 60 | 
 61 |             decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 62 |             label_tokens = np.where(label_tokens != -100, label_tokens, tokenizer.pad_token_id)
 63 |             decoded_labels = tokenizer.batch_decode(label_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
 64 | 
 65 |             preds += [' '.join(pred.strip()) for pred in decoded_preds]
 66 |             labels += [' '.join(label.strip()) for label in decoded_labels]
 67 |     scores = rouge.get_scores(hyps=preds, refs=labels, avg=True)
 68 |     result = {key: value['f'] * 100 for key, value in scores.items()}
 69 |     result['avg'] = np.mean(list(result.values()))
 70 |     return result
 71 | 
 72 | def train(args, train_dataset, dev_dataset, model, tokenizer):
 73 |     """ Train the model """
 74 |     train_dataloader = get_dataLoader(args, train_dataset, model, tokenizer, shuffle=True)
 75 |     dev_dataloader = get_dataLoader(args, dev_dataset, model, tokenizer, shuffle=False)
 76 |     t_total = len(train_dataloader) * args.num_train_epochs
 77 |     # Prepare optimizer and schedule (linear warmup and decay)
 78 |     no_decay = ["bias", "LayerNorm.weight"]
 79 |     optimizer_grouped_parameters = [
 80 |         {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
 81 |         {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
 82 |     ]
 83 |     args.warmup_steps = int(t_total * args.warmup_proportion)
 84 |     optimizer = AdamW(
 85 |         optimizer_grouped_parameters, 
 86 |         lr=args.learning_rate, 
 87 |         betas=(args.adam_beta1, args.adam_beta2), 
 88 |         eps=args.adam_epsilon
 89 |     )
 90 |     lr_scheduler = get_scheduler(
 91 |         'linear',
 92 |         optimizer, 
 93 |         num_warmup_steps=args.warmup_steps,
 94 |         num_training_steps=t_total
 95 |     )
 96 |     # Train!
 97 |     logger.info("***** Running training *****")
 98 |     logger.info(f"Num examples - {len(train_dataset)}")
 99 |     logger.info(f"Num Epochs - {args.num_train_epochs}")
100 |     logger.info(f"Total optimization steps - {t_total}")
101 |     with open(os.path.join(args.output_dir, 'args.txt'), 'wt') as f:
102 |         f.write(str(args))
103 |     
104 |     total_loss = 0.
105 |     best_avg_rouge = 0.
106 |     for epoch in range(args.num_train_epochs):
107 |         print(f"Epoch {epoch+1}/{args.num_train_epochs}\n" + 30 * "-")
108 |         total_loss = train_loop(args, train_dataloader, model, optimizer, lr_scheduler, epoch, total_loss)
109 |         dev_rouges = test_loop(args, dev_dataloader, model, tokenizer)
110 |         logger.info(f"Dev Rouge1: {dev_rouges['rouge-1']:>0.2f} Rouge2: {dev_rouges['rouge-2']:>0.2f} RougeL: {dev_rouges['rouge-l']:>0.2f}")
111 |         rouge_avg = dev_rouges['avg']
112 |         if rouge_avg > best_avg_rouge:
113 |             best_avg_rouge = rouge_avg
114 |             logger.info(f'saving new weights to {args.output_dir}...\n')
115 |             save_weight = f'epoch_{epoch+1}_dev_rouge_avg_{rouge_avg:0.4f}_weights.bin'
116 |             torch.save(model.state_dict(), os.path.join(args.output_dir, save_weight))
117 |     logger.info("Done!")
118 | 
def test(args, test_dataset, model, tokenizer, save_weights:list):
    """Evaluate every saved checkpoint on the test set and log its ROUGE.

    Args:
        args: Namespace providing ``output_dir`` and ``device``, plus whatever
            the data loader and evaluation loop read.
        test_dataset: Dataset to evaluate on.
        model: Model whose weights are replaced by each checkpoint in turn.
        tokenizer: Tokenizer passed through to the loader and the evaluation.
        save_weights: Checkpoint file names located inside ``args.output_dir``.
    """
    test_dataloader = get_dataLoader(args, test_dataset, model, tokenizer, shuffle=False)
    logger.info('***** Running testing *****')
    for save_weight in save_weights:
        logger.info(f'loading weights from {save_weight}...')
        # map_location lets checkpoints saved on a GPU be loaded when the
        # script has fallen back to the CPU.
        state_dict = torch.load(
            os.path.join(args.output_dir, save_weight),
            map_location=args.device,
        )
        model.load_state_dict(state_dict)
        test_rouges = test_loop(args, test_dataloader, model, tokenizer)
        logger.info(f"Test Rouge1: {test_rouges['rouge-1']:>0.2f} Rouge2: {test_rouges['rouge-2']:>0.2f} RougeL: {test_rouges['rouge-l']:>0.2f}")
127 | 
def predict(args, document:str, model, tokenizer):
    """Generate a summary string for a single document.

    Args:
        args: Namespace providing ``max_input_length``, ``max_target_length``,
            ``beam_search_size``, ``no_repeat_ngram_size`` and ``device``.
        document: Source text to summarize.
        model: Model used for ``generate``.
        tokenizer: Used to encode the document and decode the output ids.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    inputs = tokenizer(
        document,
        max_length=args.max_input_length,
        truncation=True,
        return_tensors="pt"
    )
    inputs = inputs.to(args.device)
    with torch.no_grad():
        # The result is converted straight to a NumPy array, so no
        # tuple-unwrapping step is needed afterwards.
        generated_tokens = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=args.max_target_length,
            num_beams=args.beam_search_size,
            no_repeat_ngram_size=args.no_repeat_ngram_size,
        ).cpu().numpy()
    decoded_preds = tokenizer.decode(
        generated_tokens[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return decoded_preds
152 | 
if __name__ == '__main__':
    # Entry point: parse options, load the pretrained model, then run the
    # requested stages (train / test / predict) in that order.
    args = parse_args()
    if args.do_train and os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(f'Output directory ({args.output_dir}) already exists and is not empty.')
    # makedirs also creates missing parent directories of a nested path.
    os.makedirs(args.output_dir, exist_ok=True)
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    args.n_gpu = torch.cuda.device_count()
    logger.warning(f'Using {args.device} device, n_gpu: {args.n_gpu}')
    # Set seed
    seed_everything(args.seed)
    # Load pretrained model and tokenizer
    logger.info(f'loading pretrained model and tokenizer of {args.model_type} ...')
    tokenizer = AutoTokenizer.from_pretrained(args.model_checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(args.model_checkpoint).to(args.device)
    # Training
    if args.do_train:
        # Set seed again so training starts from the same RNG state
        # regardless of what model loading consumed.
        seed_everything(args.seed)
        train_dataset = LCSTS(args.train_file)
        dev_dataset = LCSTS(args.dev_file)
        train(args, train_dataset, dev_dataset, model, tokenizer)
    # Testing: every '.bin' checkpoint found in the output directory is used.
    save_weights = [file for file in os.listdir(args.output_dir) if file.endswith('.bin')]
    if args.do_test:
        test_dataset = LCSTS(args.test_file)
        test(args, test_dataset, model, tokenizer, save_weights)
    # Predicting
    if args.do_predict:
        test_dataset = LCSTS(args.test_file)
        for save_weight in save_weights:
            logger.info(f'loading weights from {save_weight}...')
            # map_location lets checkpoints saved on a GPU be loaded when the
            # script has fallen back to the CPU.
            state_dict = torch.load(
                os.path.join(args.output_dir, save_weight),
                map_location=args.device,
            )
            model.load_state_dict(state_dict)
            logger.info(f'predicting labels of {save_weight}...')

            results = []
            model.eval()
            for s_idx in tqdm(range(len(test_dataset))):
                sample = test_dataset[s_idx]
                pred_summ = predict(args, sample['content'], model, tokenizer)
                results.append({
                    "sentence": sample['content'],
                    "prediction": pred_summ,
                    "summarization": sample['title']
                })
            # One JSON object per line, one output file per checkpoint.
            with open(os.path.join(args.output_dir, save_weight + '_test_data_pred.json'), 'wt', encoding='utf-8') as f:
                for example_result in results:
                    f.write(json.dumps(example_result, ensure_ascii=False) + '\n')


--------------------------------------------------------------------------------
/mt5_summary/run.sh:
--------------------------------------------------------------------------------
 1 | export OUTPUT_DIR=./summ_mt5_results/
 2 | 
 3 | python3 run_summarization_mt5.py \
 4 |     --output_dir=$OUTPUT_DIR \
 5 |     --model_type=mT5 \
 6 |     --model_checkpoint=csebuetnlp/mT5_multilingual_XLSum \
 7 |     --train_file=../../data/lcsts_tsv/data1.tsv \
 8 |     --dev_file=../../data/lcsts_tsv/data2.tsv \
 9 |     --test_file=../../data/lcsts_tsv/data3.tsv \
10 |     --max_input_length=512 \
11 |     --max_target_length=32 \
12 |     --learning_rate=1e-5 \
13 |     --num_train_epochs=3 \
14 |     --batch_size=32 \
15 |     --beam_search_size=4 \
16 |     --no_repeat_ngram_size=2 \
17 |     --do_train \
18 |     --warmup_proportion=0. \
19 |     --seed=42


--------------------------------------------------------------------------------
/mt5_summary/tools.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    tools.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2024-10-06
 5 | # description: 关键工具
 6 | # reference:   https://github.com/jsksxs360/How-to-use-Transformers
 7 | import random
 8 | import os
 9 | import numpy as np
10 | import torch
11 | 
def seed_everything(seed=1029):
    """Seed the Python, NumPy and PyTorch random generators with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    remaining_seeders = (
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in remaining_seeders:
        apply_seed(seed)
    # Some cuDNN methods can be random even after fixing the seed unless
    # cuDNN is told to be deterministic.
    torch.backends.cudnn.deterministic = True


--------------------------------------------------------------------------------
/vec_searcher/script/build_vec_index.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    build_vec_index.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2024-09-07
 5 | # description: 构造向量索引脚本
 6 | 
 7 | import json,torch,copy,random
 8 | from tqdm import tqdm
 9 | from loguru import logger
10 | 
11 | from utils.data_processing import load_toutiao_data
12 | from vec_model.vec_model import VectorizeModel
13 | from vec_searcher.vec_searcher import VecSearcher 
14 | 
if __name__ == "__main__":
    # 0. Required configuration
    MODE = "DEBUG"

    VERSION = "20240907"
    VEC_MODEL_PATH = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext"
    SOURCE_INDEX_DATA_PATH = "./data/toutiao_cat_data/toutiao_cat_data.txt" # data source: https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset
    VEC_INDEX_DATA = "vec_index_toutiao_{}_{}".format(VERSION,MODE)
    RANDOM_SEED = 100

    DEVICE = torch.device('cuda' if torch.cuda.is_available() else "cpu")
    # Category system: [category code, Chinese name, English name].
    # Codes 105 and 111 are intentionally left out of this list.
    CLASS_INFO = [
        ["100", '民生-故事', 'news_story'],
        ["101", '文化-文化', 'news_culture'],
        ["102", '娱乐-娱乐', 'news_entertainment'],
        ["103", '体育-体育', 'news_sports'],
        ["104", '财经-财经', 'news_finance'],
        ["106", '房产-房产', 'news_house'],
        ["107", '汽车-汽车', 'news_car'],
        ["108", '教育-教育', 'news_edu' ],
        ["109", '科技-科技', 'news_tech'],
        ["110", '军事-军事', 'news_military'],
        ["112", '旅游-旅游', 'news_travel'],
        ["113", '国际-国际', 'news_world'],
        ["114", '证券-股票', 'stock'],
        ["115", '农业-三农', 'news_agriculture'],
        ["116", '电竞-游戏', 'news_game']
    ]
    # Category code -> Chinese category name.
    ID2CN_MAPPING = {info[0]: info[1] for info in CLASS_INFO}

    # 1. Load model and data
    # 1.1 Load the model; the index dimension is probed from a sample vector
    vec_model = VectorizeModel(VEC_MODEL_PATH, DEVICE)
    index_dim = len(vec_model.predict_vec("你好啊")[0])
    # 1.2 Load the data
    toutiao_index_data = load_toutiao_data(SOURCE_INDEX_DATA_PATH)
    source_index_data = copy.deepcopy(toutiao_index_data)
    logger.info("load data done: {}".format(len(source_index_data)))
    if MODE == "DEBUG":
        # Shuffle with the configured seed so the 10000-item debug subset is
        # the same on every run.
        random.Random(RANDOM_SEED).shuffle(source_index_data)
        source_index_data = source_index_data[:10000]

    # 2. Create the index and fill it
    # 2.1 Build the index
    vec_searcher = VecSearcher()
    vec_searcher.build(index_dim, VEC_INDEX_DATA)

    # 2.2 Compute the vectors; each entry is [title, fields, vector]
    vectorize_result = []
    for q in tqdm(source_index_data, desc="VEC MODEL RUNNING"):
        vec = vec_model.predict_vec(q[0]).cpu().numpy()
        vectorize_result.append([q[0], q[1], vec])

    # 2.3 Insert into the index; the stored document is [title, fields]
    for title, fields, vec in tqdm(vectorize_result, desc="INSERT INTO INDEX"):
        vec_searcher.insert(vec, [title, fields])

    # 3. Save
    # 3.1 Persist the index
    vec_searcher.save()
    logger.info("build done: {}".format(VEC_INDEX_DATA))
85 | 


--------------------------------------------------------------------------------
/vec_searcher/searcher.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    searcher.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2023-12-12
 5 | # description: 核心检索器
 6 | 
 7 | import json,requests,copy
 8 | import numpy as np
 9 | from loguru import logger
10 | from vec_searcher.vec_searcher import VecSearcher
11 | from vec_model.vec_model import VectorizeModel
12 | 
class Searcher:
    """Two-stage retriever: vector recall followed by similarity re-ranking."""

    def __init__(self, model_path, vec_search_path):
        # Vectorization model used for both query encoding and re-ranking.
        self.vec_model = VectorizeModel(model_path)
        logger.info("load vec_model done")

        # Vector searcher loaded from an existing saved index.
        vec_searcher = VecSearcher()
        vec_searcher.load(vec_search_path)
        self.vec_searcher = vec_searcher
        logger.info("load vec_searcher done")

    def rank(self, query, recall_result):
        """Re-score recalled items against ``query`` and sort them.

        Each returned item is a deep copy of the recalled item with the new
        similarity appended as its fourth element; items are ordered by that
        similarity, highest first. ``recall_result`` itself is not modified.
        """
        scored = []
        for candidate in recall_result:
            # candidate[1][0] is the stored text of the recalled document.
            similarity = self.vec_model.predict_sim(query, candidate[1][0])
            scored.append(copy.deepcopy(candidate + [similarity]))
        return sorted(scored, key=lambda item: item[3], reverse=True)

    def search(self, query, nums=3):
        """Recall ``nums`` candidates for ``query`` and return them re-ranked."""
        logger.info("request: {}".format(query))

        query_vec = self.vec_model.predict_vec(query).cpu().numpy()
        candidates = self.vec_searcher.search(query_vec, nums)
        rank_result = self.rank(query, candidates)

        logger.info("response: {}".format(rank_result))
        return rank_result
44 | 
if __name__ == "__main__":
    # Manual smoke test: load the model and a saved index, run one query.
    # NOTE(review): the index name carries the date 20240702; confirm that an
    # index with this name exists under the index folder before running.
    model_path = "C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext"
    index_name = "vec_index_toutiao_20240702_DEBUG"
    demo_searcher = Searcher(model_path, index_name)
    print(demo_searcher.search("小产权房"))


--------------------------------------------------------------------------------
/vec_searcher/utils/data_processing.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    data_processing.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2024-06-25
 5 | # description: 数据处理函数
 6 | 
 7 | def load_toutiao_data(path):
 8 |     source_data = []
 9 |     with open(path, encoding="utf8") as f:
10 |         for line in f:
11 |             ll = line.strip().split("_!_") # 新闻ID，分类code，分类名称，新闻字符串（仅含标题），新闻关键词
12 |             source_data.append([ll[3], ll])
13 |     return source_data
14 | 
def load_class_def(path):
    """Load a tab-separated definition file as ``{first column: second column}``.

    Lines are stripped before splitting on tabs. When a key is repeated, the
    value from the last occurrence is kept.
    """
    with open(path, encoding="utf8") as handle:
        return dict(
            (cols[0], cols[1])
            for cols in (text.strip().split("\t") for text in handle)
        )


--------------------------------------------------------------------------------
/vec_searcher/vec_model/simcse_model.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | from loguru import logger
 4 | from tqdm import tqdm
 5 | from transformers import BertConfig, BertModel, BertTokenizer
 6 | 
 7 | class SimcseModel(nn.Module):
 8 |     # https://blog.csdn.net/qq_44193969/article/details/126981581
 9 |     def __init__(self, pretrained_bert_path, pooling="cls") -> None:
10 |         super(SimcseModel, self).__init__()
11 | 
12 |         self.pretrained_bert_path = pretrained_bert_path
13 |         self.config = BertConfig.from_pretrained(self.pretrained_bert_path)
14 |         
15 |         self.model = BertModel.from_pretrained(self.pretrained_bert_path, config=self.config)
16 |         self.model.eval()
17 |         
18 |         # self.model = None
19 |         self.pooling = pooling
20 |     
21 |     def forward(self, input_ids, attention_mask, token_type_ids):
22 |         out = self.model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
23 | 
24 |         return out.last_hidden_state[:, 0]


--------------------------------------------------------------------------------
/vec_searcher/vec_model/vec_model.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | import torch.nn.functional as F
  4 | from loguru import logger
  5 | 
  6 | from transformers import BertTokenizer
  7 | 
  8 | from vec_model.simcse_model import SimcseModel
  9 | 
 10 | import onnxruntime as ort
 11 | 
 12 | class VectorizeModel:
 13 |     def __init__(self, ptm_model_path, device = "cpu") -> None:
 14 |         self.tokenizer = BertTokenizer.from_pretrained(ptm_model_path)
 15 |         self.model = SimcseModel(pretrained_bert_path=ptm_model_path, pooling="cls")
 16 |         # print(self.model)
 17 |         self.model.eval()
 18 |         
 19 |         self.DEVICE = torch.device('cuda' if torch.cuda.is_available() else "cpu")
 20 |         # self.DEVICE = device
 21 |         logger.info(self.DEVICE)
 22 |         self.model.to(self.DEVICE)
 23 |         
 24 |         self.pdist = nn.PairwiseDistance(2)
 25 |     
 26 |     def predict_vec(self,query):
 27 |         q_id = self.tokenizer(query, max_length = 200, truncation=True, padding="max_length", return_tensors='pt')
 28 |         with torch.no_grad():
 29 |             q_id_input_ids = q_id["input_ids"].squeeze(1).to(self.DEVICE)
 30 |             q_id_attention_mask = q_id["attention_mask"].squeeze(1).to(self.DEVICE)
 31 |             q_id_token_type_ids = q_id["token_type_ids"].squeeze(1).to(self.DEVICE)
 32 |             q_id_pred = self.model(q_id_input_ids, q_id_attention_mask, q_id_token_type_ids)
 33 | 
 34 |         return q_id_pred
 35 | 
 36 |     def predict_vec_request(self, query):
 37 |         q_id_pred = self.predict_vec(query)
 38 |         return q_id_pred.cpu().numpy().tolist()
 39 |     
 40 |     def predict_sim(self, q1, q2):
 41 |         q1_v = self.predict_vec(q1)
 42 |         q2_v = self.predict_vec(q2)
 43 |         sim = F.cosine_similarity(q1_v[0], q2_v[0], dim=-1)
 44 |         return sim.cpu().numpy().tolist()
 45 | 
 46 | class VectorizeModel_v2(VectorizeModel):
 47 |     def __init__(self, ptm_model_path, onnx_path, providers=['CUDAExecutionProvider']) -> None:
 48 |         # ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
 49 |         self.tokenizer = BertTokenizer.from_pretrained(ptm_model_path)
 50 |         self.model = ort.InferenceSession(onnx_path, providers=providers)
 51 |         
 52 |         self.pdist = nn.PairwiseDistance(2)
 53 |     
 54 |     def _to_numpy(self, tensor):
 55 |         return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()
 56 |     
 57 |     def predict_vec(self,query):
 58 |         q_id = self.tokenizer(query, max_length = 200, truncation=True, padding="max_length", return_tensors='pt')
 59 |         input_feed = {
 60 |             self.model.get_inputs()[0].name: self._to_numpy(q_id["input_ids"]),
 61 |             self.model.get_inputs()[1].name: self._to_numpy(q_id["attention_mask"]),
 62 |             self.model.get_inputs()[2].name: self._to_numpy(q_id["token_type_ids"]),
 63 |         }
 64 |         return torch.tensor(self.model.run(None, input_feed=input_feed)[0])
 65 |     
 66 |     def predict_sim(self, q1, q2):
 67 |         q1_v = self.predict_vec(q1)
 68 |         q2_v = self.predict_vec(q2)
 69 |         sim = F.cosine_similarity(q1_v[0], q2_v[0], dim=-1)
 70 |         return sim.numpy().tolist()
 71 | 
 72 | if __name__ == "__main__":
 73 |     import time,random
 74 |     from tqdm import tqdm
 75 |     device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
 76 |     # device = ""
 77 |     # vec_model = VectorizeModel('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext', device=device)
 78 |     vec_model = VectorizeModel_v2('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext',
 79 |                                  "./data/model_simcse_roberta_output_20240211.onnx",providers=['CUDAExecutionProvider'])
 80 |     # vec_model = VectorizeModel_v2('C:/work/tool/huggingface/models/simcse-chinese-roberta-wwm-ext',
 81 |     #                              "./data/model_simcse_roberta_output_20240211.onnx",providers=['TensorrtExecutionProvider'])
 82 |     # 单测
 83 |     # q = ["你好啊"]
 84 |     # print(vec_model.predict_vec(q))
 85 |     # print(vec_model.predict_sim("你好呀","你好啊"))
 86 |     tmp_queries = ["你好啊", "今天天气怎么样", "我要暴富"]
 87 |     # 开始批跑
 88 |     batch_sizes = [1,2,4,8,16]
 89 |     for b in batch_sizes:
 90 |         for i in tqdm(range(100),desc="warmup"):
 91 |             tmp_q = []
 92 |             for i in range(b):
 93 |                 tmp_q.append(random.choice(tmp_queries))
 94 |             vec_model.predict_vec(tmp_q)
 95 |         for i in tqdm(range(1000),desc="batch_size={}".format(b)):
 96 |             tmp_q = []
 97 |             for i in range(b):
 98 |                 tmp_q.append(random.choice(tmp_queries))
 99 |             vec_model.predict_vec(tmp_q)
100 | 


--------------------------------------------------------------------------------
/vec_searcher/vec_searcher/vec_index.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Filename:    vec_index.py
 3 | # Author:      ZENGGUANRONG
 4 | # Date:        2023-12-12
 5 | # description: 向量召回索引-FAISS
 6 | 
 7 | import faiss
 8 | from loguru import logger
 9 | 
10 | class VecIndex:
11 |     def __init__(self) -> None:
12 |         self.index = ""
13 |     
14 |     def build(self, index_dim):
15 |         description = "HNSW64"
16 |         measure = faiss.METRIC_L2
17 |         self.index = faiss.index_factory(index_dim, description, measure)
18 |     
19 |     def insert(self, vec):
20 |         self.index.add(vec)
21 |     
22 |     def batch_insert(self, vecs):
23 |         self.index.add(vecs)
24 |     
25 |     def load(self, read_path):
26 |         # read_path: XXX.index
27 |         self.index = faiss.read_index(read_path)
28 | 
29 |     def save(self, save_path):
30 |         # save_path: XXX.index
31 |         faiss.write_index(self.index, save_path)
32 |     
33 |     def search(self, vec, num):
34 |         # id, distance
35 |         return self.index.search(vec, num)


--------------------------------------------------------------------------------
/vec_searcher/vec_searcher/vec_searcher.py:
--------------------------------------------------------------------------------
 1 | import os, json
 2 | from loguru import logger
 3 | from vec_searcher.vec_index import VecIndex
 4 | 
 5 | class VecSearcher:
 6 |     def __init__(self):
 7 |         self.invert_index = VecIndex() # 检索倒排，使用的是索引是VecIndex
 8 |         self.forward_index = [] # 检索正排，实质上只是个list，通过ID获取对应的内容
 9 |         self.INDEX_FOLDER_PATH_TEMPLATE = "data/index/{}"
10 | 
11 |     def build(self, index_dim, index_name):
12 |         self.index_name = index_name
13 |         self.index_folder_path = self.INDEX_FOLDER_PATH_TEMPLATE.format(index_name)
14 |         if not os.path.exists(self.index_folder_path) or not os.path.isdir(self.index_folder_path):
15 |             os.mkdir(self.index_folder_path)
16 | 
17 |         self.invert_index = VecIndex()
18 |         self.invert_index.build(index_dim)
19 | 
20 |         self.forward_index = []
21 |     
22 |     def insert(self, vec, doc):
23 |         self.invert_index.insert(vec)
24 |         # self.invert_index.batch_insert(vecs)
25 | 
26 |         self.forward_index.append(doc)
27 |     
28 |     def save(self):
29 |         with open(self.index_folder_path + "/forward_index.txt", "w", encoding="utf8") as f:
30 |             for data in self.forward_index:
31 |                 f.write("{}\n".format(json.dumps(data, ensure_ascii=False)))
32 | 
33 |         self.invert_index.save(self.index_folder_path + "/invert_index.faiss")
34 |     
35 |     def load(self, index_name):
36 |         self.index_name = index_name
37 |         self.index_folder_path = self.INDEX_FOLDER_PATH_TEMPLATE.format(index_name)
38 | 
39 |         self.invert_index = VecIndex()
40 |         self.invert_index.load(self.index_folder_path + "/invert_index.faiss")
41 | 
42 |         self.forward_index = []
43 |         with open(self.index_folder_path + "/forward_index.txt", encoding="utf8") as f:
44 |             for line in f:
45 |                 self.forward_index.append(json.loads(line.strip()))
46 |     
47 |     def search(self, vecs, nums = 5):
48 |         search_res = self.invert_index.search(vecs, nums)
49 |         recall_list = []
50 |         for idx in range(nums):
51 |             # recall_list_idx, recall_list_detail, distance
52 |             recall_list.append([search_res[1][0][idx], self.forward_index[search_res[1][0][idx]], search_res[0][0][idx]])
53 |         # recall_list = list(filter(lambda x: x[2] < 100, result))
54 | 
55 |         return recall_list


--------------------------------------------------------------------------------