├── __init__.py
├── best_model.pth
├── output
    └── logs
    │   └── Experiment_log.log
├── .DS_Store
├── test_predict.py
├── app.py
├── logger.py
├── Readme.md
├── config.py
├── prompt_model.py
├── utils.py
├── predict.py
├── processer.py
├── conlleval.py
├── main.py
└── LICENSE


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/best_model.pth:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/output/logs/Experiment_log.log:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YuxingLu613/Prompt-NER-Chinese/HEAD/.DS_Store


--------------------------------------------------------------------------------
/test_predict.py:
--------------------------------------------------------------------------------
import requests

# Smoke test for the NER service: POST one sample sentence to the endpoint
# exposed by app.py and print the raw JSON response.
url = "http://127.0.0.1:5000/predict_ner"
data = {
    'input': '我得了肠胃炎，现在要去做穿肠手术。我昨天吃了二甲双胍，今天准备去拍CT。三个月前我被切除了胃和肾脏，现在觉得特别空虚，希望能做一个白细胞检查。',
}
# requests applies no timeout by default; without one a hung server would
# block this script forever.
response = requests.post(url, data=data, timeout=60)
# Fail loudly on 4xx/5xx instead of printing an error page as if it were a result.
response.raise_for_status()

print(response.text)


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | from multiprocessing.spawn import import_main_path
 2 | from flask import Flask,jsonify,request
 3 | from pip import main
 4 | from predict import build_model, predict
 5 | 
 6 | 
 7 | app=Flask(__name__)
 8 | app.config['JSON_AS_ASCII'] = False
 9 | 
10 | 
@app.route('/')
def hello():
    """Answer requests to the root URL with a fixed greeting string."""
    greeting = 'Hello World!'
    return greeting
14 | 
@app.route('/predict_ner', methods=["POST"])
def predict_ner():
    """Run NER on the ``input`` form field and return the tagged characters.

    Responds with ``{"output": ...}`` holding whatever ``predict`` returns,
    or with HTTP 400 and an ``error`` message when the form field is
    missing or empty.
    """
    global model
    text = request.form.get("input")
    if not text:
        # predict() would otherwise fail on None and surface as an opaque 500.
        return jsonify(error="missing required form field 'input'"), 400
    if "model" not in globals():
        # The module-level ``model`` is only created under ``__main__``;
        # load it lazily so the route also works when the app is imported
        # by another launcher.
        model = build_model()
    return jsonify(output=predict(text, model))
21 | 
if __name__=="__main__":
    # Load the model once at start-up; predict_ner reads it through the
    # module-level global ``model``.
    model=build_model()
    # Bind to the loopback interface only.
    app.run(host="127.0.0.1")


--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | from config import Config
 4 | 
 5 | config=Config()
 6 | 
 7 | if not os.path.exists(config.log_path):
 8 |     os.makedirs(config.log_path)
 9 | 
10 | logger=logging.getLogger("client_log")
11 | 
12 | logger.setLevel(logging.INFO)
13 | 
14 | stream_handler=logging.StreamHandler()
15 | log_file_handler=logging.FileHandler(filename=os.path.join(config.log_path,config.log_name),encoding="utf-8")
16 | 
17 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(message)s")
18 | 
19 | stream_handler.setFormatter(formatter)
20 | log_file_handler.setFormatter(formatter)
21 | 
22 | logger.addHandler(stream_handler)
23 | logger.addHandler(log_file_handler)


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | # Prompt-NER on Chinese EHR data
 2 | 
 3 | ### 简介
 4 | 
 5 | 该仓库使用[BERT](https://github.com/google-research/bert)作为预训练模型，并使用Prompt预训练方法进行命名实体识别任务。
 6 | 
 7 | 
 8 | 
 9 | ### 文件夹介绍：
10 | 
11 | ```
12 | .
13 | ├── .DS_Store
14 | ├── __init__.py
15 | ├── app.py 	# 接口文件
16 | ├── best_model.pth	# 模型文件（需要自己训练）
17 | ├── config.py	# 配置文件
18 | ├── conlleval.py	# 评价指标
19 | ├── data	# 数据集
20 | │   ├── eval.txt # 处理好的验证集
21 | │   ├── test.txt	# 处理好的测试集
22 | │   └── train.txt	# 处理好的训练集
23 | ├── logger.py	# 日志文件
24 | ├── main.py	# 主文件
25 | ├── output	# 输出
26 | │   └── logs
27 | │       └── Experiment_log.log
28 | ├── predict.py	# 预测程序
29 | ├── processer.py	# 数据预处理文件
30 | ├── prompt_model.py	# 模型结构
31 | ├── test_predict.py	# 接口测试文件
32 | └── utils.py	# 方法函数
33 | ```
34 | 
35 | 
36 | 
37 | ### Requirements:
38 | 
39 | python
40 | 
41 | torch
42 | 
43 | scikit-learn
44 | 
45 | pandas
46 | 
47 | transformers
48 | 
49 | 
50 | 
51 | ### 直接使用方法：
52 | 
53 | 使用python运行main.py，获得模型文件。
54 | 
55 | 使用python运行app.py，并且在test_predict.py中修改input数据，获得返回的结果。
56 | 
57 | 
58 | 
59 | ### 结果：
60 | 
61 | | 训练方法       | F1     |
62 | | -------------- | ------ |
63 | | 常规预训练方法 | 0.7617 |
64 | | Prompt训练方法 | 0.8189 |


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import os
 3 | import threading
 4 | 
 5 | 
 6 | class Config(object):
 7 |     _instance_lock = threading.Lock()
 8 |     _init_flag = False
 9 | 
10 |     def __init__(self):
11 |         if not Config._init_flag:
12 |             Config._init_flag = True
13 |             root_path = str(os.getcwd()).replace("\\", "/")
14 |             if 'source' in root_path.split('/'):
15 |                 self.base_path = os.path.abspath(os.path.join(os.path.pardir))
16 |             else:
17 |                 self.base_path = os.path.abspath(os.path.join(os.getcwd()))
18 |             self._init_train_config()
19 | 
20 |     def __new__(cls, *args, **kwargs):
21 |         """
22 |         单例类
23 |         :param args:
24 |         :param kwargs:
25 |         :return:
26 |         """
27 |         if not hasattr(Config, '_instance'):
28 |             with Config._instance_lock:
29 |                 if not hasattr(Config, '_instance'):
30 |                     Config._instance = object.__new__(cls)
31 |         return Config._instance
32 | 
33 |     def _init_train_config(self):
34 |         self.label_list = ["O","B-手术","I-手术","B-药物","I-药物","B-实验室检验","I-实验室检验","B-影像检查","I-影像检查","B-解剖部位","I-解剖部位","B-疾病和诊断","I-疾病和诊断"]
35 |         self.use_gpu = True
36 |         self.device = "cpu"
37 |         self.sep = " "
38 | 
39 | 
40 |         # 输入数据集、输出目录
41 |         self.train_file = os.path.join(self.base_path, 'data', 'train.txt')
42 |         self.eval_file = os.path.join(self.base_path, 'data', 'eval.txt')
43 |         self.test_file = os.path.join(self.base_path, 'data', 'test.txt')
44 |         self.log_path = os.path.join(self.base_path, 'output', "logs")
45 |         self.log_name="Experiment_log.log"
46 |         self.output_path = os.path.join(self.base_path, 'output', datetime.datetime.now().strftime('%Y%m%d%H%M%S'))
47 | 
48 |         model_list=["bert-base-chinese","hfl/chinese-bert-wwm-ext","hfl/chinese-roberta-wwm-ext","voidful/albert_chinese_base","hfl/chinese-electra-180g-base-discriminator","nghuyong/ernie-1.0"]
49 | 
50 |         # Pretrained model name or path if not the same as model_name
51 |         # self.model_name_or_path = "/home/luyx/PLM/Continue_Pretrain/training_output"
52 |         self.model_name_or_path = model_list[0]
53 | 
54 |         # 以下是模型训练参数
55 |         self.do_train = True
56 |         self.do_eval = True
57 |         self.do_test = True
58 |         self.clean = True
59 |         self.need_birnn = True
60 |         self.do_lower_case = True
61 |         self.rnn_dim = 768
62 |         self.max_seq_length = 64
63 |         self.train_batch_size = 3072
64 |         self.eval_batch_size = 3072
65 |         self.num_train_epochs = 30
66 |         self.gradient_accumulation_steps = 1
67 |         self.learning_rate = 5e-6
68 |         self.adam_epsilon = 1e-8
69 |         self.warmup_steps = 0
70 |         self.logging_steps = 100
71 |         self.train_eval_split=0.8
72 |         self.min_sequence_length=10


--------------------------------------------------------------------------------
/prompt_model.py:
--------------------------------------------------------------------------------
 1 | from re import sub
 2 | from select import select
 3 | from transformers import BertForMaskedLM,AutoConfig,AutoModel,AutoModelForMaskedLM,BertForTokenClassification
 4 | import torch
 5 | from torch import nn
 6 | from torch.utils.data import DataLoader, SequentialSampler
 7 | from tqdm import tqdm
 8 | import conlleval
 9 | 
class Prompt_Based_NER(nn.Module):
    """Prompt-style tagger: a masked-LM followed by a small classification head.

    The masked-LM vocabulary logits at one position per example (the
    ``[MASK]`` slot of the prompt) are projected to ``len(config.label_list)``
    label scores by ``linear1 -> dropout -> linear2``.
    """

    def __init__(self, config):
        """
        :param config: object providing ``model_name_or_path``, ``label_list``
                       and ``device``
        """
        super(Prompt_Based_NER, self).__init__()
        self.config = config
        self.plm_config = AutoConfig.from_pretrained(self.config.model_name_or_path)
        self.plm_model = BertForMaskedLM.from_pretrained(self.config.model_name_or_path)
        self.dropout = nn.Dropout(self.plm_config.hidden_dropout_prob)
        self.linear1 = nn.Linear(in_features=self.plm_config.vocab_size, out_features=256)
        self.linear2 = nn.Linear(in_features=256, out_features=len(self.config.label_list))
        # Not applied in forward(); kept so the module's attributes stay unchanged.
        self.softmax = nn.Softmax(dim=-1)

    def _classify(self, vocab_logits):
        """Map vocabulary logits (last dim = vocab size) to label scores."""
        return self.linear2(self.dropout(self.linear1(vocab_logits)))

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_index=None):
        """Return unnormalised label scores for the masked position of each example.

        :param input_ids: token ids, one row per example
        :param token_type_ids: optional segment ids, forwarded to the masked-LM
        :param attention_mask: optional attention mask, forwarded to the masked-LM
        :param masked_index: 1-D long tensor with one position per example
        :return: tensor with one row of label scores per example
        :raises ValueError: if ``masked_index`` is not supplied
        """
        if masked_index is None:
            raise ValueError("masked_index is required to select the [MASK] position")
        vocab_logits = self.plm_model(
            input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
        masked_index = masked_index.to(vocab_logits.device)
        # Pick row ``masked_index[b]`` of example ``b`` with one indexing
        # operation instead of a Python loop of torch.cat calls.
        batch_rows = torch.arange(vocab_logits.size(0), device=vocab_logits.device)
        return self._classify(vocab_logits[batch_rows, masked_index])

    def predict(self, input_ids, token_type_ids=None, attention_mask=None, masked_index=None):
        """Return arg-max label ids.

        With ``masked_index`` the result has one id per example (the same
        selection as :meth:`forward`); without it the classification head is
        applied at every position and one id per token is returned.
        """
        if masked_index is not None:
            scores = self.forward(input_ids, token_type_ids=token_type_ids,
                                  attention_mask=attention_mask, masked_index=masked_index)
        else:
            vocab_logits = self.plm_model(
                input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
            scores = self._classify(vocab_logits)
        return torch.argmax(scores, dim=-1)
43 | 
def evaluate(config, dataloader, model, id2label, all_ori_tokens):
    """Score ``model`` on ``dataloader`` with the CoNLL evaluation script.

    :param config: object whose ``device`` attribute receives the batches
    :param dataloader: yields 6-tuples ``(input_ids, token_type_ids,
                       attention_mask, label_ids, <unused>, masked_index)``
    :param model: module whose ``forward`` returns one score row per example
    :param id2label: mapping from label id to label string
    :param all_ori_tokens: token sequences, zipped with the collected labels
    :return: ``(overall, by_type)`` as returned by ``conlleval.metrics``
    """
    gold_labels = []
    predicted_labels = []
    model.eval()

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids, token_type_ids, attention_mask, label_ids, _, masked_index = batch
        input_ids = input_ids.to(config.device)
        token_type_ids = token_type_ids.to(config.device)
        attention_mask = attention_mask.to(config.device)
        label_ids = label_ids.to(config.device)
        masked_index = masked_index.to(config.device)

        with torch.no_grad():
            scores = model.forward(input_ids, token_type_ids, attention_mask, masked_index)
            predicted_ids = torch.argmax(scores, dim=-1)

        # Each example contributes a one-element label list.
        predicted_labels.extend([id2label[pid.item()]] for pid in predicted_ids)
        # The gold label is the position of the first 1 in the label row.
        gold_labels.extend([id2label[row.tolist().index(1)]] for row in label_ids)

    eval_lines = []
    for tokens, gold, guess in zip(all_ori_tokens, gold_labels, predicted_labels):
        eval_lines.extend(
            f"{tok} {g} {p}\n"
            for tok, g, p in zip(tokens, gold, guess)
            if tok not in ("[CLS]", "[SEP]")
        )
        # Blank line separates examples for conlleval.
        eval_lines.append("\n")

    # eval the model
    counts = conlleval.evaluate(eval_lines)
    conlleval.report(counts)

    # namedtuple('Metrics', 'tp fp fn prec rec fscore')
    overall, by_type = conlleval.metrics(counts)
    return overall, by_type


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import json
  3 | import os
  4 | import pickle
  5 | from re import I
  6 | import time
  7 | from datetime import timedelta,datetime
  8 | import numpy as np
  9 | import pandas as pd
 10 | from sklearn.model_selection import train_test_split
 11 | 
 12 | 
 13 | def load_file(file_path: str, separater: str = None):
 14 |     """
 15 |     读取文件；
 16 |     若sep为None，按行读取，返回文件内容列表，格式为:[xxx,xxx,xxx,...]
 17 |     若不为None，按行读取分隔，返回文件内容列表，格式为: [[xxx,xxx],[xxx,xxx],...]
 18 |     :param filepath:
 19 |     :param sep:
 20 |     :return:
 21 |     """
 22 |     with open(file_path,"r",encoding="utf-8-sig") as f:
 23 |         lines=f.readlines()
 24 |         if separater:
 25 |             return [line.strip().split(separater) for line in lines]
 26 |         else:
 27 |             return lines
 28 | 
 29 | 
 30 | def load_csv(file_path: str, is_tsv: bool = False):
 31 |     """
 32 |     加载csv文件为OrderDict()列表
 33 |     :param filepath:
 34 |     :param is_tsv:
 35 |     :return:
 36 |     """
 37 |     dialect = 'excel-tab' if is_tsv else 'excel'
 38 |     with open(file_path, "r", encoding='utf-8') as f:
 39 |         reader = csv.DictReader(f, dialect=dialect)
 40 |         return list(reader)
 41 | 
 42 | def save_csv(data: pd.DataFrame, file_path: str,sep: str=","):
 43 |     """
 44 |     将DataFrame数据保存到对应file_path下
 45 |     :param data:
 46 |     :para file_path:
 47 |     :return:
 48 |     """
 49 |     data.to_csv(file_path,sep=sep)
 50 | 
 51 | 
 52 | def load_json(filepath: str):
 53 |     """
 54 |     加载json文件
 55 |     :param filepath:
 56 |     :return:
 57 |     """
 58 |     with open(filepath, "r", encoding="utf-8") as f:
 59 |         return [json.loads(line.strip(), encoding="utf-8") for line in f.readlines()]
 60 | 
 61 | 
 62 | def save_json(list_data, filepath):
 63 |     """
 64 |     保存json文件
 65 |     :param list_data:
 66 |     :param filepath:
 67 |     :return:
 68 |     """
 69 |     with open(filepath, "w", encoding="utf-8") as f:
 70 |         for data in list_data:
 71 |             json_str = json.dumps(data, ensure_ascii=False)
 72 |             f.write("{}\n".format(json_str))
 73 |         f.flush()
 74 | 
 75 | 
 76 | def load_pkl(filepath):
 77 |     """
 78 |     加载pkl文件
 79 |     :param filepath:
 80 |     :return:
 81 |     """
 82 |     with open(filepath, 'rb') as f:
 83 |         data = pickle.load(f)
 84 |         return data
 85 | 
 86 | 
 87 | def save_pkl(data, filepath):
 88 |     """
 89 |     保存pkl文件，数据序列化
 90 |     :param data:
 91 |     :param filepath:
 92 |     :return:
 93 |     """
 94 |     with open(filepath, 'wb') as f:
 95 |         pickle.dump(data, f)
 96 | 
 97 | 
 98 | def calculate_distance(vector1, vector2,mode="cos"):
 99 |     """
100 |     计算两个向量的余弦相似度
101 |     :param vector1: 向量1
102 |     :param vector2: 向量2
103 |     :param mode: "cos"余弦相似度，"euc"欧氏距离
104 |     :return:
105 |     """
106 |     if mode=="cos":
107 |         distance = np.dot(vector1, vector2) / (np.linalg.norm(vector1) * (np.linalg.norm(vector2)))  # 余弦夹角
108 |     elif mode=="euc":
109 |         distance = np.sqrt(np.sum(np.square(vector1 - vector2)))  # 欧式距离
110 |     return distance
111 | 
112 | 
def split_data(data_set, ratio):
    """Randomly split ``data_set`` into two parts.

    :param data_set: indexable collection to split
    :param ratio: share of the data that goes into the first part
                  (a float in (0, 1), or an absolute sample count)
    :return: ``(first_part, second_part)``
    """
    # train_test_split treats every positional argument as another array to
    # split, so the ratio must be passed by keyword.
    split_data1, split_data2 = train_test_split(data_set, train_size=ratio)
    return split_data1, split_data2
122 | 
123 | 
def format_data(t: datetime):
    """Render ``t`` as ``YYYY-MM-DD HH:MM:SS``.

    :param t: the datetime to format
    :return: the formatted string
    """
    return f"{t:%Y-%m-%d %H:%M:%S}"
131 | 
132 | 
def get_used_dif(start_time):
    """Return the time elapsed since ``start_time``, rounded to whole seconds.

    :param start_time: a timestamp previously obtained from ``time.time()``
    :return: the elapsed time as a ``timedelta``
    """
    elapsed_seconds = time.time() - start_time
    whole_seconds = int(round(elapsed_seconds))
    return timedelta(seconds=whole_seconds)
142 | 
143 | 
def scan_filepath(path):
    """Recursively collect the paths of all regular files below ``path``.

    :param path: directory to scan
    :return: list of file paths, each built by joining onto ``path``
    """
    collected = []
    for entry in os.listdir(path):
        candidate = os.path.join(path, entry)
        if os.path.isdir(candidate):
            # Descend and splice the sub-directory's files in place.
            collected += scan_filepath(candidate)
        elif os.path.isfile(candidate):
            collected.append(candidate)
    return collected
158 | 


--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
  1 | from prompt_model import Prompt_Based_NER
  2 | import torch
  3 | from torch import nn
  4 | from config import Config
  5 | import os
  6 | from processer import get_label2id_id2label
  7 | from transformers import BertTokenizer
  8 | from tqdm import tqdm
  9 | from torch.utils.data import DataLoader,SequentialSampler,TensorDataset
 10 | from prompt_model import Prompt_Based_NER
 11 | 
 12 | def build_model():
 13 |     config=Config()
 14 |     # model_path=os.path.join(config.base_path,"output","20220306205222","best_model.pth")
 15 |     model=Prompt_Based_NER(config)
 16 |     # net_dict = model.state_dict()
 17 |     # best_model = torch.load(model_path)
 18 |     # for k, v in best_model.items():
 19 |     #     name = k[7:] # remove `module.`
 20 |     #     net_dict[name] = v
 21 |     # model.load_state_dict(net_dict)
 22 |     # model.to(device)
 23 |     # torch.save(model.state_dict(),"best_model.pth")
 24 | 
 25 |     model.load_state_dict(torch.load("best_model.pth"))
 26 | 
 27 |     return model
 28 | 
 29 | def predict(input,model):
 30 |     config=Config()
 31 | 
 32 |     use_gpu = False
 33 |     device = torch.device('cuda' if use_gpu else config.device)
 34 |     config.device = device
 35 | 
 36 |     tokenizer = BertTokenizer.from_pretrained(config.model_name_or_path)
 37 |     
 38 |     label2id, id2label = get_label2id_id2label(config.output_path, label_list=config.label_list)
 39 |     
 40 |     def get_input_sequence(input):
 41 |         input_split=input.split("。")
 42 |         input_list=[]
 43 |         max_len=config.max_seq_length-6
 44 |         for input in input_split:
 45 |             if input:
 46 |                 input+="。"
 47 |             while len(input)>0:
 48 |                 input_list.append(input[:max_len])
 49 |                 input=input[max_len:]
 50 |         return input_list
 51 | 
 52 |     class Template():
 53 |         def __init__(self,text) -> None:
 54 |             self.text=text
 55 | 
 56 |     def create_template(input_list):
 57 |         dataset=[]
 58 |         for text_list in input_list:
 59 |             for word in list(text_list):
 60 |                 template=Template(text=text_list+"，"+word+"是")
 61 |                 dataset.append(template)
 62 |         return dataset
 63 | 
 64 |     class InputFeatures(object):
 65 |         """A single set of features of data."""
 66 | 
 67 |         def __init__(self, input_ids, token_type_ids, attention_mask, masked_index):
 68 |             """
 69 |             :param input_ids:       单词在词典中的编码
 70 |             :param attention_mask:  指定 对哪些词 进行self-Attention操作
 71 |             :param token_type_ids:  区分两个句子的编码（上句全为0，下句全为1）
 72 |             :param label_id:        标签的id
 73 |             """
 74 |             self.input_ids = input_ids
 75 |             self.token_type_ids = token_type_ids
 76 |             self.attention_mask = attention_mask
 77 |             self.masked_index = masked_index
 78 | 
 79 |     def convert_examples_to_features(dataset,tokenizer):
 80 |         features=[]
 81 |         max_seq_length = config.max_seq_length
 82 |         for example in tqdm(dataset):
 83 |             if len(example.text) >= max_seq_length - 3:
 84 |                 # -2的原因是因为序列需要加一个句首和句尾标志
 85 |                 example.text = example.text[0:(max_seq_length - 3)]
 86 |             example.text = ["[CLS"]+list(example.text) + ['[MASK]']+["[SEP]"]
 87 | 
 88 |             input_ids = tokenizer.convert_tokens_to_ids(example.text)
 89 |             attention_mask = [1]*len(example.text)
 90 |             masked_index = example.text.index("[MASK]")
 91 | 
 92 |             if len(input_ids) < max_seq_length:
 93 |                 input_ids.extend([0]*(max_seq_length-len(input_ids)))
 94 |                 attention_mask.extend([0]*(max_seq_length-len(attention_mask)))
 95 | 
 96 |             token_type_ids = [0]*max_seq_length
 97 | 
 98 |             assert(len(input_ids) == len(attention_mask) ==
 99 |                     len(token_type_ids) == max_seq_length)
100 | 
101 |             features.append(InputFeatures(input_ids=input_ids,
102 |                                             token_type_ids=token_type_ids,
103 |                                             attention_mask=attention_mask,
104 |                                             masked_index=masked_index))
105 | 
106 |         all_input_ids = torch.tensor(
107 |             [f.input_ids for f in features], dtype=torch.long)
108 |         all_token_type_ids = torch.tensor(
109 |             [f.token_type_ids for f in features], dtype=torch.long)
110 |         all_attention_mask = torch.tensor(
111 |             [f.attention_mask for f in features], dtype=torch.long)
112 |         all_masked_index = torch.tensor(
113 |             [f.masked_index for f in features], dtype=torch.long)
114 |         data = TensorDataset(all_input_ids, all_token_type_ids,
115 |                                 all_attention_mask, all_masked_index)
116 | 
117 |         return features, data
118 | 
119 |     input_list=get_input_sequence(input)
120 |     dataset=create_template(input_list)
121 | 
122 |     predict_features, predict_dataset=convert_examples_to_features(dataset,tokenizer)
123 |     predict_dataloader=DataLoader(predict_dataset,batch_size=config.eval_batch_size,sampler=SequentialSampler(predict_dataset))
124 | 
125 |     model.eval()
126 | 
127 |     pred_labels = []
128 | 
129 |     for b_i, (input_ids, token_type_ids, attention_mask,masked_index) in enumerate(tqdm(predict_dataloader, desc="Predicting")):
130 |         input_ids = input_ids.to(config.device)
131 |         attention_mask = attention_mask.to(config.device)
132 |         token_type_ids = token_type_ids.to(config.device)
133 |         masked_index=masked_index.to(config.device)
134 |         with torch.no_grad():
135 |             output = model.forward(input_ids, token_type_ids, attention_mask,masked_index)                
136 |             logits=torch.argmax(output,dim=-1)
137 | 
138 |         for l in logits:
139 |             pred_labels.append([id2label[l.item()]])
140 | 
141 |     output=[]
142 | 
143 |     for word,label in zip(list("".join(input_list)),pred_labels):
144 |         output.append((word,label[0]))
145 |     print(output)
146 |     return output
147 | 
148 | 
if __name__ == "__main__":
    # Manual check: tag one sample sentence and print one character per line.
    model = build_model()
    sample_text = "我得了肠胃炎，现在要去做穿肠手术。我昨天吃了二甲双胍，今天准备去拍CT。三个月前我被切除了胃和肾脏，现在觉得特别空虚，希望能做一个白细胞检查。"
    for char, tag in predict(sample_text, model):
        print(char, tag)
154 | 
155 | 
156 | 


--------------------------------------------------------------------------------
/processer.py:
--------------------------------------------------------------------------------
  1 | import enum
  2 | import torch
  3 | from utils import load_file, save_pkl, load_pkl
  4 | from config import Config
  5 | from logger import logger as logging
  6 | import os
  7 | from tqdm import trange, tqdm
  8 | from torch.utils.data import TensorDataset
  9 | 
# Module-level configuration object read by dataset_format and
# convert_examples_to_features below.
config = Config()
 11 | 
 12 | 
 13 | def dataset_format(file_path, separater=" "):
 14 |     label_map = {label: i for i, label in enumerate(config.label_list)}
 15 |     sentence_list = []
 16 |     label_list = []
 17 |     sentence = ""
 18 |     labels = []
 19 |     file = load_file(file_path, separater=separater)
 20 |     for unit in file[:]:
 21 |         if len(unit) <= 1:
 22 |             if len(sentence) >= config.min_sequence_length:
 23 |                 sentence_list.append(sentence[:58])
 24 |                 sentence = ""
 25 |                 label_list.append(labels[:58])
 26 |                 labels = []
 27 |             continue
 28 |         word, label = unit
 29 |         if word == "，" or word == "。":
 30 |             if len(sentence) >= config.min_sequence_length:
 31 |                 sentence_list.append(sentence[:58])
 32 |                 sentence = ""
 33 |                 label_list.append(labels[:58])
 34 |                 labels = []
 35 |         else:
 36 |             sentence += word
 37 |             labels.append(label_map[label])
 38 |     return sentence_list, label_list
 39 | 
 40 | 
 41 | class Template():
 42 |     def __init__(self, text, label) -> None:
 43 |         self.text = text
 44 |         self.label = label
 45 | 
 46 | 
 47 | def create_template(all_texts, all_labels):
 48 |     dataset = []
 49 |     for item in zip(all_texts, all_labels):
 50 |         texts, labels = item
 51 |         for word, label in zip(list(texts), labels):
 52 |             template = Template(text=texts+"，"+word+"是", label=label)
 53 |             dataset.append(template)
 54 |     return dataset
 55 | 
 56 | 
 57 | class InputFeatures(object):
 58 |     """A single set of features of data."""
 59 | 
 60 |     def __init__(self, input_ids, token_type_ids, attention_mask, masked_index, label_id, ori_label):
 61 |         """
 62 |         :param input_ids:       单词在词典中的编码
 63 |         :param attention_mask:  指定 对哪些词 进行self-Attention操作
 64 |         :param token_type_ids:  区分两个句子的编码（上句全为0，下句全为1）
 65 |         :param label_id:        标签的id
 66 |         """
 67 |         self.input_ids = input_ids
 68 |         self.token_type_ids = token_type_ids
 69 |         self.attention_mask = attention_mask
 70 |         self.label_id = label_id
 71 |         self.ori_label = ori_label
 72 |         self.masked_index = masked_index
 73 | 
 74 | 
 75 | def convert_examples_to_features(dataset, tokenizer):
 76 |     features = []
 77 |     label_map = {label: i for i, label in enumerate(config.label_list)}
 78 |     max_seq_length = config.max_seq_length
 79 |     for example in tqdm(dataset):
 80 | 
 81 |         label = [0]*len(config.label_list)
 82 |         label[example.label] = 1
 83 | 
 84 |         # for i, word in enumerate(example_text):
 85 |         #     token = tokenizer.tokenize(word)
 86 |         #     tokens.extend(token)
 87 |         #     ori_tokens.append(word)
 88 | 
 89 |         if len(example.text) >= max_seq_length - 8:
 90 |             # -2的原因是因为序列需要加一个句首和句尾标志
 91 |             example.text = example.text[0:(max_seq_length - 9)]
 92 | 
 93 |         example.text = ["[CLS"]+list(example.text) + ['[MASK]']+["[SEP]"]
 94 | 
 95 |         input_ids = tokenizer.convert_tokens_to_ids(example.text)
 96 |         attention_mask = [1]*len(example.text)
 97 |         masked_index = example.text.index("[MASK]")
 98 | 
 99 |         if len(input_ids) < max_seq_length:
100 |             input_ids.extend([0]*(max_seq_length-len(input_ids)))
101 |             attention_mask.extend([0]*(max_seq_length-len(attention_mask)))
102 | 
103 |         token_type_ids = [0]*max_seq_length
104 | 
105 |         assert(len(input_ids) == len(attention_mask) ==
106 |                 len(token_type_ids) == max_seq_length)
107 | 
108 |         features.append(InputFeatures(input_ids=input_ids,
109 |                                         token_type_ids=token_type_ids,
110 |                                         attention_mask=attention_mask,
111 |                                         label_id=label,
112 |                                         ori_label=example.label,
113 |                                         masked_index=masked_index))
114 | 
115 |     all_input_ids = torch.tensor(
116 |         [f.input_ids for f in features], dtype=torch.long)
117 |     all_token_type_ids = torch.tensor(
118 |         [f.token_type_ids for f in features], dtype=torch.long)
119 |     all_attention_mask = torch.tensor(
120 |         [f.attention_mask for f in features], dtype=torch.long)
121 |     all_label_ids = torch.tensor(
122 |         [f.label_id for f in features], dtype=torch.float)
123 |     all_ori_labels = torch.tensor(
124 |         [f.ori_label for f in features], dtype=torch.long)
125 |     all_masked_index = torch.tensor(
126 |         [f.masked_index for f in features], dtype=torch.long)
127 |     data = TensorDataset(all_input_ids, all_token_type_ids,
128 |                             all_attention_mask, all_label_ids, all_ori_labels, all_masked_index)
129 |     return features, data
130 | 
131 | 
def get_labels(config: Config):
    """Build the label -> id mapping from ``config.label_list`` and persist it.

    The mapping is written to ``label_list.pkl`` in the working directory.

    :param config: configuration providing ``label_list``
    :return: dict mapping each label to its index in ``config.label_list``
    :raises ValueError: if ``config.label_list`` is empty
    """
    labels = {label: i for i, label in enumerate(config.label_list)}
    if len(labels) == 0:
        # Previously the exception was constructed but never raised.
        raise ValueError("loading labels error, labels type not found in data file: {}".format(
            config.output_path))

    label_pkl_path = "label_list.pkl"
    save_pkl(labels, label_pkl_path)
    logging.info(f"loading labels info from {label_pkl_path}")
    print(labels)
    return labels
158 | 
159 | 
def get_label2id_id2label(output_path, label_list):
    """Return the label -> id mapping and its inverse.

    The mapping is read from ``label2id.pkl`` in the working directory when
    that file exists; otherwise it is built from ``label_list`` and saved there.

    :param output_path: not used by this function
    :param label_list: labels, indexed by position, used when no cache file exists
    :return: ``(label2id, id2label)``
    """
    label2id_path = "label2id.pkl"
    if not os.path.exists(label2id_path):
        label2id = {name: idx for idx, name in enumerate(label_list)}
        save_pkl(label2id, label2id_path)
    else:
        label2id = load_pkl(label2id_path)

    id2label = dict((idx, name) for name, idx in label2id.items())
    return label2id, id2label
176 | 


--------------------------------------------------------------------------------
/conlleval.py:
--------------------------------------------------------------------------------
  1 | # Python version of the evaluation script from CoNLL'00-
  2 | # Originates from: https://github.com/spyysalo/conlleval.py
  3 | 
  4 | 
  5 | # Intentional differences:
  6 | # - accept any space as delimiter by default
  7 | # - optional file argument (default STDIN)
  8 | # - option to set boundary (-b argument)
  9 | # - LaTeX output (-l argument) not supported
 10 | # - raw tags (-r argument) not supported
 11 | 
 12 | # add function :evaluate(predicted_label, ori_label): which will not read from file
 13 | 
 14 | import codecs
 15 | import re
 16 | import sys
 17 | from collections import defaultdict, namedtuple
 18 | 
# Sentinel delimiter value: when the delimiter option equals this, evaluate()
# splits each line with str.split() (any whitespace).
ANY_SPACE = '<SPACE>'
 20 | 
 21 | 
 22 | class FormatError(Exception):
 23 |     pass
 24 | 
 25 | 
# Result record; the field names presumably stand for true positives, false
# positives, false negatives, precision, recall and F-score — confirm
# against the metrics() function.
Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
 27 | 
 28 | 
 29 | class EvalCounts(object):
 30 |     def __init__(self):
 31 |         self.correct_chunk = 0  # number of correctly identified chunks
 32 |         self.correct_tags = 0  # number of correct chunk tags
 33 |         self.found_correct = 0  # number of chunks in corpus
 34 |         self.found_guessed = 0  # number of identified chunks
 35 |         self.token_counter = 0  # token counter (ignores sentence breaks)
 36 | 
 37 |         # counts by type
 38 |         self.t_correct_chunk = defaultdict(int)
 39 |         self.t_found_correct = defaultdict(int)
 40 |         self.t_found_guessed = defaultdict(int)
 41 | 
 42 | 
 43 | def parse_args(argv):
 44 |     import argparse
 45 |     parser = argparse.ArgumentParser(
 46 |         description='evaluate tagging results using CoNLL criteria',
 47 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter
 48 |     )
 49 |     arg = parser.add_argument
 50 |     arg('-b', '--boundary', metavar='STR', default='-X-',
 51 |         help='sentence boundary')
 52 |     arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
 53 |         help='character delimiting items in input')
 54 |     arg('-o', '--otag', metavar='CHAR', default='O',
 55 |         help='alternative outside tag')
 56 |     arg('file', nargs='?', default=None)
 57 |     return parser.parse_args(argv)
 58 | 
 59 | 
 60 | def parse_tag(t):
 61 |     m = re.match(r'^([^-]*)-(.*)$', t)
 62 |     return m.groups() if m else (t, '')
 63 | 
 64 | 
def evaluate(iterable, options=None):
    """Accumulate CoNLL chunk-evaluation counts over tagged lines.

    Each line is split into columns: the first column is the token, the
    second-to-last is the gold tag and the last is the predicted tag.  Empty
    lines, and lines whose first column equals ``options.boundary``, are
    treated as sentence boundaries.

    :param iterable: iterable of text lines (for example an open file).
    :param options: parsed options providing ``delimiter`` and ``boundary``;
        when None the defaults from ``parse_args([])`` are used.
    :return: an ``EvalCounts`` instance holding the accumulated tallies.
    :raises FormatError: if a non-empty line has a different number of
        columns than the first line read, or fewer than three columns.
    """
    if options is None:
        options = parse_args([])  # use defaults

    counts = EvalCounts()
    num_features = None  # number of features per line
    in_correct = False  # currently processed chunks is correct until now
    last_correct = 'O'  # previous chunk tag in corpus
    last_correct_type = ''  # type of previous chunk tag in corpus
    last_guessed = 'O'  # previously identified chunk tag
    last_guessed_type = ''  # type of previously identified chunk tag

    for line in iterable:
        line = line.rstrip('\r\n')

        if options.delimiter == ANY_SPACE:
            features = line.split()
        else:
            features = line.split(options.delimiter)

        # The first line fixes the expected column count; empty lines are
        # exempt from the check.
        if num_features is None:
            num_features = len(features)
        elif num_features != len(features) and len(features) != 0:
            raise FormatError('unexpected number of features: %d (%d)' %
                              (len(features), num_features))

        # Sentence boundaries are replaced by a synthetic all-'O' row.
        if len(features) == 0 or features[0] == options.boundary:
            features = [options.boundary, 'O', 'O']
        if len(features) < 3:
            raise FormatError('unexpected number of features in line %s' % line)

        # Last column: predicted tag; the one before it: gold tag.
        guessed, guessed_type = parse_tag(features.pop())
        correct, correct_type = parse_tag(features.pop())
        first_item = features.pop(0)

        if first_item == options.boundary:
            guessed = 'O'

        end_correct = end_of_chunk(last_correct, correct,
                                   last_correct_type, correct_type)
        end_guessed = end_of_chunk(last_guessed, guessed,
                                   last_guessed_type, guessed_type)
        start_correct = start_of_chunk(last_correct, correct,
                                       last_correct_type, correct_type)
        start_guessed = start_of_chunk(last_guessed, guessed,
                                       last_guessed_type, guessed_type)

        if in_correct:
            # A matching chunk is counted once both the gold and the
            # predicted chunk end at the same position with the same type.
            if (end_correct and end_guessed and
                    last_guessed_type == last_correct_type):
                in_correct = False
                counts.correct_chunk += 1
                counts.t_correct_chunk[last_correct_type] += 1
            elif (end_correct != end_guessed or guessed_type != correct_type):
                in_correct = False

        # A candidate match begins when both chunks start here with one type.
        if start_correct and start_guessed and guessed_type == correct_type:
            in_correct = True

        if start_correct:
            counts.found_correct += 1
            counts.t_found_correct[correct_type] += 1
        if start_guessed:
            counts.found_guessed += 1
            counts.t_found_guessed[guessed_type] += 1
        # Boundary rows do not contribute to the token-level tallies.
        if first_item != options.boundary:
            if correct == guessed and guessed_type == correct_type:
                counts.correct_tags += 1
            counts.token_counter += 1

        last_guessed = guessed
        last_correct = correct
        last_guessed_type = guessed_type
        last_correct_type = correct_type

    # A chunk still matching when the input ends is counted as correct.
    if in_correct:
        counts.correct_chunk += 1
        counts.t_correct_chunk[last_correct_type] += 1

    return counts
145 | 
146 | 
def uniq(iterable):
    """Return the items of *iterable* as a list, keeping first occurrences only.

    :param iterable: iterable of hashable items.
    :return: list of distinct items in first-seen order.
    """
    seen = set()
    ordered = []
    for item in iterable:
        if item not in seen:
            seen.add(item)
            ordered.append(item)
    return ordered
150 | 
151 | 
def calculate_metrics(correct, guessed, total):
    """Derive precision, recall and F-score from chunk counts.

    :param correct: number of correctly identified chunks (true positives).
    :param guessed: number of identified chunks.
    :param total: number of chunks in the corpus.
    :return: a ``Metrics`` tuple ``(tp, fp, fn, prec, rec, fscore)``; each
        score is 0 when its denominator is zero.
    """
    true_pos = correct
    false_pos = guessed - correct
    false_neg = total - correct

    predicted = true_pos + false_pos
    actual = true_pos + false_neg

    precision = 1. * true_pos / predicted if predicted != 0 else 0
    recall = 1. * true_pos / actual if actual != 0 else 0

    score_sum = precision + recall
    fscore = 2 * precision * recall / score_sum if score_sum != 0 else 0

    return Metrics(true_pos, false_pos, false_neg, precision, recall, fscore)
158 | 
159 | 
def metrics(counts):
    """Compute overall and per-type metrics from accumulated counts.

    :param counts: an ``EvalCounts`` instance.
    :return: ``(overall, by_type)`` where ``overall`` is a ``Metrics`` tuple
        and ``by_type`` maps every chunk type seen in the gold or predicted
        chunks to its own ``Metrics`` tuple.
    """
    overall = calculate_metrics(
        counts.correct_chunk, counts.found_guessed, counts.found_correct
    )

    # Gold types first, then predicted-only types, each listed once.
    type_names = uniq(
        list(counts.t_found_correct) + list(counts.t_found_guessed)
    )
    by_type = {
        name: calculate_metrics(
            counts.t_correct_chunk[name],
            counts.t_found_guessed[name],
            counts.t_found_correct[name],
        )
        for name in type_names
    }
    return overall, by_type
171 | 
172 | 
def report(counts, out=None):
    """Write the evaluation summary for *counts* to a stream.

    :param counts: an ``EvalCounts`` instance.
    :param out: object with a ``write`` method; ``sys.stdout`` when None.
    """
    stream = sys.stdout if out is None else out

    overall, by_type = metrics(counts)

    stream.write('processed %d tokens with %d phrases; ' %
                 (counts.token_counter, counts.found_correct))
    stream.write('found: %d phrases; correct: %d.\n' %
                 (counts.found_guessed, counts.correct_chunk))

    # The overall scores are only written when at least one token was seen.
    if counts.token_counter > 0:
        accuracy = 100. * counts.correct_tags / counts.token_counter
        stream.write('accuracy: %6.2f%%; ' % accuracy)
        stream.write('precision: %6.2f%%; ' % (100. * overall.prec))
        stream.write('recall: %6.2f%%; ' % (100. * overall.rec))
        stream.write('FB1: %6.2f\n' % (100. * overall.fscore))

    # One line per chunk type, in sorted type order.
    for type_name in sorted(by_type):
        scores = by_type[type_name]
        stream.write('%17s: ' % type_name)
        stream.write('precision: %6.2f%%; ' % (100. * scores.prec))
        stream.write('recall: %6.2f%%; ' % (100. * scores.rec))
        stream.write('FB1: %6.2f  %d\n' %
                     (100. * scores.fscore, counts.t_found_guessed[type_name]))
197 | 
198 | 
def report_notprint(counts, out=None):
    """Build the evaluation summary as a list of text lines.

    Produces the same text as ``report`` but returns it instead of writing
    it to a stream.

    :param counts: an ``EvalCounts`` instance.
    :param out: ignored; kept only so existing callers that pass it still
        work (nothing is written by this function).
    :return: list of newline-terminated strings: the totals line, the overall
        scores line (only when at least one token was counted), then one line
        per chunk type in sorted order.
    """
    overall, by_type = metrics(counts)

    c = counts
    final_report = [
        'processed %d tokens with %d phrases; ' %
        (c.token_counter, c.found_correct)
        + 'found: %d phrases; correct: %d.\n' %
        (c.found_guessed, c.correct_chunk)
    ]

    if c.token_counter > 0:
        final_report.append(
            'accuracy: %6.2f%%; ' % (100. * c.correct_tags / c.token_counter)
            + 'precision: %6.2f%%; ' % (100. * overall.prec)
            + 'recall: %6.2f%%; ' % (100. * overall.rec)
            + 'FB1: %6.2f\n' % (100. * overall.fscore)
        )

    for i, m in sorted(by_type.items()):
        final_report.append(
            '%17s: ' % i
            + 'precision: %6.2f%%; ' % (100. * m.prec)
            + 'recall: %6.2f%%; ' % (100. * m.rec)
            + 'FB1: %6.2f  %d\n' % (100. * m.fscore, c.t_found_guessed[i])
        )
    return final_report
231 | 
232 | 
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk ended between the previous and the current word.

    :param prev_tag: chunk tag of the previous word.
    :param tag: chunk tag of the current word.
    :param prev_type: chunk type of the previous word.
    :param type_: chunk type of the current word.
    :return: True if a chunk ended, False otherwise.
    """
    # 'E' and 'S' always close a chunk; '[' and ']' chunks are assumed to
    # have length 1.
    if prev_tag in ('E', 'S', ']', '['):
        return True

    # A chunk opened or continued by B/I ends when a new chunk starts or
    # the current word is outside.
    if prev_tag in ('B', 'I') and tag in ('B', 'S', 'O'):
        return True

    # Otherwise a chunk ends only when the previous word was inside one and
    # the type changes.
    return prev_tag not in ('O', '.') and prev_type != type_
256 | 
257 | 
def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Tell whether a chunk started between the previous and the current word.

    :param prev_tag: chunk tag of the previous word.
    :param tag: chunk tag of the current word.
    :param prev_type: chunk type of the previous word.
    :param type_: chunk type of the current word.
    :return: True if a chunk started, False otherwise.
    """
    # 'B' and 'S' always open a chunk; '[' and ']' chunks are assumed to
    # have length 1.
    if tag in ('B', 'S', '[', ']'):
        return True

    # An E/I tag opens a chunk when the previous word closed one or was
    # outside.
    if prev_tag in ('E', 'S', 'O') and tag in ('E', 'I'):
        return True

    # Otherwise a chunk starts only when the current word is inside one and
    # the type changes.
    return tag not in ('O', '.') and prev_type != type_
281 | 
282 | 
def return_report(input_file):
    """Evaluate a UTF-8 encoded tagged file and return its report lines.

    :param input_file: path of the file to evaluate.
    :return: the list of report lines built by ``report_notprint``.
    """
    with codecs.open(input_file, "r", "utf8") as tagged_lines:
        tallies = evaluate(tagged_lines)
    report_lines = report_notprint(tallies)
    return report_lines
287 | 
288 | 
def main(argv):
    """Command-line entry point: evaluate the input and print the report.

    :param argv: full argument vector; ``argv[0]`` (the program name) is
        skipped.
    :return: None, so ``sys.exit(main(...))`` exits with status 0.
    """
    args = parse_args(argv[1:])

    if args.file is None:
        counts = evaluate(sys.stdin, args)
    else:
        # Read the file as UTF-8, matching return_report, instead of relying
        # on the platform's default encoding.
        with open(args.file, encoding="utf-8") as f:
            counts = evaluate(f, args)
    report(counts)
298 | 
299 | 
300 | if __name__ == '__main__':
301 |     sys.exit(main(sys.argv))
302 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | from transformers import BertForMaskedLM, BertTokenizer,AdamW, get_linear_schedule_with_warmup
  2 | from processer import *
  3 | import os
  4 | import numpy as np
  5 | import pandas as pd
  6 | from logger import logger as logging
  7 | from config import Config
  8 | from prompt_model import Prompt_Based_NER,evaluate
  9 | from torch.utils.data import DataLoader,RandomSampler,SequentialSampler
 10 | from tqdm import trange,tqdm
 11 | import torch
 12 | import random
 13 | from utils import save_pkl,load_pkl
 14 | from torch.optim import Adam
 15 | from torch import nn
 16 | 
# Seed torch (CPU and CUDA) and Python's `random` with a fixed value.
torch.manual_seed(1)
torch.cuda.manual_seed(1)
random.seed(1)

# os.environ["CUDA_VISIBLE_DEVICES"]="0"

config=Config()

# Create the output directory if it does not exist yet.
if not os.path.exists(config.output_path):
    os.makedirs(config.output_path)

# Dataset files are resolved relative to the current working directory.
CURRENT_PATH=os.getcwd()
TRAIN_PATH=os.path.join(CURRENT_PATH,'data/train.txt')
EVAL_PATH=os.path.join(CURRENT_PATH,"data/eval.txt")
TEST_PATH=os.path.join(CURRENT_PATH,"data/test.txt")

# CUDA is used only when it is both available and enabled in the config;
# otherwise the device named in the config is used.  The resolved device is
# written back onto the config object.
use_gpu = torch.cuda.is_available() and config.use_gpu
device = torch.device('cuda' if use_gpu else config.device)
config.device = device
n_gpu = torch.cuda.device_count()
logging.info(f"available device: {device}，count_gpu: {n_gpu}")

tokenizer = BertTokenizer.from_pretrained(config.model_name_or_path)
logging.info(f"=================Tokenizer {config.model_name_or_path} Load Successfully=================")

# The label list is also stored on the config object.
label_list = get_labels(config=config)
config.label_list = label_list
num_labels = len(label_list)
logging.info(f"loading labels successful! the size is {num_labels}, label is: {','.join(list(label_list))}")

# Forward (label -> id) and reverse (id -> label) lookup tables.
label2id, id2label = get_label2id_id2label(config.output_path, label_list=label_list)
logging.info("loading label2id and id2label dictionary successful!")
 49 | 
 50 | if config.do_train:
 51 |     logging.info(f"=================Start Loading Train Dataset=================")
 52 |     train_text,train_label=dataset_format(TRAIN_PATH,separater=" ")
 53 |     assert(len(train_text)==len(train_label))
 54 |     train_length_list=[len(i) for i in train_text]
 55 |     logging.info(f"Discription of Train Dataset: {pd.DataFrame(train_length_list).describe()}")
 56 | 
 57 |     logging.info(f"=================Start Creating Train Templates=================")
 58 |     train_dataset=create_template(train_text,train_label)
 59 | 
 60 |     logging.info(f"=================Preview 3 Examples=================")
 61 |     for i in range(5):
 62 |         print(train_dataset[i].text+"[MASK]",train_dataset[i].label)
 63 | 
 64 |     logging.info(f"=================Convert Train Examples=================")
 65 |     # if os.path.exists("train_dataset.pkl"):
 66 |     #     train_dataset=load_pkl("train_dataset.pkl")
 67 |     # else:
 68 |     train_features,train_dataset=convert_examples_to_features(train_dataset,tokenizer)
 69 |     save_pkl(train_dataset,"train_dataset.pkl")
 70 |     train_dataloader=DataLoader(train_dataset,batch_size=config.train_batch_size,sampler=RandomSampler(train_dataset))
 71 | 
# Build the evaluation dataloader (only when evaluation is enabled).
if config.do_eval:
    logging.info(f"=================Start Loading Eval Dataset=================")
    eval_text,eval_label=dataset_format(EVAL_PATH,separater=" ")
    assert(len(eval_text)==len(eval_label))

    logging.info(f"=================Start Creating Eval Templates=================")
    eval_dataset=create_template(eval_text,eval_label)

    logging.info(f"=================Convert Eval Examples=================")
    # Features are rebuilt on every run; the cache lookup below is disabled.
    # if os.path.exists("eval_dataset.pkl"):
    #     eval_dataset=load_pkl("eval_dataset.pkl")
    # else:
    eval_features,eval_dataset=convert_examples_to_features(eval_dataset,tokenizer)
    save_pkl(eval_dataset,"eval_dataset.pkl")
    # Sequential sampling keeps the evaluation examples in dataset order.
    eval_dataloader=DataLoader(eval_dataset,batch_size=config.eval_batch_size,sampler=SequentialSampler(eval_dataset))
 87 | 
# Build the test dataloader (only when testing is enabled).
if config.do_test:
    logging.info(f"=================Start Loading Test Dataset=================")
    test_text,test_label=dataset_format(TEST_PATH,separater=" ")
    assert(len(test_text)==len(test_label))

    logging.info(f"=================Start Creating Test Templates=================")
    test_dataset=create_template(test_text,test_label)

    logging.info(f"=================Convert Test Examples=================")
    # Unlike the train/eval branches, an existing test_dataset.pkl is reused
    # when present; the templates created above are then replaced by it.
    # NOTE(review): a stale pickle would silently be used if data/test.txt
    # changed — confirm this caching is intended.
    if os.path.exists("test_dataset.pkl"):
        test_dataset=load_pkl("test_dataset.pkl")
    else:
        test_features,test_dataset=convert_examples_to_features(test_dataset,tokenizer)
        save_pkl(test_dataset,"test_dataset.pkl")
    test_dataloader=DataLoader(test_dataset,batch_size=config.eval_batch_size,sampler=SequentialSampler(test_dataset))
103 | 
model = Prompt_Based_NER(config).to(device)

# Wrap the model in DataParallel when more than one GPU is in use.
if use_gpu and n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Two parameter groups: weight decay 0.01 for every parameter except those
# whose name contains 'bias' or 'LayerNorm.weight', which get no decay.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0}
]
# NOTE(review): the optimizer constructed here is torch.optim.Adam, while the
# log message below says AdamW — confirm which one is intended.
optimizer = Adam(optimizer_grouped_parameters, lr=config.learning_rate, eps=config.adam_epsilon)

# Total number of scheduler steps: batches per epoch divided by the gradient
# accumulation factor, times the number of epochs.
# NOTE(review): train_dataloader and train_dataset are only defined when
# config.do_train is set, but they are used here unconditionally.
t_total = len(train_dataloader) // config.gradient_accumulation_steps * config.num_train_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup_steps,
                                                    num_training_steps=t_total)
CrossEntropyloss=nn.CrossEntropyLoss()

logging.info("loading AdamW optimizer、Warmup LinearSchedule and calculate optimizer parameter successful!")

logging.info("====================== Running training ======================")
logging.info(
    f"Num Examples:  {len(train_dataset)}, Num Batch Step: {len(train_dataloader)}, "
    f"Num Epochs: {config.num_train_epochs}, Num scheduler steps: {t_total}")
129 | 
model.train()
global_step, tr_loss, logging_loss, best_f1 = 0, 0.0, 0.0, 0.0
for ep in trange(int(config.num_train_epochs), desc="Epoch"):
    logging.info(f"#######[Epoch: {ep}/{int(config.num_train_epochs)}]#######")
    # Switch back to training mode at the start of every epoch.
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="DataLoader")):
        if step%100==0:
            logging.info(f"####[Step: {step}/{len(train_dataloader)}]####")

        batch = tuple(t.to(device) for t in batch)
        input_ids, token_type_ids, attention_mask, label_ids,ori_tokens,mask_index = batch

        outputs = model(input_ids, token_type_ids, attention_mask,mask_index)
        loss = CrossEntropyloss(outputs,label_ids)

        if use_gpu and n_gpu > 1:
            # mean() to average on multi-gpu.
            loss = loss.mean()

        # Scale the loss so the accumulated gradient matches one full batch.
        if config.gradient_accumulation_steps > 1:
            loss = loss / config.gradient_accumulation_steps

        # Back-propagation.
        loss.backward()
        tr_loss += loss.item()

        # One optimizer update per `gradient_accumulation_steps` batches;
        # the number of updates corresponds to t_total above.
        if (step + 1) % config.gradient_accumulation_steps == 0:
            # Update the parameters.
            optimizer.step()
            scheduler.step()
            # Reset the gradients.
            model.zero_grad()
            global_step += 1

            # Log the running loss only right after an optimizer update, so
            # it is not logged repeatedly while global_step is unchanged
            # (e.g. before the first update or between accumulation steps).
            if global_step%config.logging_steps == 0:
                tr_loss_avg = tr_loss  / config.logging_steps
                tr_loss=0
                logging.info(f'tr_loss_avg= {tr_loss_avg}')

    if config.do_eval:
        logging.info("====================== Running Eval ======================")
        all_ori_tokens_eval = [token for token in list("".join(eval_text))]
        overall, by_type = evaluate(config, eval_dataloader, model, id2label, all_ori_tokens_eval)

        f1_score = overall.fscore

        # Save the best-performing model.
        if f1_score > best_f1:
            logging.info(f"******** the best f1 is {f1_score}, save model !!! ********")
            best_f1 = f1_score
            # Take care of distributed/parallel training: unwrap the
            # DataParallel container if there is one.
            model_to_save = model.module if hasattr(model, 'module') else model

            # Checkpoint of the (possibly wrapped) model in the output folder.
            torch.save(model.state_dict(),config.output_path+"/best_model.pth")

            # Checkpoint of the unwrapped model in the working directory, so
            # its keys never carry the `module.` prefix.  Saving the unwrapped
            # state dict replaces blindly cutting the first seven characters
            # off every key, which corrupted the keys of a non-parallel model.
            torch.save(model_to_save.state_dict(),"best_model.pth")
            # model_to_save.save_pretrained(config.output_path)
            tokenizer.save_pretrained(config.output_path)

            # Good practice: save your training arguments together with the trained model
            torch.save(config, os.path.join(config.output_path, 'training_config.bin'))
            torch.save(model, os.path.join(config.output_path, 'ner_model.ckpt'))
            logging.info("training_config.bin and ner_model.ckpt save successful!")
logging.info("NER Prompt model training successful!!!")
logging.info(f"Best F1 is {best_f1}!")
203 | 
204 | 
# Final evaluation on the test set, followed by a dump of per-example labels.
if config.do_test:

    all_ori_tokens_test = [token for token in list("".join(test_text))]
    overall, by_type = evaluate(config, test_dataloader, model, id2label, all_ori_tokens_test)

    logging.info("====================== Running test ======================")
    logging.info(f"Num Examples:  {len(test_dataset)}, Batch size: {config.eval_batch_size}")
    f1_score = overall.fscore
    logging.info(f"**********Test F1 is {f1_score}")

    model.eval()

    pred_labels = []
    ori_labels=[]

    for b_i, (input_ids, token_type_ids, attention_mask, label_ids,ori_tokens,masked_index) in enumerate(tqdm(test_dataloader, desc="Evaluating")):
        input_ids = input_ids.to(config.device)
        attention_mask = attention_mask.to(config.device)
        token_type_ids = token_type_ids.to(config.device)
        label_ids = label_ids.to(config.device)
        masked_index=masked_index.to(config.device)
        with torch.no_grad():
            output = model.forward(input_ids, token_type_ids, attention_mask,masked_index)
            # Despite the name, `logits` holds the argmax indices.
            logits=torch.argmax(output,dim=-1)

        # One predicted label per element of `logits`.
        for l in logits:
            pred_labels.append([id2label[l.item()]])

        # The gold label id is the position of the first 1 in each row.
        # presumably label_ids rows are one-hot — TODO confirm
        for l in label_ids:
            ori_labels.append([id2label[l.tolist().index(1)]])

        # NOTE(review): pred_labels is appended to a second time for the same
        # `logits`, and each element is sliced with [1:] although .item() was
        # called on it above — this looks like leftover code; confirm.
        for l in logits:
            pred_label = []
            for idx in l[1:]:
                pred_label.append(id2label[idx])
            pred_labels.append(pred_label)

    # NOTE(review): `ori_tokens` here is the value left over from the last
    # loop iteration, not an accumulation over all batches — confirm.
    assert len(pred_labels) == len(ori_tokens) == len(ori_labels)

    with open(os.path.join(config.output_path, "token_labels_test.txt"), "w", encoding="utf-8") as f:
        # The loop variables rebind the outer `ori_tokens` / `ori_labels` names.
        for ori_tokens, ori_labels, prel in zip(ori_tokens, ori_labels, pred_labels):
            for ot, ol, pl in zip(ori_tokens, ori_labels, prel):
                # Special tokens are not written out.
                if ot in ["[CLS]", "[SEP]"]:
                    continue
                else:
                    f.write(f"{ot} {ol} {pl}\n")
            # Blank line between examples.
            f.write("\n")
252 | 
253 | 
254 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------