├── model.png
├── config
│   ├── bio_config
│   └── bio_config_adv
├── README.md
├── train.py
├── data_parsers.py
├── data_build.py
├── model.py
├── data_utils.py
└── eval.py
/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NeilGY/NER_entityRelationExtration/HEAD/model.png
--------------------------------------------------------------------------------
/config/bio_config:
--------------------------------------------------------------------------------
1 | # pretrained embeddings
2 | filename_embeddings =data/vecs.lc.over100freq.txt
3 | 
4 | # dataset
5 | filename_dev = "data/dev.txt"
6 | filename_test = "data/test.txt"
7 | filename_train = "data/train.txt"
8 | 
9 | # training
10 | nepochs = 150
11 | optimizer = Adam
12 | activation = tanh
13 | learning_rate = 1e-3
14 | gradientClipping = False # if False, no clipping
15 | nepoch_no_imprv = 30
16 | use_dropout = True
17 | ner_loss = crf # or softmax
18 | use_chars = True
19 | use_adversarial = False
20 | ner_classes = BIO #or EC for entity classification
21 | 
22 | #hyperparameters
23 | dropout_embedding = 0.9
24 | dropout_lstm = 0.9
25 | dropout_lstm_output = 0.9
26 | dropout_fcl_ner = 1
27 | dropout_fcl_rel = 1
28 | hidden_size_lstm = 64
29 | hidden_size_n1 = 64
30 | #hidden_size_n2 = 32
31 | num_lstm_layers = 3
32 | char_embeddings_size = 25
33 | hidden_size_char = 25
34 | label_embeddings_size = 0 #if 0, no label embeddings
35 | alpha = 0.01
36 | 
37 | #evaluation
38 | evaluation_method = strict # alternatives "boundaries" and "relaxed"
39 | root_node = False
40 | 
--------------------------------------------------------------------------------
/config/bio_config_adv:
--------------------------------------------------------------------------------
1 | # pretrained embeddings
2 | filename_embeddings =data/CoNLL04/vecs.lc.over100freq.txt
3 | 
4 | # dataset
5 | filename_dev = "data/CoNLL04/dev.txt"
6 | filename_test = "data/CoNLL04/test.txt"
7 | filename_train = "data/CoNLL04/train.txt"
8 | 
9 | # training
10 | nepochs = 130
11 | optimizer = Adam
12 | activation = tanh
13 | learning_rate = 1e-3
14 | gradientClipping = False # if False, no clipping
15 | nepoch_no_imprv = 40
16 | use_dropout = True
17 | ner_loss = crf # or softmax
18 | use_chars = True
19 | use_adversarial = True
20 | ner_classes = BIO #or EC for entity classification
21 | 
22 | #hyperparameters
23 | dropout_embedding = 0.9
24 | dropout_lstm = 0.9
25 | dropout_lstm_output = 0.9
26 | dropout_fcl_ner = 1
27 | dropout_fcl_rel = 1
28 | hidden_size_lstm = 64
29 | hidden_size_n1 = 64
30 | #hidden_size_n2 = 32
31 | num_lstm_layers = 3
32 | char_embeddings_size = 25
33 | hidden_size_char = 25
34 | label_embeddings_size = 0 #if 0, no label embeddings
35 | alpha = 0.01
36 | 
37 | #evaluation
38 | evaluation_method = strict # alternatives "boundaries" and "relaxed"
39 | root_node = False
40 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Model diagram: see model.png in the project root.
2 | Please refer to the diagram when reading the code.
3 | 
4 | 1. Overview of the pipeline:
5 | word/char embedding (feature embedding layer):
6 | Character-level information is added on top of the word-level vectors; such embeddings can capture morphological features like prefixes and suffixes.
7 | Each word is first mapped to a word vector with a lookup table pre-trained by the skip-gram word2vec model; every character of the word is also represented by a vector, and the character vectors of a word are fed into a BiLSTM.
8 | The two final states (forward and backward) are concatenated with the word vector to obtain the word's embedding.
9 | BiLSTM layer:
10 | The embeddings of the words in a sentence are fed in, and the forward and backward hidden states at each position are concatenated to obtain a new encoded sequence.
11 | CRF layer:
12 | Uses the BIO tagging scheme and a CRF to model the dependencies between labels:
13 | compute each word's score for every tag,
14 | compute the probability of the sentence's tag sequence,
15 | and use the Viterbi algorithm to obtain the highest-scoring tag sequence.
16 | For entity recognition, the network and CRF parameters are optimized by minimizing the NER loss (the negative CRF log-likelihood, or the softmax cross-entropy when ner_loss = softmax); at test time the Viterbi algorithm returns the highest-scoring tag sequence.
17 | Label embedding:
18 | Embeddings of the entity labels. The gold labels are used during training and the predicted labels at test time.
19 | Heads/Relations layer:
20 | Its input is the concatenation of the BiLSTM hidden state and the label embedding. Multiple heads can be predicted for each token, and heads and relations are decided jointly, rather than first predicting the heads and then classifying the relations with a separate classifier.
21 | Tagging strategy: the CRF layer outputs the entity recognition result under the BIO scheme; the heads/relations layer outputs the tail word of the related entity and the relation only when a token is related to another entity; when a token has no relation to any other entity, its head is the token itself and the relation is N.
22 | Adversarial training (AT): makes the classifier more robust to noisy data by mixing the original samples with adversarial examples.
23 | 
24 | 
25 | Pre-trained word vectors:
26 | Link: https://pan.baidu.com/s/1P_QtMKKhUdtc0XfOnpSBOw  extraction code: 45ic
27 | 
28 | 2. Data format:
29 | #doc 5121   (header line carrying the document/file name)
30 | ['token_id', 'token', "BIO", "relation", 'head']
31 | token_id: index of the token within its document
32 | token: the word itself
33 | BIO: annotated entity type
34 | relation: entity relation(s) of the token
35 | head: token index of the entity that each relation points to
36 | 
37 | data_parsers.py:
38 | docId: id of the document (file name)
39 | token_ids: list of token indices within each document
40 | tokens: list of words
41 | BIOs: list of entity tags for the tokens
42 | ecs: list of entity types without the BIO prefix
43 | relations: list of entity relations
44 | heads: list of head indices for the relations, e.g. [[2],[3,4]]
45 | char_ids: list of per-character ids for every word, e.g. for two words with three and four characters: [[1,2,3],[11,12,1,4]]
46 | embedding_ids: list of word ids
47 | BIO_ids: list of entity-tag ids
48 | ec_ids: list of ids of the entity types without the BIO prefix
49 | joint_ids: list of joint head-relation labels, computed as headId*len(set(relations))+relation_id (the same rule is used when decoding predictions during evaluation):
50 | len(set(relations)): number of distinct relation types
51 | headId: token index of the head entity of that relation
52 | relation_id: id of the relation type
53 | For example, with 5 relation types, a head at index 3 and relation id 2 are encoded as 3*5+2 = 17; conversely, 17 decodes back to head 17//5 = 3 and relation 17%5 = 2.
54 | 
55 | 
56 | 3. Files (the detailed behavior of each method is documented in the code comments):
57 | data_build.py  reads the configuration file and initializes the data
58 | data_parsers.py  parses and wraps the data
59 | model.py  the model
60 | train.py  model training
61 | data_utils.py  data conversion and processing
62 | eval.py  model evaluation
63 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import data_utils
2 | import model as Model
3 | from data_build import data_build
4 | import numpy as np
5 | import tensorflow as tf
6 | 
7 | 
8 | output_dir='logs/'
9 | config_file='config/bio_config'
10 | 
11 | def train():
12 |     config = data_build(config_file)  # load the configuration file and preprocess the datasets
13 |     train_data = data_utils.HeadData(config.train_id_docs, np.arange(len(config.train_id_docs)))
14 |     dev_data = data_utils.HeadData(config.dev_id_docs, np.arange(len(config.dev_id_docs)))
15 |     test_data = data_utils.HeadData(config.test_id_docs, np.arange(len(config.test_id_docs)))
16 | 
17 |     tf.reset_default_graph()
18 |     tf.set_random_seed(1)
19 | 
20 |     data_utils.printParameters(config)
21 | 
22 |     with tf.Session() as sess:
23 |         embedding_matrix = tf.get_variable('embedding_matrix', shape=config.wordvectors.shape, dtype=tf.float32,
24 |                                            trainable=False).assign(config.wordvectors)
25 |         emb_mtx = sess.run(embedding_matrix)
26 |         # build the model
27 |         model = Model.model(config, emb_mtx, sess)
28 |         # get the loss and the prediction ops of the model
29 |         obj, m_op, predicted_op_ner, actual_op_ner, predicted_op_rel, actual_op_rel, score_op_rel = model.run()
30 |         # training op of the optimizer
31 |         train_step = model.get_train_op(obj)
32 |         # bundle all ops needed during training and evaluation
33 |         operations = Model.operations(train_step, obj, m_op, predicted_op_ner, actual_op_ner, predicted_op_rel, actual_op_rel, score_op_rel)
34 | 
35 |         sess.run(tf.global_variables_initializer())
36 | 
37 |         best_score = 0
38 |         nepoch_no_imprv = 0  # for early stopping
39 | 
40 |         for iter in range(config.nepochs + 1):
41 |             # train for one epoch
42 |             model.train(train_data, operations, iter)
43 |             # evaluate on dev and test
44 |             dev_score = model.evaluate(dev_data, operations, 'dev')
45 |             model.evaluate(test_data, operations, 'test')
46 | 
47 |             if dev_score >= best_score:
48 |                 nepoch_no_imprv = 0
49 |                 best_score = dev_score
50 | 
51 |                 print("- Best dev score {} so far in {} epoch".format(dev_score, iter))
52 | 
53 |             else:
54 |                 nepoch_no_imprv += 1
55 |                 if nepoch_no_imprv >= config.nepoch_no_imprv:
56 |                     print("- 
early stopping {} epochs without " \ 57 | "improvement".format(nepoch_no_imprv)) 58 | 59 | with open(output_dir + "/es" + ".txt", "w+") as myfile: 60 | myfile.write(str(iter)) 61 | myfile.close() 62 | 63 | break 64 | 65 | def main(_): 66 | train() 67 | if __name__ == '__main__': 68 | tf.app.run(main) -------------------------------------------------------------------------------- /data_parsers.py: -------------------------------------------------------------------------------- 1 | import data_utils 2 | import csv 3 | import pandas as pd 4 | 5 | 6 | class headIdDoc: 7 | def __init__(self, id): 8 | self.docId = id 9 | self.token_ids = [] 10 | self.tokens = [] 11 | self.BIOs = [] 12 | self.relations = [] 13 | self.heads = [] 14 | 15 | ###extend 16 | self.embedding_ids = [] 17 | self.char_ids = [] 18 | self.BIO_ids = [] 19 | self.ecs = [] 20 | self.ec_ids = [] 21 | self.joint_ids = [] 22 | 23 | def append(self, token_id, token, BIO, relations, heads): 24 | self.tokens.append(str(token)) 25 | self.token_ids.append(token_id) 26 | self.BIOs.append(BIO) 27 | self.relations.append(relations) 28 | self.heads.append(heads) 29 | 30 | def extend(self, wordindices, dataset_set_characters, dataset_set_bio_tags, dataset_set_ec_tags, 31 | dataset_set_relations): 32 | for tId in range(len(self.tokens)): 33 | self.embedding_ids.append(int(data_utils.getEmbeddingId(self.tokens[tId], wordindices))) 34 | self.char_ids.append(data_utils.tokenToCharIds(self.tokens[tId], dataset_set_characters)) 35 | self.BIO_ids.append(int(data_utils.getLabelId(self.BIOs[tId], dataset_set_bio_tags))) 36 | self.ecs.append(data_utils.getECfromBIO(self.BIOs[tId])) 37 | self.ec_ids.append(int(data_utils.getLabelId(data_utils.getECfromBIO(self.BIOs[tId]), dataset_set_ec_tags))) 38 | self.joint_ids.append(data_utils.getScoringMatrixHeads(self.relations[tId], dataset_set_relations, self.heads[tId])) 39 | 40 | 41 | class headIdParser: 42 | def __init__(self, file): 43 | docNr = -1 44 | self.head_docs = [] 45 | tokens = headIdDoc("") 46 | 47 | for i in range(file.shape[0]): 48 | if '#doc' in file[i][0] or i == file.shape[0] - 1: # append all docs including the last one 49 | if (i == file.shape[0] - 1): # append last line 50 | tokens.append(int(file[i][0]), file[i][1], file[i][2], data_utils.strToLst(file[i][3]), 51 | data_utils. 52 | strToLst(file[i][4])) # append lines 53 | if (docNr != -1): 54 | self.head_docs.append(tokens) 55 | docNr += 1 56 | tokens = headIdDoc(file[i][0]) 57 | else: 58 | tokens.append(int(file[i][0]), file[i][1], file[i][2], data_utils.strToLst(file[i][3]), 59 | data_utils. 
60 | strToLst(file[i][4])) # append lines 61 | 62 | 63 | def readHeadFile(headFile): 64 | # head_id_col_vector = ['tId', 'emId', "token", "nerId", "nerBilou","nerBIO", "ner", 'relLabels', "headIds", 'rels', 'relIds','scoringMatrixHeads','tokenWeights'] 65 | head_id_col_vector = ['token_id', 'token', "BIO", "relation", 'head'] 66 | headfile = pd.read_csv(headFile, names=head_id_col_vector, encoding="utf-8", 67 | engine='python', sep="\t", quoting=csv.QUOTE_NONE).as_matrix() 68 | 69 | return headIdParser(headfile).head_docs 70 | 71 | def preprocess(docs,wordindices,dataset_set_characters,dataset_set_bio_tags,dataset_set_ec_tags,dataset_set_relations): 72 | for doc in docs: 73 | doc.extend(wordindices,dataset_set_characters,dataset_set_bio_tags,dataset_set_ec_tags,dataset_set_relations) 74 | 75 | class read_properties: 76 | def __init__(self,filepath, sep='=', comment_char='#'): 77 | """Read the file passed as parameter as a properties file.""" 78 | self.props = {} 79 | #print filepath 80 | with open(filepath, "rt") as f: 81 | for line in f: 82 | #print line 83 | l = line.strip() 84 | if l and not l.startswith(comment_char): 85 | key_value = l.split(sep) 86 | self.props[key_value[0].strip()] = key_value[1].split("#")[0].strip('" \t') 87 | 88 | 89 | def getProperty(self,propertyName): 90 | return self.props.get(propertyName) 91 | 92 | -------------------------------------------------------------------------------- /data_build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import data_utils 3 | import data_parsers as parsers 4 | from sklearn.externals import joblib 5 | import os.path 6 | 7 | """"Read the configuration file and set the parameters of the model""" 8 | 9 | 10 | class data_build(): 11 | def __init__(self, fname): 12 | 13 | config_file = parsers.read_properties(fname) #加载配置文件 14 | self.config_fname = fname 15 | 16 | # load data 17 | self.filename_embeddings = config_file.getProperty("filename_embeddings") 18 | self.filename_train = config_file.getProperty("filename_train") 19 | self.filename_test = config_file.getProperty("filename_test") 20 | self.filename_dev = config_file.getProperty("filename_dev") 21 | #生成各列数据的集合 22 | self.train_id_docs = parsers.readHeadFile(self.filename_train) 23 | self.dev_id_docs = parsers.readHeadFile(self.filename_dev) 24 | self.test_id_docs = parsers.readHeadFile(self.filename_test) 25 | 26 | # 将所有数据加到一个大集合中 27 | dataset_documents = [] 28 | dataset_documents.extend(self.train_id_docs) 29 | dataset_documents.extend(self.dev_id_docs) 30 | dataset_documents.extend(self.test_id_docs) 31 | 32 | self.dataset_set_characters = data_utils.getCharsFromDocuments(dataset_documents)#获得所有数据中 字母 数字的集合 33 | self.dataset_set_bio_tags, self.dataset_set_ec_tags = data_utils.getEntitiesFromDocuments(dataset_documents)#获得所有数据中 实体 的集合 34 | self.dataset_set_relations = data_utils.getRelationsFromDocuments(dataset_documents)#获得所有数据中 关系 的集合 35 | #加载预训练好的词向量 36 | if os.path.isfile(self.filename_embeddings + ".pkl") == False: 37 | self.wordvectors, self.representationsize, self.words = data_utils.readWordvectorsNumpy(self.filename_embeddings, isBinary=True if self.filename_embeddings.endswith(".bin") else False) 38 | self.wordindices = data_utils.readIndices(self.filename_embeddings, 39 | isBinary=True if self.filename_embeddings.endswith(".bin") else False) 40 | joblib.dump((self.wordvectors, self.representationsize, self.words, self.wordindices), self.filename_embeddings + ".pkl") 41 | 42 | else: 43 | self.wordvectors, 
self.representationsize, self.words, self.wordindices = joblib.load(self.filename_embeddings + ".pkl") # loading is faster 44 | #将数据转换成对应id的列表 45 | parsers.preprocess(self.train_id_docs, self.wordindices, self.dataset_set_characters, 46 | self.dataset_set_bio_tags, self.dataset_set_ec_tags, self.dataset_set_relations) 47 | parsers.preprocess(self.dev_id_docs, self.wordindices, self.dataset_set_characters, 48 | self.dataset_set_bio_tags, self.dataset_set_ec_tags, self.dataset_set_relations) 49 | parsers.preprocess(self.test_id_docs, self.wordindices, self.dataset_set_characters, 50 | self.dataset_set_bio_tags, self.dataset_set_ec_tags, self.dataset_set_relations) 51 | 52 | # training 53 | self.nepochs = int(config_file.getProperty("nepochs")) 54 | self.optimizer = config_file.getProperty("optimizer") 55 | self.activation = config_file.getProperty("activation") 56 | self.learning_rate = float(config_file.getProperty("learning_rate")) 57 | self.gradientClipping = data_utils.strToBool(config_file.getProperty("gradientClipping")) 58 | self.nepoch_no_imprv = int(config_file.getProperty("nepoch_no_imprv")) 59 | self.use_dropout = data_utils.strToBool(config_file.getProperty("use_dropout")) 60 | self.ner_loss = config_file.getProperty("ner_loss") 61 | self.ner_classes = config_file.getProperty("ner_classes") 62 | self.use_chars = data_utils.strToBool(config_file.getProperty("use_chars")) 63 | self.use_adversarial = data_utils.strToBool(config_file.getProperty("use_adversarial")) 64 | 65 | # hyperparameters 66 | self.dropout_embedding = float(config_file.getProperty("dropout_embedding")) 67 | self.dropout_lstm = float(config_file.getProperty("dropout_lstm")) 68 | self.dropout_lstm_output = float(config_file.getProperty("dropout_lstm_output")) 69 | self.dropout_fcl_ner = float(config_file.getProperty("dropout_fcl_ner")) 70 | self.dropout_fcl_rel = float(config_file.getProperty("dropout_fcl_rel")) 71 | self.hidden_size_lstm = int(config_file.getProperty("hidden_size_lstm")) 72 | self.hidden_size_n1 = int(config_file.getProperty("hidden_size_n1")) 73 | # self.hidden_size_n2 = config_file.getProperty("hidden_size_n2") 74 | self.num_lstm_layers = int(config_file.getProperty("num_lstm_layers")) 75 | self.char_embeddings_size = int(config_file.getProperty("char_embeddings_size")) 76 | self.hidden_size_char = int(config_file.getProperty("hidden_size_char")) 77 | self.label_embeddings_size = int(config_file.getProperty("label_embeddings_size")) 78 | self.alpha = float(config_file.getProperty("alpha")) 79 | 80 | # evaluation 81 | self.evaluation_method = config_file.getProperty("evaluation_method") 82 | self.root_node = data_utils.strToBool(config_file.getProperty("root_node")) 83 | 84 | self.shuffle = False 85 | self.batchsize = 16 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import data_utils 2 | import time 3 | import eval 4 | import tensorflow as tf 5 | 6 | 7 | class model: 8 | """Set of classes and methods for training the model and computing the ner and head selection loss""" 9 | 10 | def __init__(self, config, emb_mtx, sess): 11 | """"Initialize data""" 12 | self.config = config 13 | self.emb_mtx = emb_mtx 14 | self.sess = sess 15 | 16 | def getEvaluator(self): 17 | if self.config.evaluation_method == "strict" and self.config.ner_classes == "BIO": # the most common metric 18 | return eval.chunkEvaluator(self.config, 
ner_chunk_eval="boundaries_type", 19 | rel_chunk_eval="boundaries_type") 20 | elif self.config.evaluation_method == "boundaries" and self.config.ner_classes == "BIO": # s 21 | return eval.chunkEvaluator(self.config, ner_chunk_eval="boundaries", rel_chunk_eval="boundaries") 22 | elif self.config.evaluation_method == "relaxed" and self.config.ner_classes == "EC": # todo 23 | return eval.relaxedChunkEvaluator(self.config, rel_chunk_eval="boundaries_type") 24 | else: 25 | raise ValueError( 26 | 'Valid evaluation methods : "strict" and "boundaries" in "BIO" mode and "relaxed" in "EC" mode .') 27 | 28 | def train(self, train_data, operations, iter): 29 | 30 | loss = 0 31 | 32 | evaluator = self.getEvaluator() 33 | start_time = time.time() 34 | for x_train in data_utils.generator(train_data, operations.m_op, self.config, train=True): 35 | _, val, predicted_ner, actual_ner, predicted_rel, actual_rel, _, m_train = self.sess.run( 36 | [operations.train_step, operations.obj, operations.predicted_op_ner, operations.actual_op_ner, operations.predicted_op_rel, operations.actual_op_rel, operations.score_op_rel, 37 | operations.m_op], feed_dict=x_train) # sess.run(embedding_init, feed_dict={embedding_placeholder: wordvectors}) 38 | 39 | if self.config.evaluation_method == "relaxed": 40 | evaluator.add(predicted_ner, actual_ner, predicted_rel, actual_rel, m_train['BIO']) 41 | else: 42 | evaluator.add(predicted_ner, actual_ner, predicted_rel, actual_rel) 43 | 44 | loss += val 45 | 46 | print('****iter %d****' % (iter)) 47 | print('-------Train-------') 48 | print('loss: %f ' % (loss)) 49 | 50 | if self.config.evaluation_method == "relaxed": 51 | evaluator.computeInfoMacro() 52 | else: 53 | evaluator.printInfo() 54 | 55 | elapsed_time = time.time() - start_time 56 | print("Elapsed train time in sec:" + str(elapsed_time)) 57 | print() 58 | 59 | def evaluate(self, eval_data, operations, set): 60 | 61 | print('-------Evaluate on ' + set + '-------') 62 | 63 | evaluator = self.getEvaluator() 64 | for x_dev in data_utils.generator(eval_data, operations.m_op, self.config, train=False): 65 | predicted_ner, actual_ner, predicted_rel, actual_rel, _, m_eval = self.sess.run( 66 | [operations.predicted_op_ner, operations.actual_op_ner, operations.predicted_op_rel, operations.actual_op_rel, operations.score_op_rel, operations.m_op], feed_dict=x_dev) 67 | 68 | if self.config.evaluation_method == "relaxed": 69 | evaluator.add(predicted_ner, actual_ner, predicted_rel, actual_rel, m_eval['BIO']) 70 | else: 71 | evaluator.add(predicted_ner, actual_ner, predicted_rel, actual_rel) 72 | 73 | if self.config.evaluation_method == "relaxed": 74 | evaluator.computeInfoMacro(printScores=True) 75 | if "other" in [x.lower() for x in self.config.dataset_set_ec_tags]: # if other class exists report score without "Other" class, see previous work on the CoNLL04 76 | return evaluator.getMacroF1scoresNoOtherClass()[2] 77 | else: 78 | return evaluator.getMacroF1scores()[2] 79 | 80 | else: 81 | evaluator.printInfo() 82 | return evaluator.getChunkedOverallAvgF1() 83 | 84 | def get_train_op(self, obj): 85 | import tensorflow as tf 86 | 87 | if self.config.optimizer == 'Adam': 88 | 89 | optim = tf.train.AdamOptimizer(self.config.learning_rate) 90 | 91 | elif self.config.optimizer == 'Adagrad': 92 | optim = tf.train.AdagradOptimizer(self.config.learning_rate) 93 | elif self.config.optimizer == 'AdadeltaOptimizer': 94 | optim = tf.train.AdadeltaOptimizer(self.config.learning_rate) 95 | elif self.config.optimizer == 'GradientDescentOptimizer': 96 | 
optim = tf.train.GradientDescentOptimizer(self.config.learning_rate) 97 | 98 | if self.config.gradientClipping == True: 99 | 100 | gvs = optim.compute_gradients(obj) 101 | 102 | new_gvs = self.correctGradients(gvs) 103 | 104 | capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in new_gvs] 105 | train_step = optim.apply_gradients(capped_gvs) 106 | 107 | 108 | else: 109 | train_step = optim.minimize(obj) 110 | 111 | return train_step 112 | 113 | def correctGradients(self, gvs): 114 | new_gvs = [] 115 | for grad, var in gvs: 116 | # print (grad) 117 | if grad == None: 118 | grad = tf.zeros_like(var) 119 | 120 | new_gvs.append((grad, var)) 121 | if len(gvs) != len(new_gvs): 122 | print("gradient Error") 123 | return new_gvs 124 | 125 | def broadcasting(self, left, right): 126 | left = tf.transpose(left, perm=[1, 0, 2]) 127 | left = tf.expand_dims(left, 3) 128 | 129 | right = tf.transpose(right, perm=[0, 2, 1]) 130 | right = tf.expand_dims(right, 0) 131 | 132 | B = left + right 133 | B = tf.transpose(B, perm=[1, 0, 3, 2]) 134 | 135 | return B 136 | 137 | def getNerScores(self, lstm_out, n_types=1, dropout_keep_in_prob=1): 138 | u_a = tf.get_variable("u_typ", [self.config.hidden_size_lstm * 2, self.config.hidden_size_n1]) # [128 32] 139 | v = tf.get_variable("v_typ", [self.config.hidden_size_n1, n_types]) # [32,1] or [32,10] 140 | b_s = tf.get_variable("b_typ", [self.config.hidden_size_n1]) 141 | b_c = tf.get_variable("b_ctyp", [n_types]) 142 | 143 | mul = tf.einsum('aij,jk->aik', lstm_out, u_a) # [16 348 64] * #[64 32] = [16 348 32] 144 | 145 | sum = mul + b_s 146 | if self.config.activation == "tanh": 147 | output = tf.nn.tanh(sum) 148 | elif self.config.activation == "relu": 149 | output = tf.nn.relu(sum) 150 | 151 | if self.config.use_dropout == True: 152 | output = tf.nn.dropout(output, keep_prob=dropout_keep_in_prob) 153 | 154 | g = tf.einsum('aik,kp->aip', output, v) + b_c 155 | 156 | return g 157 | 158 | def getHeadSelectionScores(self, lstm_out, dropout_keep_in_prob=1): 159 | u_a = tf.get_variable("u_a", [(self.config.hidden_size_lstm * 2) + self.config.label_embeddings_size, self.config.hidden_size_n1]) # [128 32] 160 | w_a = tf.get_variable("w_a", [(self.config.hidden_size_lstm * 2) + self.config.label_embeddings_size, self.config.hidden_size_n1]) # [128 32] 161 | v = tf.get_variable("v", [self.config.hidden_size_n1, len(self.config.dataset_set_relations)]) # [32,1] or [32,4] 162 | b_s = tf.get_variable("b_s", [self.config.hidden_size_n1]) 163 | 164 | left = tf.einsum('aij,jk->aik', lstm_out, u_a) # [16 348 64] * #[64 32] = [16 348 32] 165 | right = tf.einsum('aij,jk->aik', lstm_out, w_a) # [16 348 64] * #[64 32] = [16 348 32] 166 | 167 | outer_sum = self.broadcasting(left, right) # [16 348 348 32] 168 | 169 | outer_sum_bias = outer_sum + b_s 170 | 171 | if self.config.activation == "tanh": 172 | output = tf.tanh(outer_sum_bias) 173 | elif self.config.activation == "relu": 174 | output = tf.nn.relu(outer_sum_bias) 175 | 176 | if self.config.use_dropout == True: 177 | output = tf.nn.dropout(output, keep_prob=dropout_keep_in_prob) 178 | 179 | output = tf.nn.dropout(output, keep_prob=dropout_keep_in_prob) 180 | 181 | g = tf.einsum('aijk,kp->aijp', output, v) 182 | 183 | g = tf.reshape(g, [tf.shape(g)[0], tf.shape(g)[1], tf.shape(g)[2] * len(self.config.dataset_set_relations)]) 184 | 185 | return g 186 | 187 | def computeLoss(self, input_rnn, dropout_embedding_keep, dropout_lstm_keep, dropout_lstm_output_keep, 188 | seqlen, dropout_fcl_ner_keep, ners_ids, 
dropout_fcl_rel_keep, is_train, scoring_matrix_gold, reuse=False): 189 | 190 | with tf.variable_scope("loss_computation", reuse=reuse): 191 | 192 | if self.config.use_dropout: 193 | input_rnn = tf.nn.dropout(input_rnn, keep_prob=dropout_embedding_keep) 194 | # input_rnn = tf.Print(input_rnn, [dropout_embedding_keep], 'embedding: ', summarize=1000) 195 | for i in range(self.config.num_lstm_layers): 196 | if self.config.use_dropout and i > 0: 197 | input_rnn = tf.nn.dropout(input_rnn, keep_prob=dropout_lstm_keep) 198 | # input_rnn = tf.Print(input_rnn, [dropout_lstm_keep], 'lstm: ', summarize=1000) 199 | 200 | lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(self.config.hidden_size_lstm) 201 | # Backward direction cell 202 | lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(self.config.hidden_size_lstm) 203 | #scope='BiLSTM' + str(i) 解决每层LSTM输入维度不一致问题 204 | lstm_out, _ = tf.nn.bidirectional_dynamic_rnn( 205 | cell_fw=lstm_fw_cell, 206 | cell_bw=lstm_bw_cell, 207 | inputs=input_rnn, 208 | sequence_length=seqlen, 209 | dtype=tf.float32, scope='BiLSTM' + str(i)) 210 | 211 | input_rnn = tf.concat(lstm_out, 2) 212 | 213 | lstm_output = input_rnn 214 | 215 | if self.config.use_dropout: 216 | lstm_output = tf.nn.dropout(lstm_output, keep_prob=dropout_lstm_output_keep) 217 | 218 | mask = tf.sequence_mask(seqlen, dtype=tf.float32) 219 | 220 | ner_input = lstm_output 221 | # loss= tf.Print(loss, [tf.shape(loss)], 'shape of loss is:') # same as scoring matrix ie, [1 59 590] 222 | #实体识别 223 | if self.config.ner_classes == "EC": 224 | 225 | nerScores = self.getNerScores(ner_input, len(self.config.dataset_set_ec_tags), 226 | dropout_keep_in_prob=dropout_fcl_ner_keep) 227 | label_matrix = tf.get_variable(name="label_embeddings", dtype=tf.float32, 228 | shape=[len(self.config.dataset_set_ec_tags), 229 | self.config.label_embeddings_size]) 230 | elif self.config.ner_classes == "BIO": 231 | 232 | nerScores = self.getNerScores(ner_input, len(self.config.dataset_set_bio_tags), 233 | dropout_keep_in_prob=dropout_fcl_ner_keep) 234 | label_matrix = tf.get_variable(name="label_embeddings", dtype=tf.float32, 235 | shape=[len(self.config.dataset_set_bio_tags), 236 | self.config.label_embeddings_size]) 237 | 238 | # nerScores = tf.Print(nerScores, [tf.shape(ners_ids), ners_ids, tf.shape(nerScores)], 'ners_ids: ', summarize=1000) 239 | 240 | log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood( 241 | nerScores, ners_ids, seqlen) 242 | if self.config.ner_loss == "crf": 243 | 244 | lossNER = -log_likelihood 245 | predNers, viterbi_score = tf.contrib.crf.crf_decode( 246 | nerScores, transition_params, seqlen) 247 | 248 | elif self.config.ner_loss == "softmax": 249 | lossNER = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=nerScores, labels=ners_ids) 250 | 251 | predNers = tf.cast(tf.arg_max(nerScores, 2), tf.int32) 252 | 253 | if self.config.label_embeddings_size > 0: 254 | 255 | labels = tf.cond(is_train > 0, lambda: ners_ids, lambda: predNers) 256 | 257 | label_embeddings = tf.nn.embedding_lookup(label_matrix, labels) 258 | rel_input = tf.concat([lstm_output, label_embeddings], axis=2) 259 | 260 | else: 261 | 262 | rel_input = lstm_output 263 | 264 | #关系抽取 265 | rel_scores = self.getHeadSelectionScores(rel_input, 266 | dropout_keep_in_prob=dropout_fcl_rel_keep) 267 | 268 | lossREL = tf.nn.sigmoid_cross_entropy_with_logits(logits=rel_scores, labels=scoring_matrix_gold) 269 | probas = tf.nn.sigmoid(rel_scores) 270 | predictedRel = tf.round(probas) 271 | 272 | return lossNER, lossREL, predNers, predictedRel, 
rel_scores 273 | 274 | def run(self): 275 | # shape = (batch size, max length of sentence, max length of word) 276 | char_ids = tf.placeholder(tf.int32, shape=[None, None, None]) 277 | is_train = tf.placeholder(tf.int32) 278 | 279 | # shape = (batch_size, max_length of sentence) 280 | word_lengths = tf.placeholder(tf.int32, shape=[None, None]) 281 | 282 | embedding_ids = tf.placeholder(tf.int32, [None, None]) # [ batch_size,max_length of sentence ] 283 | 284 | token_ids = tf.placeholder(tf.int32, [None, None]) # [ batch_size * max_sequence ] 285 | 286 | entity_tags_ids = tf.placeholder(tf.int32, [None, None]) 287 | 288 | scoring_matrix_gold = tf.placeholder(tf.float32, [None, None, None]) # [ batch_size * max_sequence] 289 | 290 | tokens = tf.placeholder(tf.string, [None, None]) # [ batch_size * max_sequence] 291 | BIO = tf.placeholder(tf.string, [None, None]) # [ batch_size * max_sequence] 292 | entity_tags = tf.placeholder(tf.string, [None, None]) # [ batch_size * max_sequence] 293 | 294 | # classes = ... 295 | seqlen = tf.placeholder(tf.int32, [None]) # [ batch_size ] 296 | 297 | doc_ids = tf.placeholder(tf.string, [None]) # [ batch_size ] 298 | 299 | dropout_embedding_keep = tf.placeholder(tf.float32, name="dropout_embedding_keep") 300 | dropout_lstm_keep = tf.placeholder(tf.float32, name="dropout_lstm_keep") 301 | dropout_lstm_output_keep = tf.placeholder(tf.float32, name="dropout_lstm_output_keep") 302 | dropout_fcl_ner_keep = tf.placeholder(tf.float32, name="dropout_fcl_ner_keep") 303 | dropout_fcl_rel_keep = tf.placeholder(tf.float32, name="dropout_fcl_rel_keep") 304 | 305 | embedding_matrix = tf.get_variable(name="embeddings", shape=self.emb_mtx.shape, 306 | initializer=tf.constant_initializer(self.emb_mtx), trainable=False) 307 | 308 | #####char embeddings 309 | 310 | # 1. get character embeddings 311 | 312 | K = tf.get_variable(name="char_embeddings", dtype=tf.float32, 313 | shape=[len(self.config.dataset_set_characters), self.config.char_embeddings_size]) 314 | # shape = (batch, sentence, word, dim of char embeddings) 315 | char_embeddings = tf.nn.embedding_lookup(K, char_ids) 316 | 317 | # 2. put the time dimension on axis=1 for dynamic_rnn 318 | s = tf.shape(char_embeddings) # store old shape 319 | 320 | char_embeddings_reshaped = tf.reshape(char_embeddings, shape=[-1, s[-2], self.config.char_embeddings_size]) 321 | word_lengths_reshaped = tf.reshape(word_lengths, shape=[-1]) 322 | 323 | char_hidden_size = self.config.hidden_size_char 324 | 325 | # 3. 
bi lstm on chars 326 | cell_fw = tf.contrib.rnn.BasicLSTMCell(char_hidden_size, state_is_tuple=True) 327 | cell_bw = tf.contrib.rnn.BasicLSTMCell(char_hidden_size, state_is_tuple=True) 328 | 329 | _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw, 330 | inputs=char_embeddings_reshaped, 331 | sequence_length=word_lengths_reshaped, 332 | dtype=tf.float32) 333 | # shape = (batch x sentence, 2 x char_hidden_size) 334 | output = tf.concat([output_fw, output_bw], axis=-1) 335 | 336 | # shape = (batch, sentence, 2 x char_hidden_size) 337 | char_rep = tf.reshape(output, shape=[-1, s[1], 2 * char_hidden_size]) 338 | 339 | # concat char embeddings 340 | 341 | word_embeddings = tf.nn.embedding_lookup(embedding_matrix, embedding_ids) 342 | #词向量+字符向量 343 | if self.config.use_chars == True: 344 | input_rnn = tf.concat([word_embeddings, char_rep], axis=-1) 345 | 346 | else: 347 | input_rnn = word_embeddings 348 | 349 | embeddings_input = input_rnn 350 | #计算损失,预测值 351 | lossNER, lossREL, predicted_entity_tags_ids, predictedRel, rel_scores = self.computeLoss(input_rnn, 352 | dropout_embedding_keep, 353 | dropout_lstm_keep, 354 | dropout_lstm_output_keep, seqlen, 355 | dropout_fcl_ner_keep, 356 | entity_tags_ids, dropout_fcl_rel_keep, 357 | is_train, 358 | scoring_matrix_gold, reuse=False) 359 | 360 | obj = tf.reduce_sum(lossNER) + tf.reduce_sum(lossREL) 361 | # 生成对抗样本 362 | raw_perturb = tf.gradients(obj, embeddings_input)[0] # [batch, L, dim] 363 | normalized_per = tf.nn.l2_normalize(raw_perturb, dim=[1, 2]) 364 | perturb = self.config.alpha * tf.sqrt(tf.cast(tf.shape(input_rnn)[2], tf.float32)) * tf.stop_gradient(normalized_per) 365 | perturb_inputs = embeddings_input + perturb #训练样本+对抗样本 366 | #计算训练样本+对抗样本 的损失 367 | lossNER_per, lossREL_per, _, _, _ = self.computeLoss(perturb_inputs, 368 | dropout_embedding_keep, 369 | dropout_lstm_keep, 370 | dropout_lstm_output_keep, seqlen, 371 | dropout_fcl_ner_keep, 372 | entity_tags_ids, dropout_fcl_rel_keep, 373 | is_train, 374 | scoring_matrix_gold, reuse=True) 375 | 376 | actualRel = tf.round(scoring_matrix_gold) 377 | 378 | if self.config.use_adversarial == True: 379 | obj += tf.reduce_sum(lossNER_per) + tf.reduce_sum(lossREL_per) 380 | 381 | m = {} 382 | m['isTrain'] = is_train 383 | m['embeddingIds'] = embedding_ids 384 | m['charIds'] = char_ids 385 | m['tokensLens'] = word_lengths 386 | m['entity_tags_ids'] = entity_tags_ids 387 | m['scoringMatrixGold'] = scoring_matrix_gold 388 | m['seqlen'] = seqlen 389 | m['doc_ids'] = doc_ids 390 | m['tokenIds'] = token_ids 391 | m['dropout_embedding'] = dropout_embedding_keep 392 | m['dropout_lstm'] = dropout_lstm_keep 393 | m['dropout_lstm_output'] = dropout_lstm_output_keep 394 | m['dropout_fcl_ner'] = dropout_fcl_ner_keep 395 | m['dropout_fcl_rel'] = dropout_fcl_rel_keep 396 | m['tokens'] = tokens 397 | m['BIO'] = BIO 398 | m['entity_tags'] = entity_tags 399 | 400 | return obj, m, predicted_entity_tags_ids, entity_tags_ids, predictedRel, actualRel, rel_scores 401 | 402 | 403 | class operations(): 404 | def __init__(self, train_step, obj, m_op, predicted_op_ner, actual_op_ner, predicted_op_rel, actual_op_rel, score_op_rel): 405 | self.train_step = train_step 406 | self.obj = obj 407 | self.m_op = m_op 408 | self.predicted_op_ner = predicted_op_ner 409 | self.actual_op_ner = actual_op_ner 410 | self.predicted_op_rel = predicted_op_rel 411 | self.actual_op_rel = actual_op_rel 412 | self.score_op_rel = score_op_rel 
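# A minimal NumPy sketch (illustrative only; fgm_perturbation is an assumed helper name and is
# not used anywhere in this repository) of the adversarial-example scaling performed above in
# run(): the gradient of the joint loss w.r.t. the word/char embeddings is L2-normalized per
# example over the (sequence, embedding) axes and rescaled by alpha * sqrt(embedding_dim)
# before being added back onto the embeddings.
def fgm_perturbation(grad, alpha, emb_dim):
    import numpy as np
    # mirrors tf.nn.l2_normalize(raw_perturb, dim=[1, 2]) followed by the alpha * sqrt(dim) scaling
    norm = np.sqrt(np.sum(grad ** 2, axis=(1, 2), keepdims=True)) + 1e-12
    return alpha * np.sqrt(emb_dim) * grad / norm


if __name__ == "__main__":
    import numpy as np
    g = np.random.randn(2, 7, 10)                    # [batch, seq_len, emb_dim]
    p = fgm_perturbation(g, alpha=0.01, emb_dim=10)
    # each example's perturbation ends up with L2 norm alpha * sqrt(emb_dim) ~= 0.0316
    print(np.linalg.norm(p.reshape(2, -1), axis=1))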
-------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | import gensim 4 | import gzip 5 | import numpy as np 6 | import ast 7 | import copy 8 | import sys 9 | from sklearn.model_selection import train_test_split 10 | from prettytable import PrettyTable 11 | import re 12 | import tensorflow as tf 13 | 14 | """Generic set of classes and methods""" 15 | 16 | 17 | def strToLst(string): 18 | return ast.literal_eval(string) 19 | 20 | 21 | class HeadData: 22 | def __init__(self, data, indices): 23 | self.data = data 24 | self.indices = indices 25 | 26 | def split(self, fraction): 27 | 28 | data_train, data_test, idx_train, idx_test = train_test_split(self.data, self.indices, test_size=fraction, 29 | random_state=42) 30 | 31 | train = HeadData(data_train, idx_train) 32 | 33 | test = HeadData(data_test, idx_test) 34 | return train, test 35 | 36 | def transformToInitialInput(matrix,tags): 37 | active_relations = np.nonzero(matrix) 38 | active_relations_iidx = active_relations[0] 39 | active_relations_jidx = active_relations[1] 40 | 41 | tokens_ids = [] 42 | heads_ids = [] 43 | labels_ids = [] 44 | head_labels_ids = [] 45 | labels_name = [] 46 | 47 | for m_idx in range(len(matrix)): 48 | tokens_ids.append(m_idx) 49 | heads_ids.append([]) 50 | labels_ids.append([]) 51 | head_labels_ids.append([]) 52 | labels_name.append([]) 53 | 54 | for i_idx in range(len(active_relations_iidx)): 55 | head_id = int(active_relations_jidx[i_idx] / len(tags)) 56 | label_id = active_relations_jidx[i_idx] % len(tags) 57 | token_id = active_relations_iidx[i_idx] 58 | head_label_id = active_relations_jidx[i_idx] 59 | 60 | # idx=tokens_ids.index(token_id) 61 | heads_ids[token_id].append(head_id) 62 | labels_ids[token_id].append(label_id) 63 | head_labels_ids[token_id].append(head_label_id) 64 | labels_name[token_id].append(tags[label_id]) 65 | 66 | # print (str(token_id) + " " +str(head_label_id)+ " " +str(head)+ " " +str(label)) 67 | return tokens_ids, head_labels_ids, labels_ids, heads_ids, labels_name 68 | 69 | 70 | ###run one time to obtain the characters 71 | def getCharsFromDocuments(documents): 72 | chars = [] 73 | for doc in documents: 74 | for tokens in doc.tokens: 75 | for char in tokens: 76 | # print (token) 77 | chars.append(char) 78 | chars = list(set(chars)) 79 | chars.sort() 80 | return chars 81 | 82 | 83 | ###run one time to obtain the ner labels 84 | def getEntitiesFromDocuments(documents): 85 | BIOtags = [] 86 | ECtags = [] 87 | for doc in documents: 88 | for tag in doc.BIOs: 89 | BIOtags.append(tag) 90 | if tag.startswith("B-") or tag.startswith("I-"): 91 | ECtags.append(tag[2:]) 92 | else: 93 | ECtags.append(tag) 94 | 95 | BIOtags = list(set(BIOtags)) 96 | BIOtags.sort() 97 | ECtags = list(set(ECtags)) 98 | ECtags.sort() 99 | return BIOtags, ECtags 100 | 101 | 102 | def getECfromBIO(BIO_tag): 103 | if BIO_tag.startswith("B-") or BIO_tag.startswith("I-"): 104 | return (BIO_tag[2:]) 105 | else: 106 | return (BIO_tag) 107 | 108 | 109 | ###run one time to obtain the relations 110 | def getRelationsFromDocuments(documents): 111 | relations = [] 112 | for doc in documents: 113 | for relation_list in doc.relations: 114 | for relation in relation_list: 115 | relations.append(relation) 116 | 117 | relations = list(set(relations)) 118 | relations.sort() 119 | return relations 120 | 121 | 122 | def tokenToCharIds(token, characters): 123 | charIds 
= [] 124 | for char in token: 125 | charIds.append(characters.index(char)) 126 | return charIds 127 | 128 | 129 | def labelsListToIds(listofLabels, setofLabels): 130 | labelIds = [] 131 | for label in listofLabels: 132 | labelIds.append(setofLabels.index(label)) 133 | 134 | return labelIds 135 | 136 | 137 | def getScoringMatrixHeads(listofRelations, setofLabels, heads): 138 | scoringMatrixHeads = [] 139 | relationIds = labelsListToIds(listofRelations, setofLabels) 140 | 141 | 142 | for relIdx in range(len(relationIds)): 143 | # print (rels[relIdx]*getNumberOfClasses()+labelJointIds[relIdx]) 144 | scoringMatrixHeads.append(heads[relIdx] * len(setofLabels) + relationIds[relIdx]) 145 | return scoringMatrixHeads 146 | 147 | 148 | def getLabelId(label, setofLabels): 149 | return setofLabels.index(label) 150 | 151 | def strToBool(str): 152 | if str.lower() in ['true', '1']: 153 | return True 154 | return False 155 | 156 | 157 | 158 | def getEmbeddingId(word, embeddingsList): 159 | # modified method from http://cistern.cis.lmu.de/globalNormalization/globalNormalization_all.zip 160 | if word != "": 161 | if not word in embeddingsList: 162 | if re.search(r'^\d+$', word): 163 | word = "0" 164 | if word.islower(): 165 | word = word.title() 166 | else: 167 | word = word.lower() 168 | if not word in embeddingsList: 169 | word = "" 170 | curIndex = embeddingsList[word] 171 | return curIndex 172 | 173 | 174 | def readWordvectorsNumpy(wordvectorfile, isBinary=False): 175 | 176 | # modified method from http://cistern.cis.lmu.de/globalNormalization/globalNormalization_all.zip 177 | wordvectors = [] 178 | words = [] 179 | model = gensim.models.KeyedVectors.load_word2vec_format(wordvectorfile, binary=isBinary,unicode_errors='ignore') 180 | 181 | vectorsize = model.vector_size 182 | 183 | for key in list(model.vocab.keys()): 184 | wordvectors.append(model.wv[key]) 185 | words.append(key) 186 | 187 | zeroVec = [0 for i in range(vectorsize)] 188 | random.seed(123456) 189 | randomVec = [random.uniform(-np.sqrt(1. / len(wordvectors)), np.sqrt(1. 
/ len(wordvectors))) for i in 190 | range(vectorsize)] 191 | wordvectors.insert(0, randomVec) 192 | words.insert(0, "") 193 | wordvectors.insert(0, zeroVec) 194 | words.insert(0, "") 195 | 196 | wordvectorsNumpy = np.array(wordvectors) 197 | return wordvectorsNumpy, vectorsize, words 198 | 199 | 200 | def readIndices(wordvectorfile, isBinary=False): 201 | # modified method from http://cistern.cis.lmu.de/globalNormalization/globalNormalization_all.zip 202 | indices = {} 203 | curIndex = 0 204 | indices[""] = curIndex 205 | curIndex += 1 206 | indices[""] = curIndex 207 | curIndex += 1 208 | 209 | model = gensim.models.KeyedVectors.load_word2vec_format(wordvectorfile, binary=isBinary,unicode_errors='ignore') 210 | 211 | count = 0 212 | # c=0 213 | for key in list(model.vocab.keys()): 214 | indices[key] = curIndex 215 | curIndex += 1 216 | 217 | return indices 218 | 219 | 220 | 221 | def printParameters(config): 222 | 223 | t = PrettyTable(['Params', 'Value']) 224 | 225 | #dataset 226 | t.add_row(['Config', config.config_fname]) 227 | t.add_row(['Embeddings', config.filename_embeddings]) 228 | t.add_row(['Embeddings size ', config.representationsize]) 229 | t.add_row(['Train', config.filename_train]) 230 | t.add_row(['Dev', config.filename_dev]) 231 | t.add_row(['Test', config.filename_test]) 232 | 233 | #training 234 | t.add_row(['Epochs ', config.nepochs]) 235 | t.add_row(['Optimizer ', config.optimizer]) 236 | t.add_row(['Activation ', config.activation]) 237 | t.add_row(['Learning rate ', config.learning_rate]) 238 | t.add_row(['Gradient clipping ', config.gradientClipping]) 239 | t.add_row(['Patience ', config.nepoch_no_imprv]) 240 | t.add_row(['Use dropout', config.use_dropout]) 241 | t.add_row(['Ner loss ', config.ner_loss]) 242 | t.add_row(['Ner classes ', config.ner_classes]) 243 | t.add_row(['Use char embeddings ', config.use_chars]) 244 | t.add_row(['Use adversarial',config.use_adversarial]) 245 | 246 | # hyperparameters 247 | t.add_row(['Dropout embedding ', config.dropout_embedding]) 248 | t.add_row(['Dropout lstm ', config.dropout_lstm]) 249 | t.add_row(['Dropout lstm output ', config.dropout_lstm_output]) 250 | t.add_row(['Dropout fcl ner ', config.dropout_fcl_ner]) 251 | t.add_row(['Dropout fcl rel ', config.dropout_fcl_rel]) 252 | t.add_row(['Hidden lstm size ', config.hidden_size_lstm]) 253 | t.add_row(['LSTM layers ', config.num_lstm_layers]) 254 | t.add_row(['Hidden nn size ', config.hidden_size_n1]) 255 | t.add_row(['Char embeddings size ', config.char_embeddings_size]) 256 | t.add_row(['Hidden size char ', config.hidden_size_char]) 257 | t.add_row(['Label embeddings size ', config.label_embeddings_size]) 258 | t.add_row(['Alpha ', config.alpha]) 259 | t.add_row(['Root node ', config.root_node]) 260 | 261 | #evaluation 262 | t.add_row(['Evaluation method ', config.evaluation_method]) 263 | 264 | 265 | print(t) 266 | 267 | def getSegmentationDict(lst): 268 | return {k: v for v, k in enumerate(lst)} 269 | 270 | def generator(data, m,config,train=False): 271 | # generate the data 272 | embeddingIds = m['embeddingIds'] 273 | isTrain=m['isTrain'] 274 | 275 | scoringMatrixGold = m['scoringMatrixGold'] 276 | BIO = m['BIO'] # always the BIO tags 277 | entity_tags=m['entity_tags'] # either the BIO tags or the EC tags - depends on the NER target values 278 | entity_tags_ids = m['entity_tags_ids'] 279 | tokens = m['tokens'] 280 | tokenIds = m['tokenIds'] 281 | charIds = m['charIds'] 282 | tokensLens = m['tokensLens'] 283 | 284 | seqlen = m['seqlen'] 285 | doc_ids=m['doc_ids'] 286 | 
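    # Overview of generator(): documents are grouped into batches of config.batchsize
    # (optionally shuffled when config.shuffle is True). For every document a binary scoring
    # matrix of shape [seq_len, seq_len * len(config.dataset_set_relations)] is built from
    # doc.joint_ids (a 1 at column head_id * |relations| + relation_id for each annotated head).
    # Each batch is then padded to its longest sentence and longest word (see the padding loop
    # further down) before being yielded as a feed_dict for the placeholders collected in m.
    # Dropout keep probabilities are set to the configured values only when train=True;
    # otherwise they remain 1.0.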
287 | 288 | dropout_embedding_keep = m['dropout_embedding'] 289 | dropout_lstm_keep = m['dropout_lstm'] 290 | dropout_lstm_output_keep = m['dropout_lstm_output'] 291 | dropout_fcl_ner_keep = m['dropout_fcl_ner'] 292 | dropout_fcl_rel_keep = m['dropout_fcl_rel'] 293 | 294 | dropout_embedding_prob = 1 295 | dropout_lstm_prob = 1 296 | dropout_lstm_output_prob = 1 297 | dropout_fcl_ner_prob = 1 298 | dropout_fcl_rel_prob = 1 299 | 300 | if config.use_dropout == True and train==True: 301 | 302 | dropout_embedding_prob = config.dropout_embedding 303 | dropout_lstm_prob = config.dropout_lstm 304 | dropout_lstm_output_prob = config.dropout_lstm_output 305 | dropout_fcl_ner_prob = config.dropout_fcl_ner 306 | dropout_fcl_rel_prob = config.dropout_fcl_rel 307 | 308 | data_copy = copy.deepcopy(data) 309 | # train_ind=np.arange(len(train.data)) 310 | #将样本数据打乱 311 | if config.shuffle == True: 312 | shuffled_data, _, shuffled_data_idx, _ = train_test_split(data_copy.data, data_copy.indices, test_size=0, 313 | random_state=42) 314 | # shuffled_data, _, shuffled_data_idx, _ = train_test_split(data_copy.data, data_copy.indices, test_size=0,random_state=42) 315 | 316 | data_copy = HeadData(shuffled_data, shuffled_data_idx) 317 | # print ("shuffle:"+ str(shuffle) ) 318 | # print(data_copy.indices) 319 | else: 320 | 321 | data_copy = HeadData(data_copy.data, data_copy.indices) 322 | # data_copy = HeadData(data_copy.data, data_copy.indices) 323 | 324 | # print("shuffle:" + str(shuffle)) 325 | # print(data_copy.indices) 326 | 327 | # batchsize=16 # number of documents per batch 328 | batches_embeddingIds = [] # e.g., 131 batches 329 | batches_charIds = [] # e.g., 131 batches 330 | batches_scoringMatrixHeadIds = [] # e.g., 131 batches 331 | batches_scoringMatrix = [] # e.g., 131 batches 332 | batches_tokens = [] 333 | 334 | batches_entity_tags = [] 335 | batches_entity_tags_ids = [] 336 | batches_BIO=[] 337 | batches_tokenIds = [] 338 | batches_doc_ids = [] 339 | 340 | docs_batch_embeddingIds = [] # e.g., 587 max doc length - complete with -1 when the size of the doc is smaller 341 | docs_batch_charIds = [] # e.g., 587 max doc length - complete with -1 when the size of the doc is smaller 342 | docs_batch_scoringMatrixHeadIds = [] 343 | docs_batch_scoringMatrix = [] 344 | 345 | docs_batch_entity_tags=[] 346 | docs_batch_entity_tags_ids = [] 347 | 348 | docs_batch_tokens = [] 349 | 350 | docs_batch_BIO = [] 351 | docs_batch_tokenIds = [] 352 | docs_batch_doc_ids = [] 353 | 354 | maxScoringMatrixLenList0 = [] 355 | maxScoringMatrixLen0 = -1 356 | maxScoringMatrixLenList1 = [] 357 | maxScoringMatrixLen1 = -1 358 | 359 | maxDocLenList = [] 360 | maxSentenceLen = -1 361 | 362 | maxWordLenList = [] 363 | maxWordLen = -1 364 | 365 | wordLenList = [] 366 | wordLens = [] 367 | 368 | lenBatchesDoc = [] 369 | lenEmbeddingssDoc = [] 370 | 371 | lenBatchesChars = [] 372 | lenCharsDoc = [] 373 | 374 | sumLen = 0 375 | for docIdx in range(len(data_copy.data)): 376 | doc = data_copy.data[docIdx] 377 | # print (doc) 378 | if docIdx % config.batchsize == 0 and docIdx > 0: 379 | # print (docIdx) 380 | # print ("new batch") 381 | batches_embeddingIds.append(docs_batch_embeddingIds) 382 | batches_charIds.append(docs_batch_charIds) 383 | 384 | batches_scoringMatrixHeadIds.append(docs_batch_scoringMatrixHeadIds) 385 | batches_scoringMatrix.append(docs_batch_scoringMatrix) 386 | batches_entity_tags.append(docs_batch_entity_tags) 387 | batches_entity_tags_ids.append(docs_batch_entity_tags_ids) 388 | 389 | 
batches_tokens.append(docs_batch_tokens) 390 | 391 | batches_BIO.append(docs_batch_BIO) 392 | batches_tokenIds.append(docs_batch_tokenIds) 393 | batches_doc_ids.append(docs_batch_doc_ids) 394 | 395 | docs_batch_embeddingIds = [] # e.g., 587 max doc length - complete with -1 when the size of the doc is smaller 396 | docs_batch_charIds = [] # e.g., 587 max doc length - complete with -1 when the size of the doc is smaller 397 | docs_batch_scoringMatrixHeadIds = [] 398 | docs_batch_scoringMatrix = [] 399 | 400 | docs_batch_tokens = [] 401 | 402 | docs_batch_entity_tags = [] 403 | docs_batch_entity_tags_ids = [] 404 | docs_batch_BIO = [] 405 | docs_batch_tokenIds = [] 406 | docs_batch_doc_ids = [] 407 | 408 | maxDocLenList.append(maxSentenceLen) 409 | maxSentenceLen = -1 410 | 411 | maxScoringMatrixLenList0.append(maxScoringMatrixLen0) 412 | maxScoringMatrixLen0 = -1 413 | 414 | maxScoringMatrixLenList1.append(maxScoringMatrixLen1) 415 | maxScoringMatrixLen1 = -1 416 | 417 | maxWordLenList.append(maxWordLen) 418 | maxWordLen = -1 419 | 420 | wordLenList.append(wordLens) 421 | 422 | 423 | 424 | if len(doc.token_ids) > maxSentenceLen: 425 | maxSentenceLen = len(doc.token_ids) 426 | 427 | longest_token_list=max(doc.char_ids, key=len) 428 | if len(longest_token_list) > maxWordLen: 429 | maxWordLen = len(longest_token_list) 430 | 431 | wordLens=[len(token) for token in doc.char_ids] 432 | 433 | 434 | sumLen += len(doc.token_ids) 435 | docs_batch_embeddingIds.append(doc.embedding_ids) 436 | docs_batch_charIds.append(doc.char_ids) 437 | docs_batch_scoringMatrixHeadIds.append(doc.joint_ids) 438 | 439 | scoringMatrix = np.zeros((len(doc.joint_ids), len(doc.joint_ids) *len(config.dataset_set_relations) )) 440 | if scoringMatrix.shape[0] > maxScoringMatrixLen0: 441 | maxScoringMatrixLen0 = scoringMatrix.shape[0] 442 | if scoringMatrix.shape[1] > maxScoringMatrixLen1: 443 | maxScoringMatrixLen1 = scoringMatrix.shape[1] 444 | 445 | 446 | for tokenIdx in range(len(doc.joint_ids)): 447 | tokenHeads = doc.joint_ids[tokenIdx] 448 | for head in tokenHeads: 449 | # print (str(tokenIdx)+ " "+ str(head)) 450 | scoringMatrix[tokenIdx, head] = 1 451 | 452 | docs_batch_scoringMatrix.append(scoringMatrix.tolist()) 453 | # print (scoringMatrix) 454 | 455 | #print (doc.jlabel_names) 456 | if config.ner_classes=="BIO": 457 | docs_batch_entity_tags.append(doc.BIOs)##to do 458 | docs_batch_entity_tags_ids.append(doc.BIO_ids) 459 | 460 | elif config.ner_classes=="EC": 461 | docs_batch_entity_tags.append(doc.ecs)##to do 462 | docs_batch_entity_tags_ids.append(doc.ec_ids) 463 | 464 | docs_batch_tokens.append(doc.tokens) 465 | 466 | docs_batch_BIO.append(doc.BIOs)##to do 467 | docs_batch_tokenIds.append(doc.token_ids) 468 | docs_batch_doc_ids.append(doc.docId) 469 | if docIdx == len( 470 | data_copy.data) - 1: ## if there are no documents left - append the batch - usually it is shorter batch 471 | batches_embeddingIds.append(docs_batch_embeddingIds) 472 | batches_charIds.append(docs_batch_charIds) 473 | batches_scoringMatrixHeadIds.append(docs_batch_scoringMatrixHeadIds) 474 | batches_scoringMatrix.append(docs_batch_scoringMatrix) 475 | 476 | batches_entity_tags.append(docs_batch_entity_tags) 477 | batches_entity_tags_ids.append(docs_batch_entity_tags_ids) 478 | batches_tokens.append(docs_batch_tokens) 479 | 480 | batches_BIO.append(docs_batch_BIO) 481 | batches_tokenIds.append(docs_batch_tokenIds) 482 | batches_doc_ids.append(docs_batch_doc_ids) 483 | maxDocLenList.append(maxSentenceLen) 484 | 
maxScoringMatrixLenList0.append(maxScoringMatrixLen0) 485 | maxScoringMatrixLenList1.append(maxScoringMatrixLen1) 486 | maxWordLenList.append(maxWordLen) 487 | wordLenList.append(wordLens) 488 | # maxDocLen.append(maxWordLen) 489 | 490 | # 按最长维度填充 491 | for bIdx in range(len(batches_embeddingIds)): 492 | 493 | batch_embeddingIds = batches_embeddingIds[bIdx] 494 | batch_charIds = batches_charIds[bIdx] 495 | batch_scoringMatrixHeadIds = batches_scoringMatrixHeadIds[bIdx] 496 | batch_scoringMatrix = batches_scoringMatrix[bIdx] 497 | 498 | batch_entity_tags = batches_entity_tags[bIdx] 499 | batch_entity_tags_ids = batches_entity_tags_ids[bIdx] 500 | batch_tokens = batches_tokens[bIdx] 501 | 502 | batch_tokenIds = batches_tokenIds[bIdx] 503 | 504 | for dIdx in range(len(batch_embeddingIds)): 505 | embeddingId_doc = batch_embeddingIds[dIdx] 506 | charIds_doc = batch_charIds[dIdx] 507 | scoringMatrixHeadId_doc = batch_scoringMatrixHeadIds[dIdx] 508 | scoringMatrix_doc = batch_scoringMatrix[dIdx] 509 | 510 | ner_doc=batch_entity_tags[dIdx] 511 | ner_doc_ids=batch_entity_tags_ids[dIdx] 512 | token_doc = batch_tokens[dIdx] 513 | 514 | token_id_doc = batch_tokenIds[dIdx] 515 | 516 | lenEmbeddingssDoc.append(len(embeddingId_doc)) 517 | tokensLen=[len(token) for token in charIds_doc]#每个句子中单词长度的列表 518 | lenCharsDoc.append(tokensLen) 519 | 520 | 521 | for tokenIdx in range(len(tokensLen)): 522 | tokenLen=tokensLen[tokenIdx] 523 | 524 | if tokenLen= ner[1] and rel[0] <= ner[2]: 94 | # print (ner) 95 | if relationTuple == "boundaries_type": 96 | left_chunk = ner 97 | elif relationTuple == "boundaries": 98 | left_chunk = (ner[1], ner[2]) 99 | elif relationTuple == "type": 100 | left_chunk = (ner[0]) 101 | if rel[2] >= ner[1] and rel[2] <= ner[2]: 102 | # print (ner) 103 | if relationTuple == "boundaries_type": 104 | right_chunk = ner 105 | elif relationTuple == "boundaries": 106 | right_chunk = (ner[1], ner[2]) 107 | elif relationTuple == "type": 108 | right_chunk = (ner[0]) 109 | if (left_chunk != "" and right_chunk != ""): 110 | relationChunks.append((left_chunk, relation, right_chunk)) 111 | return relationChunks 112 | 113 | def getTokenRelations(label_names, head_ids, token_ids): 114 | relations = [] 115 | for labelLIdx in range(len(label_names)): 116 | # print (predLabel) 117 | labelL = label_names[labelLIdx] 118 | headL = head_ids[labelLIdx] 119 | tokenId = token_ids[labelLIdx] 120 | for labelIdx in range(len(labelL)): 121 | 122 | label = labelL[labelIdx] 123 | head = headL[labelIdx] 124 | # print (label) 125 | # print ((tokenId)+" "+ label+ " " + head) 126 | if label != "N": 127 | # print (label) 128 | relations.append((tokenId, label, head)) 129 | # print (tokenId,label,head) 130 | return relations 131 | 132 | 133 | def keepOnlyChunkBoundaries(ners): 134 | nersNoBounds = [] 135 | ners = list(ners) 136 | for ner in ners: 137 | # ner[0]=None 138 | # print (ner) 139 | nersNoBounds.append((None, ner[1], ner[2])) 140 | return nersNoBounds 141 | 142 | class chunkEvaluator: 143 | def __init__(self,config,ner_chunk_eval="boundaries_type",rel_chunk_eval="boundaries"): 144 | self.nerSegmentationTags=config.dataset_set_bio_tags 145 | 146 | self.NERset = config.dataset_set_ec_tags 147 | self.RELset = config.dataset_set_relations 148 | 149 | self.root_node=config.root_node 150 | 151 | 152 | self.ner_chunk_eval=ner_chunk_eval 153 | self.rel_chunk_eval=rel_chunk_eval 154 | 155 | 156 | self.totals = 0 157 | self.oks = 0 158 | 159 | self.tpsNER = 0 160 | self.fpsNER = 0 161 | self.fnsNER = 0 162 | 163 | self.tpsREL 
= 0 164 | self.fpsREL = 0 165 | self.fnsREL = 0 166 | 167 | self.tpsClassesNER = dict.fromkeys(self.NERset, 0) 168 | self.fpsClassesNER = dict.fromkeys(self.NERset, 0) 169 | self.fnsClassesNER = dict.fromkeys(self.NERset, 0) 170 | self.precisionNER = dict.fromkeys(self.NERset, 0) 171 | self.recallNER = dict.fromkeys(self.NERset, 0) 172 | self.f1NER = dict.fromkeys(self.NERset, 0) 173 | 174 | self.tpsClassesREL = dict.fromkeys(self.RELset, 0) 175 | self.fpsClassesREL = dict.fromkeys(self.RELset, 0) 176 | self.fnsClassesREL = dict.fromkeys(self.RELset, 0) 177 | 178 | self.precisionREL = dict.fromkeys(self.RELset, 0) 179 | self.recallREL = dict.fromkeys(self.RELset, 0) 180 | self.f1REL = dict.fromkeys(self.RELset, 0) 181 | 182 | self.correct_predsNER, self.total_correctNER, self.total_predsNER = 0., 0., 0. 183 | self.correct_predsREL, self.total_correctREL, self.total_predsREL = 0., 0., 0. 184 | 185 | def add(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL): 186 | 187 | 188 | 189 | for batch_idx in range(len(pred_batchesNER)): 190 | predNER = pred_batchesNER[batch_idx] 191 | trueNER = true_batchesNER[batch_idx] 192 | 193 | predRel = pred_batchesREL[batch_idx] 194 | trueRel = true_batchesREL[batch_idx] 195 | 196 | ptoken_ids, _, plabel_ids, phead_ids, plabel_names = data_utils.transformToInitialInput( 197 | predRel, self.RELset) 198 | 199 | _, _, tlabel_ids, thead_ids, tlabel_names = data_utils.transformToInitialInput( 200 | trueRel, self.RELset) 201 | 202 | trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids) 203 | 204 | predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids) 205 | 206 | tagsNER = data_utils.getSegmentationDict(self.nerSegmentationTags)#self. 207 | 208 | if self.ner_chunk_eval == "boundaries_type": 209 | 210 | lab_chunks = set(get_chunks(trueNER, tagsNER)) 211 | lab_pred_chunks = set(get_chunks(predNER, tagsNER)) 212 | 213 | elif self.ner_chunk_eval == "boundaries": 214 | 215 | lab_chunks = set(keepOnlyChunkBoundaries(set(get_chunks(trueNER, tagsNER)))) 216 | lab_pred_chunks = set(keepOnlyChunkBoundaries(set(get_chunks(predNER, tagsNER)))) 217 | 218 | 219 | 220 | 221 | lab_chunks_list = list(lab_chunks) 222 | lab_pred_chunks_list = list(lab_pred_chunks) 223 | 224 | 225 | if self.ner_chunk_eval == "boundaries_type": 226 | for lab_idx in range(len(lab_pred_chunks_list)): 227 | 228 | if lab_pred_chunks_list[lab_idx] in lab_chunks_list: 229 | # print (lab_pred_chunks_list[lab_idx][0]) 230 | self.tpsClassesNER[lab_pred_chunks_list[lab_idx][0]] += 1 231 | else: 232 | self.fpsClassesNER[lab_pred_chunks_list[lab_idx][0]] += 1 233 | # fnsEntitiesNER+=1 234 | 235 | for lab_idx in range(len(lab_chunks_list)): 236 | 237 | if lab_chunks_list[lab_idx] not in lab_pred_chunks_list: 238 | self.fnsClassesNER[lab_chunks_list[lab_idx][0]] += 1 239 | 240 | elif self.ner_chunk_eval == "boundaries": 241 | for lab_idx in range(len(lab_pred_chunks_list)): 242 | 243 | if lab_pred_chunks_list[lab_idx] in lab_chunks_list: 244 | # print (lab_pred_chunks_list[lab_idx][0]) 245 | self.tpsNER += 1 246 | else: 247 | self.fpsNER += 1 248 | # fnsEntitiesNER+=1 249 | 250 | for lab_idx in range(len(lab_chunks_list)): 251 | 252 | if lab_chunks_list[lab_idx] not in lab_pred_chunks_list: 253 | self.fnsNER += 1 254 | 255 | if self.root_node==True: 256 | lab_chunks_list_with_ROOT = copy.deepcopy(lab_chunks_list) 257 | lab_chunks_list_with_ROOT.append((None,0, 0)) 258 | 259 | lab_pred_chunks_list_with_ROOT = copy.deepcopy(lab_pred_chunks_list) 260 | 
lab_pred_chunks_list_with_ROOT.append((None,0, 0)) 261 | 262 | relTrue = set(relationChunks(trueRel, lab_chunks_list_with_ROOT, relationTuple=self.rel_chunk_eval)) 263 | 264 | relPred = set(relationChunks(predRel, lab_pred_chunks_list_with_ROOT, relationTuple=self.rel_chunk_eval)) 265 | 266 | else: 267 | relTrue = set(relationChunks(trueRel, lab_chunks_list,relationTuple=self.rel_chunk_eval)) 268 | 269 | relPred = set(relationChunks(predRel, lab_pred_chunks_list,relationTuple=self.rel_chunk_eval)) 270 | 271 | relTrueList = list(relTrue) # trueRel# 272 | 273 | # if (len(trueRel)!=len(relTrueList)): 274 | # print ("warning") 275 | 276 | relPredList = list(relPred) # predRel# 277 | 278 | for lab_idx in range(len(relPredList)): 279 | 280 | if relPredList[lab_idx] in relTrueList: 281 | # print (lab_pred_chunks_list[lab_idx][0]) 282 | self.tpsClassesREL[relPredList[lab_idx][1]] += 1 283 | # print (relPredList[lab_idx]) 284 | else: 285 | self.fpsClassesREL[relPredList[lab_idx][1]] += 1 286 | # fnsEntitiesNER+=1 287 | 288 | for lab_idx in range(len(relTrueList)): 289 | 290 | if relTrueList[lab_idx] not in relPredList: 291 | self.fnsClassesREL[relTrueList[lab_idx][1]] += 1 292 | 293 | self.correct_predsNER += len(lab_chunks & lab_pred_chunks) 294 | self.total_predsNER += len(lab_pred_chunks) 295 | self.total_correctNER += len(lab_chunks) 296 | 297 | self.correct_predsREL += len(relTrue & relPred) 298 | self.total_predsREL += len(relPred) 299 | self.total_correctREL += len(relTrue) 300 | 301 | 302 | 303 | def getResultsNER(self): 304 | p = self.correct_predsNER / self.total_predsNER if self.correct_predsNER > 0 else 0 305 | r = self.correct_predsNER / self.total_correctNER if self.correct_predsNER > 0 else 0 306 | f1 = 2 * p * r / (p + r) if self.correct_predsNER > 0 else 0 307 | 308 | print(self.correct_predsNER) 309 | print(self.total_predsNER) 310 | print(self.total_correctNER) 311 | 312 | print(f1) 313 | return f1 314 | 315 | def getResultsREL(self): 316 | p = self.correct_predsREL / self.total_predsREL if self.correct_predsREL > 0 else 0 317 | r = self.correct_predsREL / self.total_correctREL if self.correct_predsREL > 0 else 0 318 | f1 = 2 * p * r / (p + r) if self.correct_predsREL > 0 else 0 319 | 320 | print(self.correct_predsREL) 321 | print(self.total_predsREL) 322 | print(self.total_correctREL) 323 | 324 | print(f1) 325 | return f1 326 | 327 | def getPrecision(self, tps, fps): 328 | if tps == 0: 329 | return 0 330 | else: 331 | return tps / (tps + fps) 332 | 333 | def getRecall(self, tps, fns): 334 | if tps == 0: 335 | return 0 336 | else: 337 | return tps / (tps + fns) 338 | 339 | def getF1(self, tps, fps, fns): 340 | if tps == 0: 341 | return 0 342 | else: 343 | return 2 * self.getPrecision(tps, fps) * self.getRecall(tps, fns) / ( 344 | self.getPrecision(tps, fps) + self.getRecall(tps, fns)) 345 | 346 | 347 | def getChunkedOverallAvgF1(self): 348 | 349 | 350 | return (self.getChunkedNERF1()+self.getChunkedRELF1())/2 351 | 352 | def getChunkedOverallF1(self): 353 | tpsNER=0 354 | fnsNER=0 355 | fpsNER=0 356 | tpsREL=0 357 | fnsREL=0 358 | fpsREL=0 359 | if self.ner_chunk_eval == "boundaries_type": 360 | for label in self.NERset: 361 | # if label != "O" : 362 | tpsNER += self.tpsClassesNER[label] 363 | 364 | fnsNER += self.fnsClassesNER[label] 365 | fpsNER += self.fpsClassesNER[label] 366 | elif self.ner_chunk_eval == "boundaries": 367 | tpsNER=self.tpsNER 368 | fnsNER = self.fnsNER 369 | fpsNER = self.fpsNER 370 | 371 | 372 | for label in self.RELset: 373 | 374 | if label != "N": 375 | 
tpsREL += self.tpsClassesREL[label] 376 | 377 | fnsREL += self.fnsClassesREL[label] 378 | fpsREL += self.fpsClassesREL[label] 379 | 380 | 381 | 382 | return self.getF1(tpsNER+tpsREL, fpsNER+fpsREL, fnsNER+fnsREL) 383 | 384 | 385 | def getOverallF1(self): 386 | tpsNER=0 387 | fnsNER=0 388 | fpsNER=0 389 | tpsREL=0 390 | fnsREL=0 391 | fpsREL=0 392 | 393 | for label in self.NERset: 394 | # if label != "O" : 395 | tpsNER += self.tpsClassesNER[label] 396 | 397 | fnsNER += self.fnsClassesNER[label] 398 | fpsNER += self.fpsClassesNER[label] 399 | 400 | for label in self.RELset: 401 | 402 | if label != "N": 403 | tpsREL += self.tpsClassesREL[label] 404 | 405 | fnsREL += self.fnsClassesREL[label] 406 | fpsREL += self.fpsClassesREL[label] 407 | 408 | 409 | 410 | return self.getF1(tpsNER+tpsREL, fpsNER+fpsREL, fnsNER+fnsREL) 411 | 412 | def getChunkedRELF1(self): 413 | 414 | tpsREL=0 415 | fnsREL=0 416 | fpsREL=0 417 | 418 | 419 | 420 | for label in self.RELset: 421 | 422 | if label != "N": 423 | tpsREL += self.tpsClassesREL[label] 424 | 425 | fnsREL += self.fnsClassesREL[label] 426 | fpsREL += self.fpsClassesREL[label] 427 | 428 | 429 | 430 | return self.getF1(tpsREL, fpsREL, fnsREL) 431 | 432 | def getChunkedNERF1(self): 433 | tpsNER = 0 434 | fnsNER = 0 435 | fpsNER = 0 436 | if self.ner_chunk_eval == "boundaries_type": 437 | 438 | 439 | for label in self.NERset: 440 | # if label != "O" : 441 | tpsNER += self.tpsClassesNER[label] 442 | 443 | fnsNER += self.fnsClassesNER[label] 444 | fpsNER += self.fpsClassesNER[label] 445 | 446 | 447 | elif self.ner_chunk_eval== "boundaries": 448 | tpsNER =self.tpsNER 449 | fnsNER = self.fnsNER 450 | fpsNER = self.fpsNER 451 | 452 | return self.getF1(tpsNER, fpsNER, fnsNER) 453 | def getAccuracy(self): 454 | return self.oks / self.totals 455 | 456 | def printInfo(self): 457 | 458 | printer = printClasses() 459 | 460 | if self.ner_chunk_eval== "boundaries_type": 461 | for label in self.NERset: 462 | # if label != "O" : 463 | self.tpsNER += self.tpsClassesNER[label] 464 | 465 | self.fnsNER += self.fnsClassesNER[label] 466 | self.fpsNER += self.fpsClassesNER[label] 467 | 468 | printer.add(label, self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label], 469 | self.getPrecision(self.tpsClassesNER[label], self.fpsClassesNER[label]), 470 | self.getRecall(self.tpsClassesNER[label], self.fnsClassesNER[label]), 471 | self.getF1(self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label])) 472 | 473 | 474 | 475 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 476 | printer.add("-", "-", "-", "-", 477 | "-", "-", 478 | "-") 479 | printer.add("Micro NER chunk", self.tpsNER, self.fpsNER, self.fnsNER, 480 | self.getPrecision(self.tpsNER, self.fpsNER), self.getRecall(self.tpsNER, self.fnsNER), 481 | self.getF1(self.tpsNER, self.fpsNER, self.fnsNER)) 482 | 483 | elif self.ner_chunk_eval== "boundaries": 484 | printer.add("Micro NER chunk boundaries", self.tpsNER, self.fpsNER, self.fnsNER, 485 | self.getPrecision(self.tpsNER, self.fpsNER), self.getRecall(self.tpsNER, self.fnsNER), 486 | self.getF1(self.tpsNER, self.fpsNER, self.fnsNER)) 487 | printer.print() 488 | 489 | printer = printClasses() 490 | for label in self.RELset: 491 | 492 | if label != "N": 493 | self.tpsREL += self.tpsClassesREL[label] 494 | 495 | self.fnsREL += self.fnsClassesREL[label] 
496 | self.fpsREL += self.fpsClassesREL[label] 497 | 498 | printer.add(label, self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label], 499 | self.getPrecision(self.tpsClassesREL[label], self.fpsClassesREL[label]), 500 | self.getRecall(self.tpsClassesREL[label], self.fnsClassesREL[label]), 501 | self.getF1(self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label])) 502 | 503 | 504 | 505 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 506 | printer.add("-", "-", "-", "-", 507 | "-", "-", 508 | "-") 509 | printer.add("Micro REL chunk", self.tpsREL, self.fpsREL, self.fnsREL, 510 | self.getPrecision(self.tpsREL, self.fpsREL), self.getRecall(self.tpsREL, self.fnsREL), 511 | self.getF1(self.tpsREL, self.fpsREL, self.fnsREL)) 512 | 513 | printer.print() 514 | 515 | 516 | def getMaxOccurence(lst): 517 | from collections import Counter 518 | most_common, num_most_common = Counter(lst).most_common(1)[0] # 4, 6 times 519 | return most_common 520 | 521 | 522 | def classesToChunks(tokenClasses, chunks): 523 | labeled_chunks = [] 524 | for chunk in chunks: 525 | 526 | class_list = (tokenClasses[chunk[1]:chunk[2] + 1]) 527 | 528 | if chunk[0] in class_list: 529 | labeled_chunks.append((chunk[0], chunk[1], chunk[2])) 530 | else: 531 | labeled_chunks.append((getMaxOccurence(class_list), chunk[1], chunk[2])) 532 | # print (class_list) 533 | return labeled_chunks 534 | 535 | 536 | def listOfTagsToids(lstTags,tags): 537 | lstids = [] 538 | for ner in lstTags: 539 | lstids.append(tags.index(ner)) 540 | 541 | return lstids 542 | 543 | def listOfIdsToTags(lst_ids,tags): 544 | lstTags= [] 545 | for nerId in lst_ids: 546 | lstTags.append(tags[nerId]) 547 | return lstTags 548 | 549 | class relaxedChunkEvaluator: 550 | def __init__(self,dataset_params,rel_chunk_eval="boundaries"): 551 | self.nerSegmentationTags=dataset_params.dataset_set_bio_tags 552 | 553 | self.NERset = dataset_params.dataset_set_ec_tags#utils.getNerSetACE04() 554 | self.RELset = dataset_params.dataset_set_relations#reutils.getRelSetACE04() 555 | #self.nerDict=dataset_params 556 | # print (self.NERset) 557 | self.rel_chunk_eval=rel_chunk_eval 558 | self.totals = 0 559 | self.oks = 0 560 | 561 | self.tpsNER = 0 562 | self.fpsNER = 0 563 | self.fnsNER = 0 564 | 565 | self.tpsREL = 0 566 | self.fpsREL = 0 567 | self.fnsREL = 0 568 | 569 | self.tpsNERMacro = 0 570 | self.fpsNERMacro = 0 571 | self.fnsNERMacro = 0 572 | 573 | self.tpsNERMacro_no_other = 0 574 | self.fpsNERMacro_no_other = 0 575 | self.fnsNERMacro_no_other = 0 576 | 577 | self.tpsRELMacro = 0 578 | self.fpsRELMacro = 0 579 | self.fnsRELMacro = 0 580 | 581 | 582 | self.NERF1Macro=0 583 | self.NERF1Macro_no_other = 0 584 | self.RELF1Macro = 0 585 | self.OverallF1Macro = 0 586 | self.OverallF1Macro_no_other = 0 587 | 588 | 589 | self.tpsClassesNER = dict.fromkeys(self.NERset, 0) 590 | self.fpsClassesNER = dict.fromkeys(self.NERset, 0) 591 | self.fnsClassesNER = dict.fromkeys(self.NERset, 0) 592 | self.precisionNER = dict.fromkeys(self.NERset, 0) 593 | self.recallNER = dict.fromkeys(self.NERset, 0) 594 | self.f1NER = dict.fromkeys(self.NERset, 0) 595 | 596 | self.tpsClassesREL = dict.fromkeys(self.RELset, 0) 597 | self.fpsClassesREL = dict.fromkeys(self.RELset, 0) 598 | self.fnsClassesREL = dict.fromkeys(self.RELset, 0) 599 | 600 | self.precisionREL = 
dict.fromkeys(self.RELset, 0) 601 | self.recallREL = dict.fromkeys(self.RELset, 0) 602 | self.f1REL = dict.fromkeys(self.RELset, 0) 603 | 604 | self.correct_predsNER, self.total_correctNER, self.total_predsNER = 0., 0., 0. 605 | self.correct_predsREL, self.total_correctREL, self.total_predsREL = 0., 0., 0. 606 | 607 | def add(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL,true_batchesBIONER): 608 | 609 | 610 | 611 | for batch_idx in range(len(pred_batchesNER)): 612 | predNER = pred_batchesNER[batch_idx] 613 | trueNER = true_batchesNER[batch_idx] 614 | 615 | predRel = pred_batchesREL[batch_idx] 616 | trueRel = true_batchesREL[batch_idx] 617 | 618 | trueBIONER=true_batchesBIONER[batch_idx] 619 | 620 | 621 | ptoken_ids, _, plabel_ids, phead_ids, plabel_names = data_utils.transformToInitialInput( 622 | predRel, self.RELset) 623 | 624 | _, _, tlabel_ids, thead_ids, tlabel_names = data_utils.transformToInitialInput( 625 | trueRel, self.RELset) 626 | 627 | trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids) 628 | 629 | predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids) 630 | 631 | 632 | #print (self.NERset) 633 | tagsNER = data_utils.getSegmentationDict(self.nerSegmentationTags)#self. 634 | 635 | 636 | 637 | lab_chunks_ = set(get_chunks(listOfTagsToids(trueBIONER,self.nerSegmentationTags), tagsNER)) 638 | #lab_pred_chunks = set(get_chunks(predNER, tagsNER)) 639 | 640 | lab_chunks_list_ = list(lab_chunks_) 641 | 642 | 643 | trueNER_tags=listOfIdsToTags(trueNER,self.NERset) 644 | predNER_tags=listOfIdsToTags(predNER, self.NERset) 645 | 646 | lab_chunks = set(classesToChunks(trueNER_tags, lab_chunks_list_)) 647 | lab_pred_chunks=set(classesToChunks(predNER_tags, lab_chunks_list_)) 648 | 649 | lab_chunks_list = list(lab_chunks) 650 | lab_pred_chunks_list = list(lab_pred_chunks) 651 | 652 | 653 | for lab_idx in range(len(lab_pred_chunks_list)): 654 | 655 | if lab_pred_chunks_list[lab_idx] in lab_chunks_list: 656 | # print (lab_pred_chunks_list[lab_idx][0]) 657 | self.tpsClassesNER[lab_pred_chunks_list[lab_idx][0]] += 1 658 | else: 659 | self.fpsClassesNER[lab_pred_chunks_list[lab_idx][0]] += 1 660 | # fnsEntitiesNER+=1 661 | 662 | for lab_idx in range(len(lab_chunks_list)): 663 | 664 | if lab_chunks_list[lab_idx] not in lab_pred_chunks_list: 665 | self.fnsClassesNER[lab_chunks_list[lab_idx][0]] += 1 666 | 667 | relTrue = set(relationChunks(trueRel, lab_chunks_list,relationTuple=self.rel_chunk_eval)) 668 | 669 | relPred = set(relationChunks(predRel, lab_pred_chunks_list,relationTuple=self.rel_chunk_eval)) 670 | 671 | relTrueList = list(relTrue) # trueRel# 672 | 673 | # if (len(trueRel)!=len(relTrueList)): 674 | # print ("warning") 675 | 676 | relPredList = list(relPred) # predRel# 677 | 678 | #print("GOLD REL chunks:" + str(relTrueList)) 679 | 680 | #print("PRED REL chunks:" + str(relPredList)) 681 | 682 | for lab_idx in range(len(relPredList)): 683 | 684 | if relPredList[lab_idx] in relTrueList: 685 | # print (lab_pred_chunks_list[lab_idx][0]) 686 | self.tpsClassesREL[relPredList[lab_idx][1]] += 1 687 | # print (relPredList[lab_idx]) 688 | else: 689 | self.fpsClassesREL[relPredList[lab_idx][1]] += 1 690 | # fnsEntitiesNER+=1 691 | 692 | for lab_idx in range(len(relTrueList)): 693 | 694 | if relTrueList[lab_idx] not in relPredList: 695 | self.fnsClassesREL[relTrueList[lab_idx][1]] += 1 696 | 697 | self.correct_predsNER += len(lab_chunks & lab_pred_chunks) 698 | self.total_predsNER += len(lab_pred_chunks) 699 | self.total_correctNER += len(lab_chunks) 
700 | 701 | self.correct_predsREL += len(relTrue & relPred) 702 | self.total_predsREL += len(relPred) 703 | self.total_correctREL += len(relTrue) 704 | 705 | 706 | 707 | def getResultsNER(self): 708 | p = self.correct_predsNER / self.total_predsNER if self.correct_predsNER > 0 else 0 709 | r = self.correct_predsNER / self.total_correctNER if self.correct_predsNER > 0 else 0 710 | f1 = 2 * p * r / (p + r) if self.correct_predsNER > 0 else 0 711 | 712 | print(self.correct_predsNER) 713 | print(self.total_predsNER) 714 | print(self.total_correctNER) 715 | 716 | print(f1) 717 | return f1 718 | 719 | def getResultsREL(self): 720 | p = self.correct_predsREL / self.total_predsREL if self.correct_predsREL > 0 else 0 721 | r = self.correct_predsREL / self.total_correctREL if self.correct_predsREL > 0 else 0 722 | f1 = 2 * p * r / (p + r) if self.correct_predsREL > 0 else 0 723 | 724 | print(self.correct_predsREL) 725 | print(self.total_predsREL) 726 | print(self.total_correctREL) 727 | 728 | print(f1) 729 | return f1 730 | 731 | def getPrecision(self, tps, fps): 732 | if tps == 0: 733 | return 0 734 | else: 735 | return tps / (tps + fps) 736 | 737 | def getRecall(self, tps, fns): 738 | if tps == 0: 739 | return 0 740 | else: 741 | return tps / (tps + fns) 742 | 743 | def getF1(self, tps, fps, fns): 744 | if tps == 0: 745 | return 0 746 | else: 747 | return 2 * self.getPrecision(tps, fps) * self.getRecall(tps, fns) / ( 748 | self.getPrecision(tps, fps) + self.getRecall(tps, fns)) 749 | 750 | def getChunkedOverallF1(self): 751 | tpsNER=0 752 | fnsNER=0 753 | fpsNER=0 754 | tpsREL=0 755 | fnsREL=0 756 | fpsREL=0 757 | 758 | for label in self.NERset: 759 | # if label != "O" : 760 | tpsNER += self.tpsClassesNER[label] 761 | 762 | fnsNER += self.fnsClassesNER[label] 763 | fpsNER += self.fpsClassesNER[label] 764 | 765 | for label in self.RELset: 766 | 767 | if label != "N": 768 | tpsREL += self.tpsClassesREL[label] 769 | 770 | fnsREL += self.fnsClassesREL[label] 771 | fpsREL += self.fpsClassesREL[label] 772 | 773 | 774 | 775 | return self.getF1(tpsNER+tpsREL, fpsNER+fpsREL, fnsNER+fnsREL) 776 | 777 | 778 | def getOverallF1(self): 779 | tpsNER=0 780 | fnsNER=0 781 | fpsNER=0 782 | tpsREL=0 783 | fnsREL=0 784 | fpsREL=0 785 | 786 | for label in self.NERset: 787 | # if label != "O" : 788 | tpsNER += self.tpsClassesNER[label] 789 | 790 | fnsNER += self.fnsClassesNER[label] 791 | fpsNER += self.fpsClassesNER[label] 792 | 793 | for label in self.RELset: 794 | 795 | if label != "N": 796 | tpsREL += self.tpsClassesREL[label] 797 | 798 | fnsREL += self.fnsClassesREL[label] 799 | fpsREL += self.fpsClassesREL[label] 800 | 801 | 802 | 803 | return self.getF1(tpsNER+tpsREL, fpsNER+fpsREL, fnsNER+fnsREL) 804 | 805 | def getChunkedRELF1(self): 806 | 807 | tpsREL=0 808 | fnsREL=0 809 | fpsREL=0 810 | 811 | 812 | 813 | for label in self.RELset: 814 | 815 | if label != "N": 816 | tpsREL += self.tpsClassesREL[label] 817 | 818 | fnsREL += self.fnsClassesREL[label] 819 | fpsREL += self.fpsClassesREL[label] 820 | 821 | 822 | 823 | return self.getF1(tpsREL, fpsREL, fnsREL) 824 | 825 | def getChunkedNERF1(self): 826 | tpsNER=0 827 | fnsNER=0 828 | fpsNER=0 829 | 830 | 831 | for label in self.NERset: 832 | # if label != "O" : 833 | tpsNER += self.tpsClassesNER[label] 834 | 835 | fnsNER += self.fnsClassesNER[label] 836 | fpsNER += self.fpsClassesNER[label] 837 | 838 | return self.getF1(tpsNER, fpsNER, fnsNER) 839 | 840 | def getAccuracy(self): 841 | return self.oks / self.totals 842 | 843 | def getMacroF1scores(self): 844 | 845 | 
846 | return self.NERF1Macro,self.RELF1Macro,self.OverallF1Macro 847 | 848 | def getMacroF1scoresNoOtherClass(self): 849 | 850 | return self.NERF1Macro_no_other, self.RELF1Macro, self.OverallF1Macro_no_other 851 | 852 | 853 | def computeInfoMacro(self,printScores=True): 854 | 855 | printer = printClasses() 856 | 857 | 858 | averageNERF1_no_Other=0 859 | averageNERF1 = 0 860 | 861 | averageNERrecall_no_Other = 0 862 | averageNERrecall = 0 863 | 864 | averageNERprecision_no_Other = 0 865 | averageNERprecision = 0 866 | 867 | for label in self.NERset: 868 | if label != "O": 869 | self.tpsNERMacro += self.tpsClassesNER[label] 870 | 871 | self.fnsNERMacro += self.fnsClassesNER[label] 872 | self.fpsNERMacro += self.fpsClassesNER[label] 873 | 874 | f1_class=self.getF1(self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label]) 875 | precision_class=self.getPrecision(self.tpsClassesNER[label], self.fpsClassesNER[label]) 876 | recall_class=self.getRecall(self.tpsClassesNER[label], self.fnsClassesNER[label]) 877 | if label!= "O" : 878 | averageNERF1+=f1_class 879 | averageNERrecall += recall_class 880 | averageNERprecision += precision_class 881 | 882 | if label!= "O" and label!= "Other": 883 | averageNERF1_no_Other+=f1_class 884 | averageNERrecall_no_Other += recall_class 885 | averageNERprecision_no_Other += precision_class 886 | 887 | 888 | if label != "O" and label != "Other": 889 | self.tpsNERMacro_no_other += self.tpsClassesNER[label] 890 | 891 | self.fnsNERMacro_no_other += self.fnsClassesNER[label] 892 | self.fpsNERMacro_no_other += self.fpsClassesNER[label] 893 | 894 | 895 | printer.add(label, self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label], 896 | precision_class, 897 | recall_class, 898 | f1_class) 899 | 900 | 901 | 902 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 903 | printer.add("-", "-", "-", "-", 904 | "-", "-", 905 | "-") 906 | 907 | averageNERrecall = averageNERrecall / (len(self.NERset) - 1) 908 | averageNERprecision = averageNERprecision / (len(self.NERset) - 1) 909 | averageNERF1 = averageNERF1 / (len(self.NERset) - 1) 910 | 911 | 912 | if "other" in [x.lower() for x in self.NERset]: 913 | 914 | averageNERprecision_no_Other=averageNERprecision_no_Other / (len(self.NERset) -2) 915 | averageNERrecall_no_Other=averageNERrecall_no_Other / (len(self.NERset) -2) 916 | averageNERF1_no_Other=averageNERF1_no_Other / (len(self.NERset) -2) 917 | 918 | printer.add("Macro NER chunk RELAXED ^Other", self.tpsNERMacro_no_other, self.fpsNERMacro_no_other, self.fnsNERMacro_no_other, 919 | averageNERprecision_no_Other, averageNERrecall_no_Other, 920 | averageNERF1_no_Other) 921 | else: 922 | averageNERprecision_no_Other = averageNERprecision 923 | averageNERrecall_no_Other = averageNERrecall 924 | averageNERF1_no_Other = averageNERF1 925 | 926 | 927 | printer.add("Macro NER chunk RELAXED", self.tpsNERMacro, self.fpsNERMacro, self.fnsNERMacro, 928 | averageNERprecision, averageNERrecall, 929 | averageNERF1) 930 | if printScores ==True: 931 | 932 | printer.print() 933 | 934 | printer = printClasses() 935 | 936 | averageRELF1 = 0 937 | 938 | averageRELrecall = 0 939 | 940 | averageRELprecision = 0 941 | 942 | for label in self.RELset: 943 | 944 | if label != "N": 945 | self.tpsRELMacro += self.tpsClassesREL[label] 946 | 947 | self.fnsRELMacro += 
self.fnsClassesREL[label] 948 | self.fpsRELMacro += self.fpsClassesREL[label] 949 | 950 | f1_class = self.getF1(self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label]) 951 | precision_class = self.getPrecision(self.tpsClassesREL[label], self.fpsClassesREL[label]) 952 | recall_class = self.getRecall(self.tpsClassesREL[label], self.fnsClassesREL[label]) 953 | 954 | averageRELF1+=f1_class 955 | averageRELrecall += recall_class 956 | averageRELprecision += precision_class 957 | 958 | printer.add(label, self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label], 959 | precision_class, 960 | recall_class, 961 | f1_class) 962 | 963 | 964 | 965 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 966 | printer.add("-", "-", "-", "-", 967 | "-", "-", 968 | "-") 969 | 970 | 971 | averageRELrecall=averageRELrecall/(len(self.RELset) - 1) 972 | averageRELprecision=averageRELprecision/(len(self.RELset) - 1) 973 | averageRELF1 =averageRELF1 /(len(self.RELset) - 1) 974 | 975 | 976 | 977 | 978 | printer.add("Macro REL chunk RELAXED", self.tpsRELMacro, self.fpsRELMacro, self.fnsRELMacro, 979 | averageRELprecision, averageRELrecall, 980 | averageRELF1) 981 | 982 | if printScores == True: 983 | printer.print() 984 | 985 | over_avg_f1 = (averageNERF1 + averageRELF1) / 2 986 | over_avg_f1_no_other = (averageNERF1_no_Other + averageRELF1) / 2 987 | 988 | t = PrettyTable(['Type','NER_F1', 'REL_F1', 'AVG_F1']) 989 | 990 | t.add_row(['Overall', averageNERF1, averageRELF1, over_avg_f1]) 991 | if "other" in [x.lower() for x in self.NERset]: 992 | t.add_row(['Overall ^Other', averageNERF1_no_Other, averageRELF1, over_avg_f1_no_other]) 993 | 994 | if printScores == True: 995 | print (t) 996 | 997 | self.NERF1Macro = averageNERF1 998 | self.NERF1Macro_no_other = averageNERF1_no_Other 999 | self.RELF1Macro = averageRELF1 1000 | self.OverallF1Macro = over_avg_f1 1001 | self.OverallF1Macro_no_other = over_avg_f1_no_other 1002 | 1003 | 1004 | def printInfoMicro(self): 1005 | 1006 | printer = printClasses() 1007 | 1008 | for label in self.NERset: 1009 | # if label != "O" : 1010 | self.tpsNER += self.tpsClassesNER[label] 1011 | 1012 | self.fnsNER += self.fnsClassesNER[label] 1013 | self.fpsNER += self.fpsClassesNER[label] 1014 | 1015 | printer.add(label, self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label], 1016 | self.getPrecision(self.tpsClassesNER[label], self.fpsClassesNER[label]), 1017 | self.getRecall(self.tpsClassesNER[label], self.fnsClassesNER[label]), 1018 | self.getF1(self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label])) 1019 | 1020 | 1021 | 1022 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 1023 | printer.add("-", "-", "-", "-", 1024 | "-", "-", 1025 | "-") 1026 | printer.add("Micro NER chunk RELAXED", self.tpsNER, self.fpsNER, self.fnsNER, 1027 | self.getPrecision(self.tpsNER, self.fpsNER), self.getRecall(self.tpsNER, self.fnsNER), 1028 | self.getF1(self.tpsNER, self.fpsNER, self.fnsNER)) 1029 | 1030 | printer.print() 1031 | 1032 | printer = printClasses() 1033 | for label in self.RELset: 1034 | 1035 | if label != "N": 1036 | self.tpsREL += 
self.tpsClassesREL[label] 1037 | 1038 | self.fnsREL += self.fnsClassesREL[label] 1039 | self.fpsREL += self.fpsClassesREL[label] 1040 | 1041 | printer.add(label, self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label], 1042 | self.getPrecision(self.tpsClassesREL[label], self.fpsClassesREL[label]), 1043 | self.getRecall(self.tpsClassesREL[label], self.fnsClassesREL[label]), 1044 | self.getF1(self.tpsClassesREL[label], self.fpsClassesREL[label], 1045 | self.fnsClassesREL[label])) 1046 | 1047 | 1048 | 1049 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 1050 | printer.add("-", "-", "-", "-", 1051 | "-", "-", 1052 | "-") 1053 | printer.add("Micro REL chunk RELAXED", self.tpsREL, self.fpsREL, self.fnsREL, 1054 | self.getPrecision(self.tpsREL, self.fpsREL), self.getRecall(self.tpsREL, self.fnsREL), 1055 | self.getF1(self.tpsREL, self.fpsREL, self.fnsREL)) 1056 | 1057 | printer.print() --------------------------------------------------------------------------------
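A note on the shared scoring helpers: getPrecision, getRecall and getF1 implement the standard micro-averaged formulas over true positives, false positives and false negatives, and they are what the "Micro NER chunk" / "Micro REL chunk" rows aggregate. A quick standalone check with made-up counts (tps=8, fps=2, fns=4 are illustrative numbers, not results from this repo):

# Standalone restatement of the getPrecision / getRecall / getF1 helpers from eval.py,
# exercised with illustrative counts only.
def precision(tps, fps):
    return tps / (tps + fps) if tps else 0

def recall(tps, fns):
    return tps / (tps + fns) if tps else 0

def f1(tps, fps, fns):
    p, r = precision(tps, fps), recall(tps, fns)
    return 2 * p * r / (p + r) if tps else 0

print(precision(8, 2))  # 0.8
print(recall(8, 4))     # 0.666...
print(f1(8, 2, 4))      # 0.727...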
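In relaxedChunkEvaluator.add above, both the gold and the predicted token-level entity classes are projected onto the gold BIO chunk boundaries through classesToChunks, so only the entity type (not the boundary) is scored; when a chunk's label does not occur among the classes predicted inside its span, the majority class of the span is used instead. A minimal self-contained example of that projection (the helper functions are copied from eval.py; the token labels below are invented for illustration, not CoNLL04 data):

from collections import Counter

def getMaxOccurence(lst):
    # Most frequent element of lst (as defined in eval.py).
    most_common, _ = Counter(lst).most_common(1)[0]
    return most_common

def classesToChunks(tokenClasses, chunks):
    # Re-label each (class, start, end) chunk with a class actually present in
    # tokenClasses over its span, falling back to the majority class (as in eval.py).
    labeled_chunks = []
    for chunk in chunks:
        class_list = tokenClasses[chunk[1]:chunk[2] + 1]
        if chunk[0] in class_list:
            labeled_chunks.append((chunk[0], chunk[1], chunk[2]))
        else:
            labeled_chunks.append((getMaxOccurence(class_list), chunk[1], chunk[2]))
    return labeled_chunks

# Predicted per-token classes and gold chunk boundaries (invented example):
pred_token_classes = ['Peop', 'Peop', 'O', 'Loc', 'Loc', 'Loc']
gold_chunks = [('Peop', 0, 1), ('Org', 3, 5)]
print(classesToChunks(pred_token_classes, gold_chunks))
# [('Peop', 0, 1), ('Loc', 3, 5)] -> the second chunk keeps the gold boundaries
# but takes the majority predicted class inside the span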