├── model.png
├── config
│   ├── bio_config
│   └── bio_config_adv
├── README.md
├── train.py
├── data_parsers.py
├── data_build.py
├── model.py
├── data_utils.py
└── eval.py
/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NeilGY/NER_entityRelationExtration/HEAD/model.png
--------------------------------------------------------------------------------
/config/bio_config:
--------------------------------------------------------------------------------
1 | # pretrained embeddings
2 | filename_embeddings =data/vecs.lc.over100freq.txt
3 | 
4 | # dataset
5 | filename_dev = "data/dev.txt"
6 | filename_test = "data/test.txt"
7 | filename_train = "data/train.txt"
8 | 
9 | # training
10 | nepochs = 150
11 | optimizer = Adam
12 | activation = tanh
13 | learning_rate = 1e-3
14 | gradientClipping = False # if False, no clipping
15 | nepoch_no_imprv = 30
16 | use_dropout = True
17 | ner_loss = crf # or softmax
18 | use_chars = True
19 | use_adversarial = False
20 | ner_classes = BIO #or EC for entity classification
21 | 
22 | #hyperparameters
23 | dropout_embedding = 0.9
24 | dropout_lstm = 0.9
25 | dropout_lstm_output = 0.9
26 | dropout_fcl_ner = 1
27 | dropout_fcl_rel = 1
28 | hidden_size_lstm = 64
29 | hidden_size_n1 = 64
30 | #hidden_size_n2 = 32
31 | num_lstm_layers = 3
32 | char_embeddings_size = 25
33 | hidden_size_char = 25
34 | label_embeddings_size = 0 #if 0, no label embeddings
35 | alpha = 0.01
36 | 
37 | #evaluation
38 | evaluation_method = strict # alternatives "boundaries" and "relaxed"
39 | root_node = False
40 | 
--------------------------------------------------------------------------------
/config/bio_config_adv:
--------------------------------------------------------------------------------
1 | # pretrained embeddings
2 | filename_embeddings =data/CoNLL04/vecs.lc.over100freq.txt
3 | 
4 | # dataset
5 | filename_dev = "data/CoNLL04/dev.txt"
6 | filename_test = "data/CoNLL04/test.txt"
7 | filename_train = "data/CoNLL04/train.txt"
8 | 
9 | # training
10 | nepochs = 130
11 | optimizer = Adam
12 | activation = tanh
13 | learning_rate = 1e-3
14 | gradientClipping = False # if False, no clipping
15 | nepoch_no_imprv = 40
16 | use_dropout = True
17 | ner_loss = crf # or softmax
18 | use_chars = True
19 | use_adversarial = True
20 | ner_classes = BIO #or EC for entity classification
21 | 
22 | #hyperparameters
23 | dropout_embedding = 0.9
24 | dropout_lstm = 0.9
25 | dropout_lstm_output = 0.9
26 | dropout_fcl_ner = 1
27 | dropout_fcl_rel = 1
28 | hidden_size_lstm = 64
29 | hidden_size_n1 = 64
30 | #hidden_size_n2 = 32
31 | num_lstm_layers = 3
32 | char_embeddings_size = 25
33 | hidden_size_char = 25
34 | label_embeddings_size = 0 #if 0, no label embeddings
35 | alpha = 0.01
36 | 
37 | #evaluation
38 | evaluation_method = strict # alternatives "boundaries" and "relaxed"
39 | root_node = False
40 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Model diagram: see model.png in the project root.
2 | Please refer to the diagram when reading the code.
3 | 
4 | 1. Overview of the pipeline:
5 | word/char embedding (feature embedding layer):
6 | Character-level information is added on top of the word-level vectors; such embeddings can capture morphological features like prefixes and suffixes.
7 | Each word is first mapped to a word vector with a lookup table pre-trained by the skip-gram word2vec model; every character of the word is also represented by a vector, and the character vectors of a word are fed into a BiLSTM.
8 | The two final states (forward and backward) are concatenated with the word vector to obtain the word's embedding.
9 | BiLSTM layer:
10 | The embeddings of the words in a sentence are fed in, and the forward and backward hidden states at each position are concatenated to obtain a new encoded sequence.
11 | CRF layer:
12 | Uses the BIO tagging scheme and a CRF to model the dependencies between labels:
13 | compute each word's score for every tag,
14 | compute the probability of the sentence's tag sequence,
15 | and use the Viterbi algorithm to obtain the highest-scoring tag sequence.
16 | For entity recognition, the network and CRF parameters are optimized by minimizing the NER loss (the negative CRF log-likelihood, or the softmax cross-entropy when ner_loss = softmax); at test time the Viterbi algorithm returns the highest-scoring tag sequence.
17 | Label embedding:
18 | Embeddings of the entity labels. The gold labels are used during training and the predicted labels at test time.
19 | Heads/Relations layer:
20 | Its input is the concatenation of the BiLSTM hidden state and the label embedding. Multiple heads can be predicted for each token, and heads and relations are decided jointly, rather than first predicting the heads and then classifying the relations with a separate classifier.
21 | Tagging strategy: the CRF layer outputs the entity recognition result under the BIO scheme; the heads/relations layer outputs the tail word of the related entity and the relation only when a token is related to another entity; when a token has no relation to any other entity, its head is the token itself and the relation is N.
22 | Adversarial training (AT): makes the classifier more robust to noisy data by mixing the original samples with adversarial examples.
23 | 
24 | 
25 | Pre-trained word vectors:
26 | Link: https://pan.baidu.com/s/1P_QtMKKhUdtc0XfOnpSBOw  extraction code: 45ic
27 | 
28 | 2. Data format:
29 | #doc 5121   (header line carrying the document/file name)
30 | ['token_id', 'token', "BIO", "relation", 'head']
31 | token_id: index of the token within its document
32 | token: the word itself
33 | BIO: annotated entity type
34 | relation: entity relation(s) of the token
35 | head: token index of the entity that each relation points to
36 | 
37 | data_parsers.py:
38 | docId: id of the document (file name)
39 | token_ids: list of token indices within each document
40 | tokens: list of words
41 | BIOs: list of entity tags for the tokens
42 | ecs: list of entity types without the BIO prefix
43 | relations: list of entity relations
44 | heads: list of head indices for the relations, e.g. [[2],[3,4]]
45 | char_ids: list of per-character ids for every word, e.g. for two words with three and four characters: [[1,2,3],[11,12,1,4]]
46 | embedding_ids: list of word ids
47 | BIO_ids: list of entity-tag ids
48 | ec_ids: list of ids of the entity types without the BIO prefix
49 | joint_ids: list of joint head-relation labels, computed as headId*len(set(relations))+relation_id (the same rule is used when decoding predictions during evaluation):
50 | len(set(relations)): number of distinct relation types
51 | headId: token index of the head entity of that relation
52 | relation_id: id of the relation type
53 | For example, with 5 relation types, a head at index 3 and relation id 2 are encoded as 3*5+2 = 17; conversely, 17 decodes back to head 17//5 = 3 and relation 17%5 = 2.
54 | 
55 | 
56 | 3. Files (the detailed behavior of each method is documented in the code comments):
57 | data_build.py  reads the configuration file and initializes the data
58 | data_parsers.py  parses and wraps the data
59 | model.py  the model
60 | train.py  model training
61 | data_utils.py  data conversion and processing
62 | eval.py  model evaluation
63 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | import data_utils
2 | import model as Model
3 | from data_build import data_build
4 | import numpy as np
5 | import tensorflow as tf
6 | 
7 | 
8 | output_dir='logs/'
9 | config_file='config/bio_config'
10 | 
11 | def train():
12 |     config = data_build(config_file)  # load the configuration file and preprocess the datasets
13 |     train_data = data_utils.HeadData(config.train_id_docs, np.arange(len(config.train_id_docs)))
14 |     dev_data = data_utils.HeadData(config.dev_id_docs, np.arange(len(config.dev_id_docs)))
15 |     test_data = data_utils.HeadData(config.test_id_docs, np.arange(len(config.test_id_docs)))
16 | 
17 |     tf.reset_default_graph()
18 |     tf.set_random_seed(1)
19 | 
20 |     data_utils.printParameters(config)
21 | 
22 |     with tf.Session() as sess:
23 |         embedding_matrix = tf.get_variable('embedding_matrix', shape=config.wordvectors.shape, dtype=tf.float32,
24 |                                            trainable=False).assign(config.wordvectors)
25 |         emb_mtx = sess.run(embedding_matrix)
26 |         # build the model
27 |         model = Model.model(config, emb_mtx, sess)
28 |         # get the loss and the prediction ops of the model
29 |         obj, m_op, predicted_op_ner, actual_op_ner, predicted_op_rel, actual_op_rel, score_op_rel = model.run()
30 |         # training op of the optimizer
31 |         train_step = model.get_train_op(obj)
32 |         # bundle all ops needed during training and evaluation
33 |         operations = Model.operations(train_step, obj, m_op, predicted_op_ner, actual_op_ner, predicted_op_rel, actual_op_rel, score_op_rel)
34 | 
35 |         sess.run(tf.global_variables_initializer())
36 | 
37 |         best_score = 0
38 |         nepoch_no_imprv = 0  # for early stopping
39 | 
40 |         for iter in range(config.nepochs + 1):
41 |             # train for one epoch
42 |             model.train(train_data, operations, iter)
43 |             # evaluate on dev and test
44 |             dev_score = model.evaluate(dev_data, operations, 'dev')
45 |             model.evaluate(test_data, operations, 'test')
46 | 
47 |             if dev_score >= best_score:
48 |                 nepoch_no_imprv = 0
49 |                 best_score = dev_score
50 | 
51 |                 print("- Best dev score {} so far in {} epoch".format(dev_score, iter))
52 | 
53 |             else:
54 |                 nepoch_no_imprv += 1
55 |                 if nepoch_no_imprv >= config.nepoch_no_imprv:
56 |                     print("- 
early stopping {} epochs without " \ 57 | "improvement".format(nepoch_no_imprv)) 58 | 59 | with open(output_dir + "/es" + ".txt", "w+") as myfile: 60 | myfile.write(str(iter)) 61 | myfile.close() 62 | 63 | break 64 | 65 | def main(_): 66 | train() 67 | if __name__ == '__main__': 68 | tf.app.run(main) -------------------------------------------------------------------------------- /data_parsers.py: -------------------------------------------------------------------------------- 1 | import data_utils 2 | import csv 3 | import pandas as pd 4 | 5 | 6 | class headIdDoc: 7 | def __init__(self, id): 8 | self.docId = id 9 | self.token_ids = [] 10 | self.tokens = [] 11 | self.BIOs = [] 12 | self.relations = [] 13 | self.heads = [] 14 | 15 | ###extend 16 | self.embedding_ids = [] 17 | self.char_ids = [] 18 | self.BIO_ids = [] 19 | self.ecs = [] 20 | self.ec_ids = [] 21 | self.joint_ids = [] 22 | 23 | def append(self, token_id, token, BIO, relations, heads): 24 | self.tokens.append(str(token)) 25 | self.token_ids.append(token_id) 26 | self.BIOs.append(BIO) 27 | self.relations.append(relations) 28 | self.heads.append(heads) 29 | 30 | def extend(self, wordindices, dataset_set_characters, dataset_set_bio_tags, dataset_set_ec_tags, 31 | dataset_set_relations): 32 | for tId in range(len(self.tokens)): 33 | self.embedding_ids.append(int(data_utils.getEmbeddingId(self.tokens[tId], wordindices))) 34 | self.char_ids.append(data_utils.tokenToCharIds(self.tokens[tId], dataset_set_characters)) 35 | self.BIO_ids.append(int(data_utils.getLabelId(self.BIOs[tId], dataset_set_bio_tags))) 36 | self.ecs.append(data_utils.getECfromBIO(self.BIOs[tId])) 37 | self.ec_ids.append(int(data_utils.getLabelId(data_utils.getECfromBIO(self.BIOs[tId]), dataset_set_ec_tags))) 38 | self.joint_ids.append(data_utils.getScoringMatrixHeads(self.relations[tId], dataset_set_relations, self.heads[tId])) 39 | 40 | 41 | class headIdParser: 42 | def __init__(self, file): 43 | docNr = -1 44 | self.head_docs = [] 45 | tokens = headIdDoc("") 46 | 47 | for i in range(file.shape[0]): 48 | if '#doc' in file[i][0] or i == file.shape[0] - 1: # append all docs including the last one 49 | if (i == file.shape[0] - 1): # append last line 50 | tokens.append(int(file[i][0]), file[i][1], file[i][2], data_utils.strToLst(file[i][3]), 51 | data_utils. 52 | strToLst(file[i][4])) # append lines 53 | if (docNr != -1): 54 | self.head_docs.append(tokens) 55 | docNr += 1 56 | tokens = headIdDoc(file[i][0]) 57 | else: 58 | tokens.append(int(file[i][0]), file[i][1], file[i][2], data_utils.strToLst(file[i][3]), 59 | data_utils. 
60 | strToLst(file[i][4])) # append lines 61 | 62 | 63 | def readHeadFile(headFile): 64 | # head_id_col_vector = ['tId', 'emId', "token", "nerId", "nerBilou","nerBIO", "ner", 'relLabels', "headIds", 'rels', 'relIds','scoringMatrixHeads','tokenWeights'] 65 | head_id_col_vector = ['token_id', 'token', "BIO", "relation", 'head'] 66 | headfile = pd.read_csv(headFile, names=head_id_col_vector, encoding="utf-8", 67 | engine='python', sep="\t", quoting=csv.QUOTE_NONE).as_matrix() 68 | 69 | return headIdParser(headfile).head_docs 70 | 71 | def preprocess(docs,wordindices,dataset_set_characters,dataset_set_bio_tags,dataset_set_ec_tags,dataset_set_relations): 72 | for doc in docs: 73 | doc.extend(wordindices,dataset_set_characters,dataset_set_bio_tags,dataset_set_ec_tags,dataset_set_relations) 74 | 75 | class read_properties: 76 | def __init__(self,filepath, sep='=', comment_char='#'): 77 | """Read the file passed as parameter as a properties file.""" 78 | self.props = {} 79 | #print filepath 80 | with open(filepath, "rt") as f: 81 | for line in f: 82 | #print line 83 | l = line.strip() 84 | if l and not l.startswith(comment_char): 85 | key_value = l.split(sep) 86 | self.props[key_value[0].strip()] = key_value[1].split("#")[0].strip('" \t') 87 | 88 | 89 | def getProperty(self,propertyName): 90 | return self.props.get(propertyName) 91 | 92 | -------------------------------------------------------------------------------- /data_build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import data_utils 3 | import data_parsers as parsers 4 | from sklearn.externals import joblib 5 | import os.path 6 | 7 | """"Read the configuration file and set the parameters of the model""" 8 | 9 | 10 | class data_build(): 11 | def __init__(self, fname): 12 | 13 | config_file = parsers.read_properties(fname) #加载配置文件 14 | self.config_fname = fname 15 | 16 | # load data 17 | self.filename_embeddings = config_file.getProperty("filename_embeddings") 18 | self.filename_train = config_file.getProperty("filename_train") 19 | self.filename_test = config_file.getProperty("filename_test") 20 | self.filename_dev = config_file.getProperty("filename_dev") 21 | #生成各列数据的集合 22 | self.train_id_docs = parsers.readHeadFile(self.filename_train) 23 | self.dev_id_docs = parsers.readHeadFile(self.filename_dev) 24 | self.test_id_docs = parsers.readHeadFile(self.filename_test) 25 | 26 | # 将所有数据加到一个大集合中 27 | dataset_documents = [] 28 | dataset_documents.extend(self.train_id_docs) 29 | dataset_documents.extend(self.dev_id_docs) 30 | dataset_documents.extend(self.test_id_docs) 31 | 32 | self.dataset_set_characters = data_utils.getCharsFromDocuments(dataset_documents)#获得所有数据中 字母 数字的集合 33 | self.dataset_set_bio_tags, self.dataset_set_ec_tags = data_utils.getEntitiesFromDocuments(dataset_documents)#获得所有数据中 实体 的集合 34 | self.dataset_set_relations = data_utils.getRelationsFromDocuments(dataset_documents)#获得所有数据中 关系 的集合 35 | #加载预训练好的词向量 36 | if os.path.isfile(self.filename_embeddings + ".pkl") == False: 37 | self.wordvectors, self.representationsize, self.words = data_utils.readWordvectorsNumpy(self.filename_embeddings, isBinary=True if self.filename_embeddings.endswith(".bin") else False) 38 | self.wordindices = data_utils.readIndices(self.filename_embeddings, 39 | isBinary=True if self.filename_embeddings.endswith(".bin") else False) 40 | joblib.dump((self.wordvectors, self.representationsize, self.words, self.wordindices), self.filename_embeddings + ".pkl") 41 | 42 | else: 43 | self.wordvectors, 
self.representationsize, self.words, self.wordindices = joblib.load(self.filename_embeddings + ".pkl") # loading is faster 44 | #将数据转换成对应id的列表 45 | parsers.preprocess(self.train_id_docs, self.wordindices, self.dataset_set_characters, 46 | self.dataset_set_bio_tags, self.dataset_set_ec_tags, self.dataset_set_relations) 47 | parsers.preprocess(self.dev_id_docs, self.wordindices, self.dataset_set_characters, 48 | self.dataset_set_bio_tags, self.dataset_set_ec_tags, self.dataset_set_relations) 49 | parsers.preprocess(self.test_id_docs, self.wordindices, self.dataset_set_characters, 50 | self.dataset_set_bio_tags, self.dataset_set_ec_tags, self.dataset_set_relations) 51 | 52 | # training 53 | self.nepochs = int(config_file.getProperty("nepochs")) 54 | self.optimizer = config_file.getProperty("optimizer") 55 | self.activation = config_file.getProperty("activation") 56 | self.learning_rate = float(config_file.getProperty("learning_rate")) 57 | self.gradientClipping = data_utils.strToBool(config_file.getProperty("gradientClipping")) 58 | self.nepoch_no_imprv = int(config_file.getProperty("nepoch_no_imprv")) 59 | self.use_dropout = data_utils.strToBool(config_file.getProperty("use_dropout")) 60 | self.ner_loss = config_file.getProperty("ner_loss") 61 | self.ner_classes = config_file.getProperty("ner_classes") 62 | self.use_chars = data_utils.strToBool(config_file.getProperty("use_chars")) 63 | self.use_adversarial = data_utils.strToBool(config_file.getProperty("use_adversarial")) 64 | 65 | # hyperparameters 66 | self.dropout_embedding = float(config_file.getProperty("dropout_embedding")) 67 | self.dropout_lstm = float(config_file.getProperty("dropout_lstm")) 68 | self.dropout_lstm_output = float(config_file.getProperty("dropout_lstm_output")) 69 | self.dropout_fcl_ner = float(config_file.getProperty("dropout_fcl_ner")) 70 | self.dropout_fcl_rel = float(config_file.getProperty("dropout_fcl_rel")) 71 | self.hidden_size_lstm = int(config_file.getProperty("hidden_size_lstm")) 72 | self.hidden_size_n1 = int(config_file.getProperty("hidden_size_n1")) 73 | # self.hidden_size_n2 = config_file.getProperty("hidden_size_n2") 74 | self.num_lstm_layers = int(config_file.getProperty("num_lstm_layers")) 75 | self.char_embeddings_size = int(config_file.getProperty("char_embeddings_size")) 76 | self.hidden_size_char = int(config_file.getProperty("hidden_size_char")) 77 | self.label_embeddings_size = int(config_file.getProperty("label_embeddings_size")) 78 | self.alpha = float(config_file.getProperty("alpha")) 79 | 80 | # evaluation 81 | self.evaluation_method = config_file.getProperty("evaluation_method") 82 | self.root_node = data_utils.strToBool(config_file.getProperty("root_node")) 83 | 84 | self.shuffle = False 85 | self.batchsize = 16 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import data_utils 2 | import time 3 | import eval 4 | import tensorflow as tf 5 | 6 | 7 | class model: 8 | """Set of classes and methods for training the model and computing the ner and head selection loss""" 9 | 10 | def __init__(self, config, emb_mtx, sess): 11 | """"Initialize data""" 12 | self.config = config 13 | self.emb_mtx = emb_mtx 14 | self.sess = sess 15 | 16 | def getEvaluator(self): 17 | if self.config.evaluation_method == "strict" and self.config.ner_classes == "BIO": # the most common metric 18 | return eval.chunkEvaluator(self.config, 
ner_chunk_eval="boundaries_type", 19 | rel_chunk_eval="boundaries_type") 20 | elif self.config.evaluation_method == "boundaries" and self.config.ner_classes == "BIO": # s 21 | return eval.chunkEvaluator(self.config, ner_chunk_eval="boundaries", rel_chunk_eval="boundaries") 22 | elif self.config.evaluation_method == "relaxed" and self.config.ner_classes == "EC": # todo 23 | return eval.relaxedChunkEvaluator(self.config, rel_chunk_eval="boundaries_type") 24 | else: 25 | raise ValueError( 26 | 'Valid evaluation methods : "strict" and "boundaries" in "BIO" mode and "relaxed" in "EC" mode .') 27 | 28 | def train(self, train_data, operations, iter): 29 | 30 | loss = 0 31 | 32 | evaluator = self.getEvaluator() 33 | start_time = time.time() 34 | for x_train in data_utils.generator(train_data, operations.m_op, self.config, train=True): 35 | _, val, predicted_ner, actual_ner, predicted_rel, actual_rel, _, m_train = self.sess.run( 36 | [operations.train_step, operations.obj, operations.predicted_op_ner, operations.actual_op_ner, operations.predicted_op_rel, operations.actual_op_rel, operations.score_op_rel, 37 | operations.m_op], feed_dict=x_train) # sess.run(embedding_init, feed_dict={embedding_placeholder: wordvectors}) 38 | 39 | if self.config.evaluation_method == "relaxed": 40 | evaluator.add(predicted_ner, actual_ner, predicted_rel, actual_rel, m_train['BIO']) 41 | else: 42 | evaluator.add(predicted_ner, actual_ner, predicted_rel, actual_rel) 43 | 44 | loss += val 45 | 46 | print('****iter %d****' % (iter)) 47 | print('-------Train-------') 48 | print('loss: %f ' % (loss)) 49 | 50 | if self.config.evaluation_method == "relaxed": 51 | evaluator.computeInfoMacro() 52 | else: 53 | evaluator.printInfo() 54 | 55 | elapsed_time = time.time() - start_time 56 | print("Elapsed train time in sec:" + str(elapsed_time)) 57 | print() 58 | 59 | def evaluate(self, eval_data, operations, set): 60 | 61 | print('-------Evaluate on ' + set + '-------') 62 | 63 | evaluator = self.getEvaluator() 64 | for x_dev in data_utils.generator(eval_data, operations.m_op, self.config, train=False): 65 | predicted_ner, actual_ner, predicted_rel, actual_rel, _, m_eval = self.sess.run( 66 | [operations.predicted_op_ner, operations.actual_op_ner, operations.predicted_op_rel, operations.actual_op_rel, operations.score_op_rel, operations.m_op], feed_dict=x_dev) 67 | 68 | if self.config.evaluation_method == "relaxed": 69 | evaluator.add(predicted_ner, actual_ner, predicted_rel, actual_rel, m_eval['BIO']) 70 | else: 71 | evaluator.add(predicted_ner, actual_ner, predicted_rel, actual_rel) 72 | 73 | if self.config.evaluation_method == "relaxed": 74 | evaluator.computeInfoMacro(printScores=True) 75 | if "other" in [x.lower() for x in self.config.dataset_set_ec_tags]: # if other class exists report score without "Other" class, see previous work on the CoNLL04 76 | return evaluator.getMacroF1scoresNoOtherClass()[2] 77 | else: 78 | return evaluator.getMacroF1scores()[2] 79 | 80 | else: 81 | evaluator.printInfo() 82 | return evaluator.getChunkedOverallAvgF1() 83 | 84 | def get_train_op(self, obj): 85 | import tensorflow as tf 86 | 87 | if self.config.optimizer == 'Adam': 88 | 89 | optim = tf.train.AdamOptimizer(self.config.learning_rate) 90 | 91 | elif self.config.optimizer == 'Adagrad': 92 | optim = tf.train.AdagradOptimizer(self.config.learning_rate) 93 | elif self.config.optimizer == 'AdadeltaOptimizer': 94 | optim = tf.train.AdadeltaOptimizer(self.config.learning_rate) 95 | elif self.config.optimizer == 'GradientDescentOptimizer': 96 | 
optim = tf.train.GradientDescentOptimizer(self.config.learning_rate) 97 | 98 | if self.config.gradientClipping == True: 99 | 100 | gvs = optim.compute_gradients(obj) 101 | 102 | new_gvs = self.correctGradients(gvs) 103 | 104 | capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in new_gvs] 105 | train_step = optim.apply_gradients(capped_gvs) 106 | 107 | 108 | else: 109 | train_step = optim.minimize(obj) 110 | 111 | return train_step 112 | 113 | def correctGradients(self, gvs): 114 | new_gvs = [] 115 | for grad, var in gvs: 116 | # print (grad) 117 | if grad == None: 118 | grad = tf.zeros_like(var) 119 | 120 | new_gvs.append((grad, var)) 121 | if len(gvs) != len(new_gvs): 122 | print("gradient Error") 123 | return new_gvs 124 | 125 | def broadcasting(self, left, right): 126 | left = tf.transpose(left, perm=[1, 0, 2]) 127 | left = tf.expand_dims(left, 3) 128 | 129 | right = tf.transpose(right, perm=[0, 2, 1]) 130 | right = tf.expand_dims(right, 0) 131 | 132 | B = left + right 133 | B = tf.transpose(B, perm=[1, 0, 3, 2]) 134 | 135 | return B 136 | 137 | def getNerScores(self, lstm_out, n_types=1, dropout_keep_in_prob=1): 138 | u_a = tf.get_variable("u_typ", [self.config.hidden_size_lstm * 2, self.config.hidden_size_n1]) # [128 32] 139 | v = tf.get_variable("v_typ", [self.config.hidden_size_n1, n_types]) # [32,1] or [32,10] 140 | b_s = tf.get_variable("b_typ", [self.config.hidden_size_n1]) 141 | b_c = tf.get_variable("b_ctyp", [n_types]) 142 | 143 | mul = tf.einsum('aij,jk->aik', lstm_out, u_a) # [16 348 64] * #[64 32] = [16 348 32] 144 | 145 | sum = mul + b_s 146 | if self.config.activation == "tanh": 147 | output = tf.nn.tanh(sum) 148 | elif self.config.activation == "relu": 149 | output = tf.nn.relu(sum) 150 | 151 | if self.config.use_dropout == True: 152 | output = tf.nn.dropout(output, keep_prob=dropout_keep_in_prob) 153 | 154 | g = tf.einsum('aik,kp->aip', output, v) + b_c 155 | 156 | return g 157 | 158 | def getHeadSelectionScores(self, lstm_out, dropout_keep_in_prob=1): 159 | u_a = tf.get_variable("u_a", [(self.config.hidden_size_lstm * 2) + self.config.label_embeddings_size, self.config.hidden_size_n1]) # [128 32] 160 | w_a = tf.get_variable("w_a", [(self.config.hidden_size_lstm * 2) + self.config.label_embeddings_size, self.config.hidden_size_n1]) # [128 32] 161 | v = tf.get_variable("v", [self.config.hidden_size_n1, len(self.config.dataset_set_relations)]) # [32,1] or [32,4] 162 | b_s = tf.get_variable("b_s", [self.config.hidden_size_n1]) 163 | 164 | left = tf.einsum('aij,jk->aik', lstm_out, u_a) # [16 348 64] * #[64 32] = [16 348 32] 165 | right = tf.einsum('aij,jk->aik', lstm_out, w_a) # [16 348 64] * #[64 32] = [16 348 32] 166 | 167 | outer_sum = self.broadcasting(left, right) # [16 348 348 32] 168 | 169 | outer_sum_bias = outer_sum + b_s 170 | 171 | if self.config.activation == "tanh": 172 | output = tf.tanh(outer_sum_bias) 173 | elif self.config.activation == "relu": 174 | output = tf.nn.relu(outer_sum_bias) 175 | 176 | if self.config.use_dropout == True: 177 | output = tf.nn.dropout(output, keep_prob=dropout_keep_in_prob) 178 | 179 | output = tf.nn.dropout(output, keep_prob=dropout_keep_in_prob) 180 | 181 | g = tf.einsum('aijk,kp->aijp', output, v) 182 | 183 | g = tf.reshape(g, [tf.shape(g)[0], tf.shape(g)[1], tf.shape(g)[2] * len(self.config.dataset_set_relations)]) 184 | 185 | return g 186 | 187 | def computeLoss(self, input_rnn, dropout_embedding_keep, dropout_lstm_keep, dropout_lstm_output_keep, 188 | seqlen, dropout_fcl_ner_keep, ners_ids, 
dropout_fcl_rel_keep, is_train, scoring_matrix_gold, reuse=False): 189 | 190 | with tf.variable_scope("loss_computation", reuse=reuse): 191 | 192 | if self.config.use_dropout: 193 | input_rnn = tf.nn.dropout(input_rnn, keep_prob=dropout_embedding_keep) 194 | # input_rnn = tf.Print(input_rnn, [dropout_embedding_keep], 'embedding: ', summarize=1000) 195 | for i in range(self.config.num_lstm_layers): 196 | if self.config.use_dropout and i > 0: 197 | input_rnn = tf.nn.dropout(input_rnn, keep_prob=dropout_lstm_keep) 198 | # input_rnn = tf.Print(input_rnn, [dropout_lstm_keep], 'lstm: ', summarize=1000) 199 | 200 | lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(self.config.hidden_size_lstm) 201 | # Backward direction cell 202 | lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(self.config.hidden_size_lstm) 203 | #scope='BiLSTM' + str(i) 解决每层LSTM输入维度不一致问题 204 | lstm_out, _ = tf.nn.bidirectional_dynamic_rnn( 205 | cell_fw=lstm_fw_cell, 206 | cell_bw=lstm_bw_cell, 207 | inputs=input_rnn, 208 | sequence_length=seqlen, 209 | dtype=tf.float32, scope='BiLSTM' + str(i)) 210 | 211 | input_rnn = tf.concat(lstm_out, 2) 212 | 213 | lstm_output = input_rnn 214 | 215 | if self.config.use_dropout: 216 | lstm_output = tf.nn.dropout(lstm_output, keep_prob=dropout_lstm_output_keep) 217 | 218 | mask = tf.sequence_mask(seqlen, dtype=tf.float32) 219 | 220 | ner_input = lstm_output 221 | # loss= tf.Print(loss, [tf.shape(loss)], 'shape of loss is:') # same as scoring matrix ie, [1 59 590] 222 | #实体识别 223 | if self.config.ner_classes == "EC": 224 | 225 | nerScores = self.getNerScores(ner_input, len(self.config.dataset_set_ec_tags), 226 | dropout_keep_in_prob=dropout_fcl_ner_keep) 227 | label_matrix = tf.get_variable(name="label_embeddings", dtype=tf.float32, 228 | shape=[len(self.config.dataset_set_ec_tags), 229 | self.config.label_embeddings_size]) 230 | elif self.config.ner_classes == "BIO": 231 | 232 | nerScores = self.getNerScores(ner_input, len(self.config.dataset_set_bio_tags), 233 | dropout_keep_in_prob=dropout_fcl_ner_keep) 234 | label_matrix = tf.get_variable(name="label_embeddings", dtype=tf.float32, 235 | shape=[len(self.config.dataset_set_bio_tags), 236 | self.config.label_embeddings_size]) 237 | 238 | # nerScores = tf.Print(nerScores, [tf.shape(ners_ids), ners_ids, tf.shape(nerScores)], 'ners_ids: ', summarize=1000) 239 | 240 | log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood( 241 | nerScores, ners_ids, seqlen) 242 | if self.config.ner_loss == "crf": 243 | 244 | lossNER = -log_likelihood 245 | predNers, viterbi_score = tf.contrib.crf.crf_decode( 246 | nerScores, transition_params, seqlen) 247 | 248 | elif self.config.ner_loss == "softmax": 249 | lossNER = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=nerScores, labels=ners_ids) 250 | 251 | predNers = tf.cast(tf.arg_max(nerScores, 2), tf.int32) 252 | 253 | if self.config.label_embeddings_size > 0: 254 | 255 | labels = tf.cond(is_train > 0, lambda: ners_ids, lambda: predNers) 256 | 257 | label_embeddings = tf.nn.embedding_lookup(label_matrix, labels) 258 | rel_input = tf.concat([lstm_output, label_embeddings], axis=2) 259 | 260 | else: 261 | 262 | rel_input = lstm_output 263 | 264 | #关系抽取 265 | rel_scores = self.getHeadSelectionScores(rel_input, 266 | dropout_keep_in_prob=dropout_fcl_rel_keep) 267 | 268 | lossREL = tf.nn.sigmoid_cross_entropy_with_logits(logits=rel_scores, labels=scoring_matrix_gold) 269 | probas = tf.nn.sigmoid(rel_scores) 270 | predictedRel = tf.round(probas) 271 | 272 | return lossNER, lossREL, predNers, predictedRel, 
rel_scores 273 | 274 | def run(self): 275 | # shape = (batch size, max length of sentence, max length of word) 276 | char_ids = tf.placeholder(tf.int32, shape=[None, None, None]) 277 | is_train = tf.placeholder(tf.int32) 278 | 279 | # shape = (batch_size, max_length of sentence) 280 | word_lengths = tf.placeholder(tf.int32, shape=[None, None]) 281 | 282 | embedding_ids = tf.placeholder(tf.int32, [None, None]) # [ batch_size,max_length of sentence ] 283 | 284 | token_ids = tf.placeholder(tf.int32, [None, None]) # [ batch_size * max_sequence ] 285 | 286 | entity_tags_ids = tf.placeholder(tf.int32, [None, None]) 287 | 288 | scoring_matrix_gold = tf.placeholder(tf.float32, [None, None, None]) # [ batch_size * max_sequence] 289 | 290 | tokens = tf.placeholder(tf.string, [None, None]) # [ batch_size * max_sequence] 291 | BIO = tf.placeholder(tf.string, [None, None]) # [ batch_size * max_sequence] 292 | entity_tags = tf.placeholder(tf.string, [None, None]) # [ batch_size * max_sequence] 293 | 294 | # classes = ... 295 | seqlen = tf.placeholder(tf.int32, [None]) # [ batch_size ] 296 | 297 | doc_ids = tf.placeholder(tf.string, [None]) # [ batch_size ] 298 | 299 | dropout_embedding_keep = tf.placeholder(tf.float32, name="dropout_embedding_keep") 300 | dropout_lstm_keep = tf.placeholder(tf.float32, name="dropout_lstm_keep") 301 | dropout_lstm_output_keep = tf.placeholder(tf.float32, name="dropout_lstm_output_keep") 302 | dropout_fcl_ner_keep = tf.placeholder(tf.float32, name="dropout_fcl_ner_keep") 303 | dropout_fcl_rel_keep = tf.placeholder(tf.float32, name="dropout_fcl_rel_keep") 304 | 305 | embedding_matrix = tf.get_variable(name="embeddings", shape=self.emb_mtx.shape, 306 | initializer=tf.constant_initializer(self.emb_mtx), trainable=False) 307 | 308 | #####char embeddings 309 | 310 | # 1. get character embeddings 311 | 312 | K = tf.get_variable(name="char_embeddings", dtype=tf.float32, 313 | shape=[len(self.config.dataset_set_characters), self.config.char_embeddings_size]) 314 | # shape = (batch, sentence, word, dim of char embeddings) 315 | char_embeddings = tf.nn.embedding_lookup(K, char_ids) 316 | 317 | # 2. put the time dimension on axis=1 for dynamic_rnn 318 | s = tf.shape(char_embeddings) # store old shape 319 | 320 | char_embeddings_reshaped = tf.reshape(char_embeddings, shape=[-1, s[-2], self.config.char_embeddings_size]) 321 | word_lengths_reshaped = tf.reshape(word_lengths, shape=[-1]) 322 | 323 | char_hidden_size = self.config.hidden_size_char 324 | 325 | # 3. 
bi lstm on chars 326 | cell_fw = tf.contrib.rnn.BasicLSTMCell(char_hidden_size, state_is_tuple=True) 327 | cell_bw = tf.contrib.rnn.BasicLSTMCell(char_hidden_size, state_is_tuple=True) 328 | 329 | _, ((_, output_fw), (_, output_bw)) = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw, 330 | inputs=char_embeddings_reshaped, 331 | sequence_length=word_lengths_reshaped, 332 | dtype=tf.float32) 333 | # shape = (batch x sentence, 2 x char_hidden_size) 334 | output = tf.concat([output_fw, output_bw], axis=-1) 335 | 336 | # shape = (batch, sentence, 2 x char_hidden_size) 337 | char_rep = tf.reshape(output, shape=[-1, s[1], 2 * char_hidden_size]) 338 | 339 | # concat char embeddings 340 | 341 | word_embeddings = tf.nn.embedding_lookup(embedding_matrix, embedding_ids) 342 | #词向量+字符向量 343 | if self.config.use_chars == True: 344 | input_rnn = tf.concat([word_embeddings, char_rep], axis=-1) 345 | 346 | else: 347 | input_rnn = word_embeddings 348 | 349 | embeddings_input = input_rnn 350 | #计算损失,预测值 351 | lossNER, lossREL, predicted_entity_tags_ids, predictedRel, rel_scores = self.computeLoss(input_rnn, 352 | dropout_embedding_keep, 353 | dropout_lstm_keep, 354 | dropout_lstm_output_keep, seqlen, 355 | dropout_fcl_ner_keep, 356 | entity_tags_ids, dropout_fcl_rel_keep, 357 | is_train, 358 | scoring_matrix_gold, reuse=False) 359 | 360 | obj = tf.reduce_sum(lossNER) + tf.reduce_sum(lossREL) 361 | # 生成对抗样本 362 | raw_perturb = tf.gradients(obj, embeddings_input)[0] # [batch, L, dim] 363 | normalized_per = tf.nn.l2_normalize(raw_perturb, dim=[1, 2]) 364 | perturb = self.config.alpha * tf.sqrt(tf.cast(tf.shape(input_rnn)[2], tf.float32)) * tf.stop_gradient(normalized_per) 365 | perturb_inputs = embeddings_input + perturb #训练样本+对抗样本 366 | #计算训练样本+对抗样本 的损失 367 | lossNER_per, lossREL_per, _, _, _ = self.computeLoss(perturb_inputs, 368 | dropout_embedding_keep, 369 | dropout_lstm_keep, 370 | dropout_lstm_output_keep, seqlen, 371 | dropout_fcl_ner_keep, 372 | entity_tags_ids, dropout_fcl_rel_keep, 373 | is_train, 374 | scoring_matrix_gold, reuse=True) 375 | 376 | actualRel = tf.round(scoring_matrix_gold) 377 | 378 | if self.config.use_adversarial == True: 379 | obj += tf.reduce_sum(lossNER_per) + tf.reduce_sum(lossREL_per) 380 | 381 | m = {} 382 | m['isTrain'] = is_train 383 | m['embeddingIds'] = embedding_ids 384 | m['charIds'] = char_ids 385 | m['tokensLens'] = word_lengths 386 | m['entity_tags_ids'] = entity_tags_ids 387 | m['scoringMatrixGold'] = scoring_matrix_gold 388 | m['seqlen'] = seqlen 389 | m['doc_ids'] = doc_ids 390 | m['tokenIds'] = token_ids 391 | m['dropout_embedding'] = dropout_embedding_keep 392 | m['dropout_lstm'] = dropout_lstm_keep 393 | m['dropout_lstm_output'] = dropout_lstm_output_keep 394 | m['dropout_fcl_ner'] = dropout_fcl_ner_keep 395 | m['dropout_fcl_rel'] = dropout_fcl_rel_keep 396 | m['tokens'] = tokens 397 | m['BIO'] = BIO 398 | m['entity_tags'] = entity_tags 399 | 400 | return obj, m, predicted_entity_tags_ids, entity_tags_ids, predictedRel, actualRel, rel_scores 401 | 402 | 403 | class operations(): 404 | def __init__(self, train_step, obj, m_op, predicted_op_ner, actual_op_ner, predicted_op_rel, actual_op_rel, score_op_rel): 405 | self.train_step = train_step 406 | self.obj = obj 407 | self.m_op = m_op 408 | self.predicted_op_ner = predicted_op_ner 409 | self.actual_op_ner = actual_op_ner 410 | self.predicted_op_rel = predicted_op_rel 411 | self.actual_op_rel = actual_op_rel 412 | self.score_op_rel = score_op_rel 
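# A minimal NumPy sketch (illustrative only; fgm_perturbation is an assumed helper name and is
# not used anywhere in this repository) of the adversarial-example scaling performed above in
# run(): the gradient of the joint loss w.r.t. the word/char embeddings is L2-normalized per
# example over the (sequence, embedding) axes and rescaled by alpha * sqrt(embedding_dim)
# before being added back onto the embeddings.
def fgm_perturbation(grad, alpha, emb_dim):
    import numpy as np
    # mirrors tf.nn.l2_normalize(raw_perturb, dim=[1, 2]) followed by the alpha * sqrt(dim) scaling
    norm = np.sqrt(np.sum(grad ** 2, axis=(1, 2), keepdims=True)) + 1e-12
    return alpha * np.sqrt(emb_dim) * grad / norm


if __name__ == "__main__":
    import numpy as np
    g = np.random.randn(2, 7, 10)                    # [batch, seq_len, emb_dim]
    p = fgm_perturbation(g, alpha=0.01, emb_dim=10)
    # each example's perturbation ends up with L2 norm alpha * sqrt(emb_dim) ~= 0.0316
    print(np.linalg.norm(p.reshape(2, -1), axis=1))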
-------------------------------------------------------------------------------- /data_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import random 3 | import gensim 4 | import gzip 5 | import numpy as np 6 | import ast 7 | import copy 8 | import sys 9 | from sklearn.model_selection import train_test_split 10 | from prettytable import PrettyTable 11 | import re 12 | import tensorflow as tf 13 | 14 | """Generic set of classes and methods""" 15 | 16 | 17 | def strToLst(string): 18 | return ast.literal_eval(string) 19 | 20 | 21 | class HeadData: 22 | def __init__(self, data, indices): 23 | self.data = data 24 | self.indices = indices 25 | 26 | def split(self, fraction): 27 | 28 | data_train, data_test, idx_train, idx_test = train_test_split(self.data, self.indices, test_size=fraction, 29 | random_state=42) 30 | 31 | train = HeadData(data_train, idx_train) 32 | 33 | test = HeadData(data_test, idx_test) 34 | return train, test 35 | 36 | def transformToInitialInput(matrix,tags): 37 | active_relations = np.nonzero(matrix) 38 | active_relations_iidx = active_relations[0] 39 | active_relations_jidx = active_relations[1] 40 | 41 | tokens_ids = [] 42 | heads_ids = [] 43 | labels_ids = [] 44 | head_labels_ids = [] 45 | labels_name = [] 46 | 47 | for m_idx in range(len(matrix)): 48 | tokens_ids.append(m_idx) 49 | heads_ids.append([]) 50 | labels_ids.append([]) 51 | head_labels_ids.append([]) 52 | labels_name.append([]) 53 | 54 | for i_idx in range(len(active_relations_iidx)): 55 | head_id = int(active_relations_jidx[i_idx] / len(tags)) 56 | label_id = active_relations_jidx[i_idx] % len(tags) 57 | token_id = active_relations_iidx[i_idx] 58 | head_label_id = active_relations_jidx[i_idx] 59 | 60 | # idx=tokens_ids.index(token_id) 61 | heads_ids[token_id].append(head_id) 62 | labels_ids[token_id].append(label_id) 63 | head_labels_ids[token_id].append(head_label_id) 64 | labels_name[token_id].append(tags[label_id]) 65 | 66 | # print (str(token_id) + " " +str(head_label_id)+ " " +str(head)+ " " +str(label)) 67 | return tokens_ids, head_labels_ids, labels_ids, heads_ids, labels_name 68 | 69 | 70 | ###run one time to obtain the characters 71 | def getCharsFromDocuments(documents): 72 | chars = [] 73 | for doc in documents: 74 | for tokens in doc.tokens: 75 | for char in tokens: 76 | # print (token) 77 | chars.append(char) 78 | chars = list(set(chars)) 79 | chars.sort() 80 | return chars 81 | 82 | 83 | ###run one time to obtain the ner labels 84 | def getEntitiesFromDocuments(documents): 85 | BIOtags = [] 86 | ECtags = [] 87 | for doc in documents: 88 | for tag in doc.BIOs: 89 | BIOtags.append(tag) 90 | if tag.startswith("B-") or tag.startswith("I-"): 91 | ECtags.append(tag[2:]) 92 | else: 93 | ECtags.append(tag) 94 | 95 | BIOtags = list(set(BIOtags)) 96 | BIOtags.sort() 97 | ECtags = list(set(ECtags)) 98 | ECtags.sort() 99 | return BIOtags, ECtags 100 | 101 | 102 | def getECfromBIO(BIO_tag): 103 | if BIO_tag.startswith("B-") or BIO_tag.startswith("I-"): 104 | return (BIO_tag[2:]) 105 | else: 106 | return (BIO_tag) 107 | 108 | 109 | ###run one time to obtain the relations 110 | def getRelationsFromDocuments(documents): 111 | relations = [] 112 | for doc in documents: 113 | for relation_list in doc.relations: 114 | for relation in relation_list: 115 | relations.append(relation) 116 | 117 | relations = list(set(relations)) 118 | relations.sort() 119 | return relations 120 | 121 | 122 | def tokenToCharIds(token, characters): 123 | charIds 
= [] 124 | for char in token: 125 | charIds.append(characters.index(char)) 126 | return charIds 127 | 128 | 129 | def labelsListToIds(listofLabels, setofLabels): 130 | labelIds = [] 131 | for label in listofLabels: 132 | labelIds.append(setofLabels.index(label)) 133 | 134 | return labelIds 135 | 136 | 137 | def getScoringMatrixHeads(listofRelations, setofLabels, heads): 138 | scoringMatrixHeads = [] 139 | relationIds = labelsListToIds(listofRelations, setofLabels) 140 | 141 | 142 | for relIdx in range(len(relationIds)): 143 | # print (rels[relIdx]*getNumberOfClasses()+labelJointIds[relIdx]) 144 | scoringMatrixHeads.append(heads[relIdx] * len(setofLabels) + relationIds[relIdx]) 145 | return scoringMatrixHeads 146 | 147 | 148 | def getLabelId(label, setofLabels): 149 | return setofLabels.index(label) 150 | 151 | def strToBool(str): 152 | if str.lower() in ['true', '1']: 153 | return True 154 | return False 155 | 156 | 157 | 158 | def getEmbeddingId(word, embeddingsList): 159 | # modified method from http://cistern.cis.lmu.de/globalNormalization/globalNormalization_all.zip 160 | if word != "": 161 | if not word in embeddingsList: 162 | if re.search(r'^\d+$', word): 163 | word = "0" 164 | if word.islower(): 165 | word = word.title() 166 | else: 167 | word = word.lower() 168 | if not word in embeddingsList: 169 | word = "" 170 | curIndex = embeddingsList[word] 171 | return curIndex 172 | 173 | 174 | def readWordvectorsNumpy(wordvectorfile, isBinary=False): 175 | 176 | # modified method from http://cistern.cis.lmu.de/globalNormalization/globalNormalization_all.zip 177 | wordvectors = [] 178 | words = [] 179 | model = gensim.models.KeyedVectors.load_word2vec_format(wordvectorfile, binary=isBinary,unicode_errors='ignore') 180 | 181 | vectorsize = model.vector_size 182 | 183 | for key in list(model.vocab.keys()): 184 | wordvectors.append(model.wv[key]) 185 | words.append(key) 186 | 187 | zeroVec = [0 for i in range(vectorsize)] 188 | random.seed(123456) 189 | randomVec = [random.uniform(-np.sqrt(1. / len(wordvectors)), np.sqrt(1. 
/ len(wordvectors))) for i in 190 | range(vectorsize)] 191 | wordvectors.insert(0, randomVec) 192 | words.insert(0, "") 193 | wordvectors.insert(0, zeroVec) 194 | words.insert(0, "") 195 | 196 | wordvectorsNumpy = np.array(wordvectors) 197 | return wordvectorsNumpy, vectorsize, words 198 | 199 | 200 | def readIndices(wordvectorfile, isBinary=False): 201 | # modified method from http://cistern.cis.lmu.de/globalNormalization/globalNormalization_all.zip 202 | indices = {} 203 | curIndex = 0 204 | indices[""] = curIndex 205 | curIndex += 1 206 | indices[""] = curIndex 207 | curIndex += 1 208 | 209 | model = gensim.models.KeyedVectors.load_word2vec_format(wordvectorfile, binary=isBinary,unicode_errors='ignore') 210 | 211 | count = 0 212 | # c=0 213 | for key in list(model.vocab.keys()): 214 | indices[key] = curIndex 215 | curIndex += 1 216 | 217 | return indices 218 | 219 | 220 | 221 | def printParameters(config): 222 | 223 | t = PrettyTable(['Params', 'Value']) 224 | 225 | #dataset 226 | t.add_row(['Config', config.config_fname]) 227 | t.add_row(['Embeddings', config.filename_embeddings]) 228 | t.add_row(['Embeddings size ', config.representationsize]) 229 | t.add_row(['Train', config.filename_train]) 230 | t.add_row(['Dev', config.filename_dev]) 231 | t.add_row(['Test', config.filename_test]) 232 | 233 | #training 234 | t.add_row(['Epochs ', config.nepochs]) 235 | t.add_row(['Optimizer ', config.optimizer]) 236 | t.add_row(['Activation ', config.activation]) 237 | t.add_row(['Learning rate ', config.learning_rate]) 238 | t.add_row(['Gradient clipping ', config.gradientClipping]) 239 | t.add_row(['Patience ', config.nepoch_no_imprv]) 240 | t.add_row(['Use dropout', config.use_dropout]) 241 | t.add_row(['Ner loss ', config.ner_loss]) 242 | t.add_row(['Ner classes ', config.ner_classes]) 243 | t.add_row(['Use char embeddings ', config.use_chars]) 244 | t.add_row(['Use adversarial',config.use_adversarial]) 245 | 246 | # hyperparameters 247 | t.add_row(['Dropout embedding ', config.dropout_embedding]) 248 | t.add_row(['Dropout lstm ', config.dropout_lstm]) 249 | t.add_row(['Dropout lstm output ', config.dropout_lstm_output]) 250 | t.add_row(['Dropout fcl ner ', config.dropout_fcl_ner]) 251 | t.add_row(['Dropout fcl rel ', config.dropout_fcl_rel]) 252 | t.add_row(['Hidden lstm size ', config.hidden_size_lstm]) 253 | t.add_row(['LSTM layers ', config.num_lstm_layers]) 254 | t.add_row(['Hidden nn size ', config.hidden_size_n1]) 255 | t.add_row(['Char embeddings size ', config.char_embeddings_size]) 256 | t.add_row(['Hidden size char ', config.hidden_size_char]) 257 | t.add_row(['Label embeddings size ', config.label_embeddings_size]) 258 | t.add_row(['Alpha ', config.alpha]) 259 | t.add_row(['Root node ', config.root_node]) 260 | 261 | #evaluation 262 | t.add_row(['Evaluation method ', config.evaluation_method]) 263 | 264 | 265 | print(t) 266 | 267 | def getSegmentationDict(lst): 268 | return {k: v for v, k in enumerate(lst)} 269 | 270 | def generator(data, m,config,train=False): 271 | # generate the data 272 | embeddingIds = m['embeddingIds'] 273 | isTrain=m['isTrain'] 274 | 275 | scoringMatrixGold = m['scoringMatrixGold'] 276 | BIO = m['BIO'] # always the BIO tags 277 | entity_tags=m['entity_tags'] # either the BIO tags or the EC tags - depends on the NER target values 278 | entity_tags_ids = m['entity_tags_ids'] 279 | tokens = m['tokens'] 280 | tokenIds = m['tokenIds'] 281 | charIds = m['charIds'] 282 | tokensLens = m['tokensLens'] 283 | 284 | seqlen = m['seqlen'] 285 | doc_ids=m['doc_ids'] 286 | 
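    # Overview of generator(): documents are grouped into batches of config.batchsize
    # (optionally shuffled when config.shuffle is True). For every document a binary scoring
    # matrix of shape [seq_len, seq_len * len(config.dataset_set_relations)] is built from
    # doc.joint_ids (a 1 at column head_id * |relations| + relation_id for each annotated head).
    # Each batch is then padded to its longest sentence and longest word (see the padding loop
    # further down) before being yielded as a feed_dict for the placeholders collected in m.
    # Dropout keep probabilities are set to the configured values only when train=True;
    # otherwise they remain 1.0.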
287 | 288 | dropout_embedding_keep = m['dropout_embedding'] 289 | dropout_lstm_keep = m['dropout_lstm'] 290 | dropout_lstm_output_keep = m['dropout_lstm_output'] 291 | dropout_fcl_ner_keep = m['dropout_fcl_ner'] 292 | dropout_fcl_rel_keep = m['dropout_fcl_rel'] 293 | 294 | dropout_embedding_prob = 1 295 | dropout_lstm_prob = 1 296 | dropout_lstm_output_prob = 1 297 | dropout_fcl_ner_prob = 1 298 | dropout_fcl_rel_prob = 1 299 | 300 | if config.use_dropout == True and train==True: 301 | 302 | dropout_embedding_prob = config.dropout_embedding 303 | dropout_lstm_prob = config.dropout_lstm 304 | dropout_lstm_output_prob = config.dropout_lstm_output 305 | dropout_fcl_ner_prob = config.dropout_fcl_ner 306 | dropout_fcl_rel_prob = config.dropout_fcl_rel 307 | 308 | data_copy = copy.deepcopy(data) 309 | # train_ind=np.arange(len(train.data)) 310 | #将样本数据打乱 311 | if config.shuffle == True: 312 | shuffled_data, _, shuffled_data_idx, _ = train_test_split(data_copy.data, data_copy.indices, test_size=0, 313 | random_state=42) 314 | # shuffled_data, _, shuffled_data_idx, _ = train_test_split(data_copy.data, data_copy.indices, test_size=0,random_state=42) 315 | 316 | data_copy = HeadData(shuffled_data, shuffled_data_idx) 317 | # print ("shuffle:"+ str(shuffle) ) 318 | # print(data_copy.indices) 319 | else: 320 | 321 | data_copy = HeadData(data_copy.data, data_copy.indices) 322 | # data_copy = HeadData(data_copy.data, data_copy.indices) 323 | 324 | # print("shuffle:" + str(shuffle)) 325 | # print(data_copy.indices) 326 | 327 | # batchsize=16 # number of documents per batch 328 | batches_embeddingIds = [] # e.g., 131 batches 329 | batches_charIds = [] # e.g., 131 batches 330 | batches_scoringMatrixHeadIds = [] # e.g., 131 batches 331 | batches_scoringMatrix = [] # e.g., 131 batches 332 | batches_tokens = [] 333 | 334 | batches_entity_tags = [] 335 | batches_entity_tags_ids = [] 336 | batches_BIO=[] 337 | batches_tokenIds = [] 338 | batches_doc_ids = [] 339 | 340 | docs_batch_embeddingIds = [] # e.g., 587 max doc length - complete with -1 when the size of the doc is smaller 341 | docs_batch_charIds = [] # e.g., 587 max doc length - complete with -1 when the size of the doc is smaller 342 | docs_batch_scoringMatrixHeadIds = [] 343 | docs_batch_scoringMatrix = [] 344 | 345 | docs_batch_entity_tags=[] 346 | docs_batch_entity_tags_ids = [] 347 | 348 | docs_batch_tokens = [] 349 | 350 | docs_batch_BIO = [] 351 | docs_batch_tokenIds = [] 352 | docs_batch_doc_ids = [] 353 | 354 | maxScoringMatrixLenList0 = [] 355 | maxScoringMatrixLen0 = -1 356 | maxScoringMatrixLenList1 = [] 357 | maxScoringMatrixLen1 = -1 358 | 359 | maxDocLenList = [] 360 | maxSentenceLen = -1 361 | 362 | maxWordLenList = [] 363 | maxWordLen = -1 364 | 365 | wordLenList = [] 366 | wordLens = [] 367 | 368 | lenBatchesDoc = [] 369 | lenEmbeddingssDoc = [] 370 | 371 | lenBatchesChars = [] 372 | lenCharsDoc = [] 373 | 374 | sumLen = 0 375 | for docIdx in range(len(data_copy.data)): 376 | doc = data_copy.data[docIdx] 377 | # print (doc) 378 | if docIdx % config.batchsize == 0 and docIdx > 0: 379 | # print (docIdx) 380 | # print ("new batch") 381 | batches_embeddingIds.append(docs_batch_embeddingIds) 382 | batches_charIds.append(docs_batch_charIds) 383 | 384 | batches_scoringMatrixHeadIds.append(docs_batch_scoringMatrixHeadIds) 385 | batches_scoringMatrix.append(docs_batch_scoringMatrix) 386 | batches_entity_tags.append(docs_batch_entity_tags) 387 | batches_entity_tags_ids.append(docs_batch_entity_tags_ids) 388 | 389 | 
batches_tokens.append(docs_batch_tokens) 390 | 391 | batches_BIO.append(docs_batch_BIO) 392 | batches_tokenIds.append(docs_batch_tokenIds) 393 | batches_doc_ids.append(docs_batch_doc_ids) 394 | 395 | docs_batch_embeddingIds = [] # e.g., 587 max doc length - complete with -1 when the size of the doc is smaller 396 | docs_batch_charIds = [] # e.g., 587 max doc length - complete with -1 when the size of the doc is smaller 397 | docs_batch_scoringMatrixHeadIds = [] 398 | docs_batch_scoringMatrix = [] 399 | 400 | docs_batch_tokens = [] 401 | 402 | docs_batch_entity_tags = [] 403 | docs_batch_entity_tags_ids = [] 404 | docs_batch_BIO = [] 405 | docs_batch_tokenIds = [] 406 | docs_batch_doc_ids = [] 407 | 408 | maxDocLenList.append(maxSentenceLen) 409 | maxSentenceLen = -1 410 | 411 | maxScoringMatrixLenList0.append(maxScoringMatrixLen0) 412 | maxScoringMatrixLen0 = -1 413 | 414 | maxScoringMatrixLenList1.append(maxScoringMatrixLen1) 415 | maxScoringMatrixLen1 = -1 416 | 417 | maxWordLenList.append(maxWordLen) 418 | maxWordLen = -1 419 | 420 | wordLenList.append(wordLens) 421 | 422 | 423 | 424 | if len(doc.token_ids) > maxSentenceLen: 425 | maxSentenceLen = len(doc.token_ids) 426 | 427 | longest_token_list=max(doc.char_ids, key=len) 428 | if len(longest_token_list) > maxWordLen: 429 | maxWordLen = len(longest_token_list) 430 | 431 | wordLens=[len(token) for token in doc.char_ids] 432 | 433 | 434 | sumLen += len(doc.token_ids) 435 | docs_batch_embeddingIds.append(doc.embedding_ids) 436 | docs_batch_charIds.append(doc.char_ids) 437 | docs_batch_scoringMatrixHeadIds.append(doc.joint_ids) 438 | 439 | scoringMatrix = np.zeros((len(doc.joint_ids), len(doc.joint_ids) *len(config.dataset_set_relations) )) 440 | if scoringMatrix.shape[0] > maxScoringMatrixLen0: 441 | maxScoringMatrixLen0 = scoringMatrix.shape[0] 442 | if scoringMatrix.shape[1] > maxScoringMatrixLen1: 443 | maxScoringMatrixLen1 = scoringMatrix.shape[1] 444 | 445 | 446 | for tokenIdx in range(len(doc.joint_ids)): 447 | tokenHeads = doc.joint_ids[tokenIdx] 448 | for head in tokenHeads: 449 | # print (str(tokenIdx)+ " "+ str(head)) 450 | scoringMatrix[tokenIdx, head] = 1 451 | 452 | docs_batch_scoringMatrix.append(scoringMatrix.tolist()) 453 | # print (scoringMatrix) 454 | 455 | #print (doc.jlabel_names) 456 | if config.ner_classes=="BIO": 457 | docs_batch_entity_tags.append(doc.BIOs)##to do 458 | docs_batch_entity_tags_ids.append(doc.BIO_ids) 459 | 460 | elif config.ner_classes=="EC": 461 | docs_batch_entity_tags.append(doc.ecs)##to do 462 | docs_batch_entity_tags_ids.append(doc.ec_ids) 463 | 464 | docs_batch_tokens.append(doc.tokens) 465 | 466 | docs_batch_BIO.append(doc.BIOs)##to do 467 | docs_batch_tokenIds.append(doc.token_ids) 468 | docs_batch_doc_ids.append(doc.docId) 469 | if docIdx == len( 470 | data_copy.data) - 1: ## if there are no documents left - append the batch - usually it is shorter batch 471 | batches_embeddingIds.append(docs_batch_embeddingIds) 472 | batches_charIds.append(docs_batch_charIds) 473 | batches_scoringMatrixHeadIds.append(docs_batch_scoringMatrixHeadIds) 474 | batches_scoringMatrix.append(docs_batch_scoringMatrix) 475 | 476 | batches_entity_tags.append(docs_batch_entity_tags) 477 | batches_entity_tags_ids.append(docs_batch_entity_tags_ids) 478 | batches_tokens.append(docs_batch_tokens) 479 | 480 | batches_BIO.append(docs_batch_BIO) 481 | batches_tokenIds.append(docs_batch_tokenIds) 482 | batches_doc_ids.append(docs_batch_doc_ids) 483 | maxDocLenList.append(maxSentenceLen) 484 | 
maxScoringMatrixLenList0.append(maxScoringMatrixLen0) 485 | maxScoringMatrixLenList1.append(maxScoringMatrixLen1) 486 | maxWordLenList.append(maxWordLen) 487 | wordLenList.append(wordLens) 488 | # maxDocLen.append(maxWordLen) 489 | 490 | # 按最长维度填充 491 | for bIdx in range(len(batches_embeddingIds)): 492 | 493 | batch_embeddingIds = batches_embeddingIds[bIdx] 494 | batch_charIds = batches_charIds[bIdx] 495 | batch_scoringMatrixHeadIds = batches_scoringMatrixHeadIds[bIdx] 496 | batch_scoringMatrix = batches_scoringMatrix[bIdx] 497 | 498 | batch_entity_tags = batches_entity_tags[bIdx] 499 | batch_entity_tags_ids = batches_entity_tags_ids[bIdx] 500 | batch_tokens = batches_tokens[bIdx] 501 | 502 | batch_tokenIds = batches_tokenIds[bIdx] 503 | 504 | for dIdx in range(len(batch_embeddingIds)): 505 | embeddingId_doc = batch_embeddingIds[dIdx] 506 | charIds_doc = batch_charIds[dIdx] 507 | scoringMatrixHeadId_doc = batch_scoringMatrixHeadIds[dIdx] 508 | scoringMatrix_doc = batch_scoringMatrix[dIdx] 509 | 510 | ner_doc=batch_entity_tags[dIdx] 511 | ner_doc_ids=batch_entity_tags_ids[dIdx] 512 | token_doc = batch_tokens[dIdx] 513 | 514 | token_id_doc = batch_tokenIds[dIdx] 515 | 516 | lenEmbeddingssDoc.append(len(embeddingId_doc)) 517 | tokensLen=[len(token) for token in charIds_doc]#每个句子中单词长度的列表 518 | lenCharsDoc.append(tokensLen) 519 | 520 | 521 | for tokenIdx in range(len(tokensLen)): 522 | tokenLen=tokensLen[tokenIdx] 523 | 524 | if tokenLen= ner[1] and rel[0] <= ner[2]: 94 | # print (ner) 95 | if relationTuple == "boundaries_type": 96 | left_chunk = ner 97 | elif relationTuple == "boundaries": 98 | left_chunk = (ner[1], ner[2]) 99 | elif relationTuple == "type": 100 | left_chunk = (ner[0]) 101 | if rel[2] >= ner[1] and rel[2] <= ner[2]: 102 | # print (ner) 103 | if relationTuple == "boundaries_type": 104 | right_chunk = ner 105 | elif relationTuple == "boundaries": 106 | right_chunk = (ner[1], ner[2]) 107 | elif relationTuple == "type": 108 | right_chunk = (ner[0]) 109 | if (left_chunk != "" and right_chunk != ""): 110 | relationChunks.append((left_chunk, relation, right_chunk)) 111 | return relationChunks 112 | 113 | def getTokenRelations(label_names, head_ids, token_ids): 114 | relations = [] 115 | for labelLIdx in range(len(label_names)): 116 | # print (predLabel) 117 | labelL = label_names[labelLIdx] 118 | headL = head_ids[labelLIdx] 119 | tokenId = token_ids[labelLIdx] 120 | for labelIdx in range(len(labelL)): 121 | 122 | label = labelL[labelIdx] 123 | head = headL[labelIdx] 124 | # print (label) 125 | # print ((tokenId)+" "+ label+ " " + head) 126 | if label != "N": 127 | # print (label) 128 | relations.append((tokenId, label, head)) 129 | # print (tokenId,label,head) 130 | return relations 131 | 132 | 133 | def keepOnlyChunkBoundaries(ners): 134 | nersNoBounds = [] 135 | ners = list(ners) 136 | for ner in ners: 137 | # ner[0]=None 138 | # print (ner) 139 | nersNoBounds.append((None, ner[1], ner[2])) 140 | return nersNoBounds 141 | 142 | class chunkEvaluator: 143 | def __init__(self,config,ner_chunk_eval="boundaries_type",rel_chunk_eval="boundaries"): 144 | self.nerSegmentationTags=config.dataset_set_bio_tags 145 | 146 | self.NERset = config.dataset_set_ec_tags 147 | self.RELset = config.dataset_set_relations 148 | 149 | self.root_node=config.root_node 150 | 151 | 152 | self.ner_chunk_eval=ner_chunk_eval 153 | self.rel_chunk_eval=rel_chunk_eval 154 | 155 | 156 | self.totals = 0 157 | self.oks = 0 158 | 159 | self.tpsNER = 0 160 | self.fpsNER = 0 161 | self.fnsNER = 0 162 | 163 | self.tpsREL 
= 0 164 | self.fpsREL = 0 165 | self.fnsREL = 0 166 | 167 | self.tpsClassesNER = dict.fromkeys(self.NERset, 0) 168 | self.fpsClassesNER = dict.fromkeys(self.NERset, 0) 169 | self.fnsClassesNER = dict.fromkeys(self.NERset, 0) 170 | self.precisionNER = dict.fromkeys(self.NERset, 0) 171 | self.recallNER = dict.fromkeys(self.NERset, 0) 172 | self.f1NER = dict.fromkeys(self.NERset, 0) 173 | 174 | self.tpsClassesREL = dict.fromkeys(self.RELset, 0) 175 | self.fpsClassesREL = dict.fromkeys(self.RELset, 0) 176 | self.fnsClassesREL = dict.fromkeys(self.RELset, 0) 177 | 178 | self.precisionREL = dict.fromkeys(self.RELset, 0) 179 | self.recallREL = dict.fromkeys(self.RELset, 0) 180 | self.f1REL = dict.fromkeys(self.RELset, 0) 181 | 182 | self.correct_predsNER, self.total_correctNER, self.total_predsNER = 0., 0., 0. 183 | self.correct_predsREL, self.total_correctREL, self.total_predsREL = 0., 0., 0. 184 | 185 | def add(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL): 186 | 187 | 188 | 189 | for batch_idx in range(len(pred_batchesNER)): 190 | predNER = pred_batchesNER[batch_idx] 191 | trueNER = true_batchesNER[batch_idx] 192 | 193 | predRel = pred_batchesREL[batch_idx] 194 | trueRel = true_batchesREL[batch_idx] 195 | 196 | ptoken_ids, _, plabel_ids, phead_ids, plabel_names = data_utils.transformToInitialInput( 197 | predRel, self.RELset) 198 | 199 | _, _, tlabel_ids, thead_ids, tlabel_names = data_utils.transformToInitialInput( 200 | trueRel, self.RELset) 201 | 202 | trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids) 203 | 204 | predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids) 205 | 206 | tagsNER = data_utils.getSegmentationDict(self.nerSegmentationTags)#self. 207 | 208 | if self.ner_chunk_eval == "boundaries_type": 209 | 210 | lab_chunks = set(get_chunks(trueNER, tagsNER)) 211 | lab_pred_chunks = set(get_chunks(predNER, tagsNER)) 212 | 213 | elif self.ner_chunk_eval == "boundaries": 214 | 215 | lab_chunks = set(keepOnlyChunkBoundaries(set(get_chunks(trueNER, tagsNER)))) 216 | lab_pred_chunks = set(keepOnlyChunkBoundaries(set(get_chunks(predNER, tagsNER)))) 217 | 218 | 219 | 220 | 221 | lab_chunks_list = list(lab_chunks) 222 | lab_pred_chunks_list = list(lab_pred_chunks) 223 | 224 | 225 | if self.ner_chunk_eval == "boundaries_type": 226 | for lab_idx in range(len(lab_pred_chunks_list)): 227 | 228 | if lab_pred_chunks_list[lab_idx] in lab_chunks_list: 229 | # print (lab_pred_chunks_list[lab_idx][0]) 230 | self.tpsClassesNER[lab_pred_chunks_list[lab_idx][0]] += 1 231 | else: 232 | self.fpsClassesNER[lab_pred_chunks_list[lab_idx][0]] += 1 233 | # fnsEntitiesNER+=1 234 | 235 | for lab_idx in range(len(lab_chunks_list)): 236 | 237 | if lab_chunks_list[lab_idx] not in lab_pred_chunks_list: 238 | self.fnsClassesNER[lab_chunks_list[lab_idx][0]] += 1 239 | 240 | elif self.ner_chunk_eval == "boundaries": 241 | for lab_idx in range(len(lab_pred_chunks_list)): 242 | 243 | if lab_pred_chunks_list[lab_idx] in lab_chunks_list: 244 | # print (lab_pred_chunks_list[lab_idx][0]) 245 | self.tpsNER += 1 246 | else: 247 | self.fpsNER += 1 248 | # fnsEntitiesNER+=1 249 | 250 | for lab_idx in range(len(lab_chunks_list)): 251 | 252 | if lab_chunks_list[lab_idx] not in lab_pred_chunks_list: 253 | self.fnsNER += 1 254 | 255 | if self.root_node==True: 256 | lab_chunks_list_with_ROOT = copy.deepcopy(lab_chunks_list) 257 | lab_chunks_list_with_ROOT.append((None,0, 0)) 258 | 259 | lab_pred_chunks_list_with_ROOT = copy.deepcopy(lab_pred_chunks_list) 260 | 
lab_pred_chunks_list_with_ROOT.append((None,0, 0)) 261 | 262 | relTrue = set(relationChunks(trueRel, lab_chunks_list_with_ROOT, relationTuple=self.rel_chunk_eval)) 263 | 264 | relPred = set(relationChunks(predRel, lab_pred_chunks_list_with_ROOT, relationTuple=self.rel_chunk_eval)) 265 | 266 | else: 267 | relTrue = set(relationChunks(trueRel, lab_chunks_list,relationTuple=self.rel_chunk_eval)) 268 | 269 | relPred = set(relationChunks(predRel, lab_pred_chunks_list,relationTuple=self.rel_chunk_eval)) 270 | 271 | relTrueList = list(relTrue) # trueRel# 272 | 273 | # if (len(trueRel)!=len(relTrueList)): 274 | # print ("warning") 275 | 276 | relPredList = list(relPred) # predRel# 277 | 278 | for lab_idx in range(len(relPredList)): 279 | 280 | if relPredList[lab_idx] in relTrueList: 281 | # print (lab_pred_chunks_list[lab_idx][0]) 282 | self.tpsClassesREL[relPredList[lab_idx][1]] += 1 283 | # print (relPredList[lab_idx]) 284 | else: 285 | self.fpsClassesREL[relPredList[lab_idx][1]] += 1 286 | # fnsEntitiesNER+=1 287 | 288 | for lab_idx in range(len(relTrueList)): 289 | 290 | if relTrueList[lab_idx] not in relPredList: 291 | self.fnsClassesREL[relTrueList[lab_idx][1]] += 1 292 | 293 | self.correct_predsNER += len(lab_chunks & lab_pred_chunks) 294 | self.total_predsNER += len(lab_pred_chunks) 295 | self.total_correctNER += len(lab_chunks) 296 | 297 | self.correct_predsREL += len(relTrue & relPred) 298 | self.total_predsREL += len(relPred) 299 | self.total_correctREL += len(relTrue) 300 | 301 | 302 | 303 | def getResultsNER(self): 304 | p = self.correct_predsNER / self.total_predsNER if self.correct_predsNER > 0 else 0 305 | r = self.correct_predsNER / self.total_correctNER if self.correct_predsNER > 0 else 0 306 | f1 = 2 * p * r / (p + r) if self.correct_predsNER > 0 else 0 307 | 308 | print(self.correct_predsNER) 309 | print(self.total_predsNER) 310 | print(self.total_correctNER) 311 | 312 | print(f1) 313 | return f1 314 | 315 | def getResultsREL(self): 316 | p = self.correct_predsREL / self.total_predsREL if self.correct_predsREL > 0 else 0 317 | r = self.correct_predsREL / self.total_correctREL if self.correct_predsREL > 0 else 0 318 | f1 = 2 * p * r / (p + r) if self.correct_predsREL > 0 else 0 319 | 320 | print(self.correct_predsREL) 321 | print(self.total_predsREL) 322 | print(self.total_correctREL) 323 | 324 | print(f1) 325 | return f1 326 | 327 | def getPrecision(self, tps, fps): 328 | if tps == 0: 329 | return 0 330 | else: 331 | return tps / (tps + fps) 332 | 333 | def getRecall(self, tps, fns): 334 | if tps == 0: 335 | return 0 336 | else: 337 | return tps / (tps + fns) 338 | 339 | def getF1(self, tps, fps, fns): 340 | if tps == 0: 341 | return 0 342 | else: 343 | return 2 * self.getPrecision(tps, fps) * self.getRecall(tps, fns) / ( 344 | self.getPrecision(tps, fps) + self.getRecall(tps, fns)) 345 | 346 | 347 | def getChunkedOverallAvgF1(self): 348 | 349 | 350 | return (self.getChunkedNERF1()+self.getChunkedRELF1())/2 351 | 352 | def getChunkedOverallF1(self): 353 | tpsNER=0 354 | fnsNER=0 355 | fpsNER=0 356 | tpsREL=0 357 | fnsREL=0 358 | fpsREL=0 359 | if self.ner_chunk_eval == "boundaries_type": 360 | for label in self.NERset: 361 | # if label != "O" : 362 | tpsNER += self.tpsClassesNER[label] 363 | 364 | fnsNER += self.fnsClassesNER[label] 365 | fpsNER += self.fpsClassesNER[label] 366 | elif self.ner_chunk_eval == "boundaries": 367 | tpsNER=self.tpsNER 368 | fnsNER = self.fnsNER 369 | fpsNER = self.fpsNER 370 | 371 | 372 | for label in self.RELset: 373 | 374 | if label != "N": 375 | 
tpsREL += self.tpsClassesREL[label] 376 | 377 | fnsREL += self.fnsClassesREL[label] 378 | fpsREL += self.fpsClassesREL[label] 379 | 380 | 381 | 382 | return self.getF1(tpsNER+tpsREL, fpsNER+fpsREL, fnsNER+fnsREL) 383 | 384 | 385 | def getOverallF1(self): 386 | tpsNER=0 387 | fnsNER=0 388 | fpsNER=0 389 | tpsREL=0 390 | fnsREL=0 391 | fpsREL=0 392 | 393 | for label in self.NERset: 394 | # if label != "O" : 395 | tpsNER += self.tpsClassesNER[label] 396 | 397 | fnsNER += self.fnsClassesNER[label] 398 | fpsNER += self.fpsClassesNER[label] 399 | 400 | for label in self.RELset: 401 | 402 | if label != "N": 403 | tpsREL += self.tpsClassesREL[label] 404 | 405 | fnsREL += self.fnsClassesREL[label] 406 | fpsREL += self.fpsClassesREL[label] 407 | 408 | 409 | 410 | return self.getF1(tpsNER+tpsREL, fpsNER+fpsREL, fnsNER+fnsREL) 411 | 412 | def getChunkedRELF1(self): 413 | 414 | tpsREL=0 415 | fnsREL=0 416 | fpsREL=0 417 | 418 | 419 | 420 | for label in self.RELset: 421 | 422 | if label != "N": 423 | tpsREL += self.tpsClassesREL[label] 424 | 425 | fnsREL += self.fnsClassesREL[label] 426 | fpsREL += self.fpsClassesREL[label] 427 | 428 | 429 | 430 | return self.getF1(tpsREL, fpsREL, fnsREL) 431 | 432 | def getChunkedNERF1(self): 433 | tpsNER = 0 434 | fnsNER = 0 435 | fpsNER = 0 436 | if self.ner_chunk_eval == "boundaries_type": 437 | 438 | 439 | for label in self.NERset: 440 | # if label != "O" : 441 | tpsNER += self.tpsClassesNER[label] 442 | 443 | fnsNER += self.fnsClassesNER[label] 444 | fpsNER += self.fpsClassesNER[label] 445 | 446 | 447 | elif self.ner_chunk_eval== "boundaries": 448 | tpsNER =self.tpsNER 449 | fnsNER = self.fnsNER 450 | fpsNER = self.fpsNER 451 | 452 | return self.getF1(tpsNER, fpsNER, fnsNER) 453 | def getAccuracy(self): 454 | return self.oks / self.totals 455 | 456 | def printInfo(self): 457 | 458 | printer = printClasses() 459 | 460 | if self.ner_chunk_eval== "boundaries_type": 461 | for label in self.NERset: 462 | # if label != "O" : 463 | self.tpsNER += self.tpsClassesNER[label] 464 | 465 | self.fnsNER += self.fnsClassesNER[label] 466 | self.fpsNER += self.fpsClassesNER[label] 467 | 468 | printer.add(label, self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label], 469 | self.getPrecision(self.tpsClassesNER[label], self.fpsClassesNER[label]), 470 | self.getRecall(self.tpsClassesNER[label], self.fnsClassesNER[label]), 471 | self.getF1(self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label])) 472 | 473 | 474 | 475 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 476 | printer.add("-", "-", "-", "-", 477 | "-", "-", 478 | "-") 479 | printer.add("Micro NER chunk", self.tpsNER, self.fpsNER, self.fnsNER, 480 | self.getPrecision(self.tpsNER, self.fpsNER), self.getRecall(self.tpsNER, self.fnsNER), 481 | self.getF1(self.tpsNER, self.fpsNER, self.fnsNER)) 482 | 483 | elif self.ner_chunk_eval== "boundaries": 484 | printer.add("Micro NER chunk boundaries", self.tpsNER, self.fpsNER, self.fnsNER, 485 | self.getPrecision(self.tpsNER, self.fpsNER), self.getRecall(self.tpsNER, self.fnsNER), 486 | self.getF1(self.tpsNER, self.fpsNER, self.fnsNER)) 487 | printer.print() 488 | 489 | printer = printClasses() 490 | for label in self.RELset: 491 | 492 | if label != "N": 493 | self.tpsREL += self.tpsClassesREL[label] 494 | 495 | self.fnsREL += self.fnsClassesREL[label] 
496 | self.fpsREL += self.fpsClassesREL[label] 497 | 498 | printer.add(label, self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label], 499 | self.getPrecision(self.tpsClassesREL[label], self.fpsClassesREL[label]), 500 | self.getRecall(self.tpsClassesREL[label], self.fnsClassesREL[label]), 501 | self.getF1(self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label])) 502 | 503 | 504 | 505 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 506 | printer.add("-", "-", "-", "-", 507 | "-", "-", 508 | "-") 509 | printer.add("Micro REL chunk", self.tpsREL, self.fpsREL, self.fnsREL, 510 | self.getPrecision(self.tpsREL, self.fpsREL), self.getRecall(self.tpsREL, self.fnsREL), 511 | self.getF1(self.tpsREL, self.fpsREL, self.fnsREL)) 512 | 513 | printer.print() 514 | 515 | 516 | def getMaxOccurence(lst): 517 | from collections import Counter 518 | most_common, num_most_common = Counter(lst).most_common(1)[0] # 4, 6 times 519 | return most_common 520 | 521 | 522 | def classesToChunks(tokenClasses, chunks): 523 | labeled_chunks = [] 524 | for chunk in chunks: 525 | 526 | class_list = (tokenClasses[chunk[1]:chunk[2] + 1]) 527 | 528 | if chunk[0] in class_list: 529 | labeled_chunks.append((chunk[0], chunk[1], chunk[2])) 530 | else: 531 | labeled_chunks.append((getMaxOccurence(class_list), chunk[1], chunk[2])) 532 | # print (class_list) 533 | return labeled_chunks 534 | 535 | 536 | def listOfTagsToids(lstTags,tags): 537 | lstids = [] 538 | for ner in lstTags: 539 | lstids.append(tags.index(ner)) 540 | 541 | return lstids 542 | 543 | def listOfIdsToTags(lst_ids,tags): 544 | lstTags= [] 545 | for nerId in lst_ids: 546 | lstTags.append(tags[nerId]) 547 | return lstTags 548 | 549 | class relaxedChunkEvaluator: 550 | def __init__(self,dataset_params,rel_chunk_eval="boundaries"): 551 | self.nerSegmentationTags=dataset_params.dataset_set_bio_tags 552 | 553 | self.NERset = dataset_params.dataset_set_ec_tags#utils.getNerSetACE04() 554 | self.RELset = dataset_params.dataset_set_relations#reutils.getRelSetACE04() 555 | #self.nerDict=dataset_params 556 | # print (self.NERset) 557 | self.rel_chunk_eval=rel_chunk_eval 558 | self.totals = 0 559 | self.oks = 0 560 | 561 | self.tpsNER = 0 562 | self.fpsNER = 0 563 | self.fnsNER = 0 564 | 565 | self.tpsREL = 0 566 | self.fpsREL = 0 567 | self.fnsREL = 0 568 | 569 | self.tpsNERMacro = 0 570 | self.fpsNERMacro = 0 571 | self.fnsNERMacro = 0 572 | 573 | self.tpsNERMacro_no_other = 0 574 | self.fpsNERMacro_no_other = 0 575 | self.fnsNERMacro_no_other = 0 576 | 577 | self.tpsRELMacro = 0 578 | self.fpsRELMacro = 0 579 | self.fnsRELMacro = 0 580 | 581 | 582 | self.NERF1Macro=0 583 | self.NERF1Macro_no_other = 0 584 | self.RELF1Macro = 0 585 | self.OverallF1Macro = 0 586 | self.OverallF1Macro_no_other = 0 587 | 588 | 589 | self.tpsClassesNER = dict.fromkeys(self.NERset, 0) 590 | self.fpsClassesNER = dict.fromkeys(self.NERset, 0) 591 | self.fnsClassesNER = dict.fromkeys(self.NERset, 0) 592 | self.precisionNER = dict.fromkeys(self.NERset, 0) 593 | self.recallNER = dict.fromkeys(self.NERset, 0) 594 | self.f1NER = dict.fromkeys(self.NERset, 0) 595 | 596 | self.tpsClassesREL = dict.fromkeys(self.RELset, 0) 597 | self.fpsClassesREL = dict.fromkeys(self.RELset, 0) 598 | self.fnsClassesREL = dict.fromkeys(self.RELset, 0) 599 | 600 | self.precisionREL = 
dict.fromkeys(self.RELset, 0) 601 | self.recallREL = dict.fromkeys(self.RELset, 0) 602 | self.f1REL = dict.fromkeys(self.RELset, 0) 603 | 604 | self.correct_predsNER, self.total_correctNER, self.total_predsNER = 0., 0., 0. 605 | self.correct_predsREL, self.total_correctREL, self.total_predsREL = 0., 0., 0. 606 | 607 | def add(self, pred_batchesNER, true_batchesNER, pred_batchesREL, true_batchesREL,true_batchesBIONER): 608 | 609 | 610 | 611 | for batch_idx in range(len(pred_batchesNER)): 612 | predNER = pred_batchesNER[batch_idx] 613 | trueNER = true_batchesNER[batch_idx] 614 | 615 | predRel = pred_batchesREL[batch_idx] 616 | trueRel = true_batchesREL[batch_idx] 617 | 618 | trueBIONER=true_batchesBIONER[batch_idx] 619 | 620 | 621 | ptoken_ids, _, plabel_ids, phead_ids, plabel_names = data_utils.transformToInitialInput( 622 | predRel, self.RELset) 623 | 624 | _, _, tlabel_ids, thead_ids, tlabel_names = data_utils.transformToInitialInput( 625 | trueRel, self.RELset) 626 | 627 | trueRel = getTokenRelations(tlabel_names, thead_ids, ptoken_ids) 628 | 629 | predRel = getTokenRelations(plabel_names, phead_ids, ptoken_ids) 630 | 631 | 632 | #print (self.NERset) 633 | tagsNER = data_utils.getSegmentationDict(self.nerSegmentationTags)#self. 634 | 635 | 636 | 637 | lab_chunks_ = set(get_chunks(listOfTagsToids(trueBIONER,self.nerSegmentationTags), tagsNER)) 638 | #lab_pred_chunks = set(get_chunks(predNER, tagsNER)) 639 | 640 | lab_chunks_list_ = list(lab_chunks_) 641 | 642 | 643 | trueNER_tags=listOfIdsToTags(trueNER,self.NERset) 644 | predNER_tags=listOfIdsToTags(predNER, self.NERset) 645 | 646 | lab_chunks = set(classesToChunks(trueNER_tags, lab_chunks_list_)) 647 | lab_pred_chunks=set(classesToChunks(predNER_tags, lab_chunks_list_)) 648 | 649 | lab_chunks_list = list(lab_chunks) 650 | lab_pred_chunks_list = list(lab_pred_chunks) 651 | 652 | 653 | for lab_idx in range(len(lab_pred_chunks_list)): 654 | 655 | if lab_pred_chunks_list[lab_idx] in lab_chunks_list: 656 | # print (lab_pred_chunks_list[lab_idx][0]) 657 | self.tpsClassesNER[lab_pred_chunks_list[lab_idx][0]] += 1 658 | else: 659 | self.fpsClassesNER[lab_pred_chunks_list[lab_idx][0]] += 1 660 | # fnsEntitiesNER+=1 661 | 662 | for lab_idx in range(len(lab_chunks_list)): 663 | 664 | if lab_chunks_list[lab_idx] not in lab_pred_chunks_list: 665 | self.fnsClassesNER[lab_chunks_list[lab_idx][0]] += 1 666 | 667 | relTrue = set(relationChunks(trueRel, lab_chunks_list,relationTuple=self.rel_chunk_eval)) 668 | 669 | relPred = set(relationChunks(predRel, lab_pred_chunks_list,relationTuple=self.rel_chunk_eval)) 670 | 671 | relTrueList = list(relTrue) # trueRel# 672 | 673 | # if (len(trueRel)!=len(relTrueList)): 674 | # print ("warning") 675 | 676 | relPredList = list(relPred) # predRel# 677 | 678 | #print("GOLD REL chunks:" + str(relTrueList)) 679 | 680 | #print("PRED REL chunks:" + str(relPredList)) 681 | 682 | for lab_idx in range(len(relPredList)): 683 | 684 | if relPredList[lab_idx] in relTrueList: 685 | # print (lab_pred_chunks_list[lab_idx][0]) 686 | self.tpsClassesREL[relPredList[lab_idx][1]] += 1 687 | # print (relPredList[lab_idx]) 688 | else: 689 | self.fpsClassesREL[relPredList[lab_idx][1]] += 1 690 | # fnsEntitiesNER+=1 691 | 692 | for lab_idx in range(len(relTrueList)): 693 | 694 | if relTrueList[lab_idx] not in relPredList: 695 | self.fnsClassesREL[relTrueList[lab_idx][1]] += 1 696 | 697 | self.correct_predsNER += len(lab_chunks & lab_pred_chunks) 698 | self.total_predsNER += len(lab_pred_chunks) 699 | self.total_correctNER += len(lab_chunks) 
700 | 701 | self.correct_predsREL += len(relTrue & relPred) 702 | self.total_predsREL += len(relPred) 703 | self.total_correctREL += len(relTrue) 704 | 705 | 706 | 707 | def getResultsNER(self): 708 | p = self.correct_predsNER / self.total_predsNER if self.correct_predsNER > 0 else 0 709 | r = self.correct_predsNER / self.total_correctNER if self.correct_predsNER > 0 else 0 710 | f1 = 2 * p * r / (p + r) if self.correct_predsNER > 0 else 0 711 | 712 | print(self.correct_predsNER) 713 | print(self.total_predsNER) 714 | print(self.total_correctNER) 715 | 716 | print(f1) 717 | return f1 718 | 719 | def getResultsREL(self): 720 | p = self.correct_predsREL / self.total_predsREL if self.correct_predsREL > 0 else 0 721 | r = self.correct_predsREL / self.total_correctREL if self.correct_predsREL > 0 else 0 722 | f1 = 2 * p * r / (p + r) if self.correct_predsREL > 0 else 0 723 | 724 | print(self.correct_predsREL) 725 | print(self.total_predsREL) 726 | print(self.total_correctREL) 727 | 728 | print(f1) 729 | return f1 730 | 731 | def getPrecision(self, tps, fps): 732 | if tps == 0: 733 | return 0 734 | else: 735 | return tps / (tps + fps) 736 | 737 | def getRecall(self, tps, fns): 738 | if tps == 0: 739 | return 0 740 | else: 741 | return tps / (tps + fns) 742 | 743 | def getF1(self, tps, fps, fns): 744 | if tps == 0: 745 | return 0 746 | else: 747 | return 2 * self.getPrecision(tps, fps) * self.getRecall(tps, fns) / ( 748 | self.getPrecision(tps, fps) + self.getRecall(tps, fns)) 749 | 750 | def getChunkedOverallF1(self): 751 | tpsNER=0 752 | fnsNER=0 753 | fpsNER=0 754 | tpsREL=0 755 | fnsREL=0 756 | fpsREL=0 757 | 758 | for label in self.NERset: 759 | # if label != "O" : 760 | tpsNER += self.tpsClassesNER[label] 761 | 762 | fnsNER += self.fnsClassesNER[label] 763 | fpsNER += self.fpsClassesNER[label] 764 | 765 | for label in self.RELset: 766 | 767 | if label != "N": 768 | tpsREL += self.tpsClassesREL[label] 769 | 770 | fnsREL += self.fnsClassesREL[label] 771 | fpsREL += self.fpsClassesREL[label] 772 | 773 | 774 | 775 | return self.getF1(tpsNER+tpsREL, fpsNER+fpsREL, fnsNER+fnsREL) 776 | 777 | 778 | def getOverallF1(self): 779 | tpsNER=0 780 | fnsNER=0 781 | fpsNER=0 782 | tpsREL=0 783 | fnsREL=0 784 | fpsREL=0 785 | 786 | for label in self.NERset: 787 | # if label != "O" : 788 | tpsNER += self.tpsClassesNER[label] 789 | 790 | fnsNER += self.fnsClassesNER[label] 791 | fpsNER += self.fpsClassesNER[label] 792 | 793 | for label in self.RELset: 794 | 795 | if label != "N": 796 | tpsREL += self.tpsClassesREL[label] 797 | 798 | fnsREL += self.fnsClassesREL[label] 799 | fpsREL += self.fpsClassesREL[label] 800 | 801 | 802 | 803 | return self.getF1(tpsNER+tpsREL, fpsNER+fpsREL, fnsNER+fnsREL) 804 | 805 | def getChunkedRELF1(self): 806 | 807 | tpsREL=0 808 | fnsREL=0 809 | fpsREL=0 810 | 811 | 812 | 813 | for label in self.RELset: 814 | 815 | if label != "N": 816 | tpsREL += self.tpsClassesREL[label] 817 | 818 | fnsREL += self.fnsClassesREL[label] 819 | fpsREL += self.fpsClassesREL[label] 820 | 821 | 822 | 823 | return self.getF1(tpsREL, fpsREL, fnsREL) 824 | 825 | def getChunkedNERF1(self): 826 | tpsNER=0 827 | fnsNER=0 828 | fpsNER=0 829 | 830 | 831 | for label in self.NERset: 832 | # if label != "O" : 833 | tpsNER += self.tpsClassesNER[label] 834 | 835 | fnsNER += self.fnsClassesNER[label] 836 | fpsNER += self.fpsClassesNER[label] 837 | 838 | return self.getF1(tpsNER, fpsNER, fnsNER) 839 | 840 | def getAccuracy(self): 841 | return self.oks / self.totals 842 | 843 | def getMacroF1scores(self): 844 | 845 | 
846 | return self.NERF1Macro,self.RELF1Macro,self.OverallF1Macro 847 | 848 | def getMacroF1scoresNoOtherClass(self): 849 | 850 | return self.NERF1Macro_no_other, self.RELF1Macro, self.OverallF1Macro_no_other 851 | 852 | 853 | def computeInfoMacro(self,printScores=True): 854 | 855 | printer = printClasses() 856 | 857 | 858 | averageNERF1_no_Other=0 859 | averageNERF1 = 0 860 | 861 | averageNERrecall_no_Other = 0 862 | averageNERrecall = 0 863 | 864 | averageNERprecision_no_Other = 0 865 | averageNERprecision = 0 866 | 867 | for label in self.NERset: 868 | if label != "O": 869 | self.tpsNERMacro += self.tpsClassesNER[label] 870 | 871 | self.fnsNERMacro += self.fnsClassesNER[label] 872 | self.fpsNERMacro += self.fpsClassesNER[label] 873 | 874 | f1_class=self.getF1(self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label]) 875 | precision_class=self.getPrecision(self.tpsClassesNER[label], self.fpsClassesNER[label]) 876 | recall_class=self.getRecall(self.tpsClassesNER[label], self.fnsClassesNER[label]) 877 | if label!= "O" : 878 | averageNERF1+=f1_class 879 | averageNERrecall += recall_class 880 | averageNERprecision += precision_class 881 | 882 | if label!= "O" and label!= "Other": 883 | averageNERF1_no_Other+=f1_class 884 | averageNERrecall_no_Other += recall_class 885 | averageNERprecision_no_Other += precision_class 886 | 887 | 888 | if label != "O" and label != "Other": 889 | self.tpsNERMacro_no_other += self.tpsClassesNER[label] 890 | 891 | self.fnsNERMacro_no_other += self.fnsClassesNER[label] 892 | self.fpsNERMacro_no_other += self.fpsClassesNER[label] 893 | 894 | 895 | printer.add(label, self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label], 896 | precision_class, 897 | recall_class, 898 | f1_class) 899 | 900 | 901 | 902 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 903 | printer.add("-", "-", "-", "-", 904 | "-", "-", 905 | "-") 906 | 907 | averageNERrecall = averageNERrecall / (len(self.NERset) - 1) 908 | averageNERprecision = averageNERprecision / (len(self.NERset) - 1) 909 | averageNERF1 = averageNERF1 / (len(self.NERset) - 1) 910 | 911 | 912 | if "other" in [x.lower() for x in self.NERset]: 913 | 914 | averageNERprecision_no_Other=averageNERprecision_no_Other / (len(self.NERset) -2) 915 | averageNERrecall_no_Other=averageNERrecall_no_Other / (len(self.NERset) -2) 916 | averageNERF1_no_Other=averageNERF1_no_Other / (len(self.NERset) -2) 917 | 918 | printer.add("Macro NER chunk RELAXED ^Other", self.tpsNERMacro_no_other, self.fpsNERMacro_no_other, self.fnsNERMacro_no_other, 919 | averageNERprecision_no_Other, averageNERrecall_no_Other, 920 | averageNERF1_no_Other) 921 | else: 922 | averageNERprecision_no_Other = averageNERprecision 923 | averageNERrecall_no_Other = averageNERrecall 924 | averageNERF1_no_Other = averageNERF1 925 | 926 | 927 | printer.add("Macro NER chunk RELAXED", self.tpsNERMacro, self.fpsNERMacro, self.fnsNERMacro, 928 | averageNERprecision, averageNERrecall, 929 | averageNERF1) 930 | if printScores ==True: 931 | 932 | printer.print() 933 | 934 | printer = printClasses() 935 | 936 | averageRELF1 = 0 937 | 938 | averageRELrecall = 0 939 | 940 | averageRELprecision = 0 941 | 942 | for label in self.RELset: 943 | 944 | if label != "N": 945 | self.tpsRELMacro += self.tpsClassesREL[label] 946 | 947 | self.fnsRELMacro += 
self.fnsClassesREL[label] 948 | self.fpsRELMacro += self.fpsClassesREL[label] 949 | 950 | f1_class = self.getF1(self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label]) 951 | precision_class = self.getPrecision(self.tpsClassesREL[label], self.fpsClassesREL[label]) 952 | recall_class = self.getRecall(self.tpsClassesREL[label], self.fnsClassesREL[label]) 953 | 954 | averageRELF1+=f1_class 955 | averageRELrecall += recall_class 956 | averageRELprecision += precision_class 957 | 958 | printer.add(label, self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label], 959 | precision_class, 960 | recall_class, 961 | f1_class) 962 | 963 | 964 | 965 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 966 | printer.add("-", "-", "-", "-", 967 | "-", "-", 968 | "-") 969 | 970 | 971 | averageRELrecall=averageRELrecall/(len(self.RELset) - 1) 972 | averageRELprecision=averageRELprecision/(len(self.RELset) - 1) 973 | averageRELF1 =averageRELF1 /(len(self.RELset) - 1) 974 | 975 | 976 | 977 | 978 | printer.add("Macro REL chunk RELAXED", self.tpsRELMacro, self.fpsRELMacro, self.fnsRELMacro, 979 | averageRELprecision, averageRELrecall, 980 | averageRELF1) 981 | 982 | if printScores == True: 983 | printer.print() 984 | 985 | over_avg_f1 = (averageNERF1 + averageRELF1) / 2 986 | over_avg_f1_no_other = (averageNERF1_no_Other + averageRELF1) / 2 987 | 988 | t = PrettyTable(['Type','NER_F1', 'REL_F1', 'AVG_F1']) 989 | 990 | t.add_row(['Overall', averageNERF1, averageRELF1, over_avg_f1]) 991 | if "other" in [x.lower() for x in self.NERset]: 992 | t.add_row(['Overall ^Other', averageNERF1_no_Other, averageRELF1, over_avg_f1_no_other]) 993 | 994 | if printScores == True: 995 | print (t) 996 | 997 | self.NERF1Macro = averageNERF1 998 | self.NERF1Macro_no_other = averageNERF1_no_Other 999 | self.RELF1Macro = averageRELF1 1000 | self.OverallF1Macro = over_avg_f1 1001 | self.OverallF1Macro_no_other = over_avg_f1_no_other 1002 | 1003 | 1004 | def printInfoMicro(self): 1005 | 1006 | printer = printClasses() 1007 | 1008 | for label in self.NERset: 1009 | # if label != "O" : 1010 | self.tpsNER += self.tpsClassesNER[label] 1011 | 1012 | self.fnsNER += self.fnsClassesNER[label] 1013 | self.fpsNER += self.fpsClassesNER[label] 1014 | 1015 | printer.add(label, self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label], 1016 | self.getPrecision(self.tpsClassesNER[label], self.fpsClassesNER[label]), 1017 | self.getRecall(self.tpsClassesNER[label], self.fnsClassesNER[label]), 1018 | self.getF1(self.tpsClassesNER[label], self.fpsClassesNER[label], self.fnsClassesNER[label])) 1019 | 1020 | 1021 | 1022 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 1023 | printer.add("-", "-", "-", "-", 1024 | "-", "-", 1025 | "-") 1026 | printer.add("Micro NER chunk RELAXED", self.tpsNER, self.fpsNER, self.fnsNER, 1027 | self.getPrecision(self.tpsNER, self.fpsNER), self.getRecall(self.tpsNER, self.fnsNER), 1028 | self.getF1(self.tpsNER, self.fpsNER, self.fnsNER)) 1029 | 1030 | printer.print() 1031 | 1032 | printer = printClasses() 1033 | for label in self.RELset: 1034 | 1035 | if label != "N": 1036 | self.tpsREL += 
self.tpsClassesREL[label] 1037 | 1038 | self.fnsREL += self.fnsClassesREL[label] 1039 | self.fpsREL += self.fpsClassesREL[label] 1040 | 1041 | printer.add(label, self.tpsClassesREL[label], self.fpsClassesREL[label], self.fnsClassesREL[label], 1042 | self.getPrecision(self.tpsClassesREL[label], self.fpsClassesREL[label]), 1043 | self.getRecall(self.tpsClassesREL[label], self.fnsClassesREL[label]), 1044 | self.getF1(self.tpsClassesREL[label], self.fpsClassesREL[label], 1045 | self.fnsClassesREL[label])) 1046 | 1047 | 1048 | 1049 | # print('%s TP: %d FP: %d FN: %d TN: %d precision: %f recall: %f F1: %f' % (label,self.tpsClasses[label],self.fpsClasses[label],self.fnsClasses[label],self.tnsClasses[label], self.precision[label], self.recall[label], self.f1[label])) 1050 | printer.add("-", "-", "-", "-", 1051 | "-", "-", 1052 | "-") 1053 | printer.add("Micro REL chunk RELAXED", self.tpsREL, self.fpsREL, self.fnsREL, 1054 | self.getPrecision(self.tpsREL, self.fpsREL), self.getRecall(self.tpsREL, self.fnsREL), 1055 | self.getF1(self.tpsREL, self.fpsREL, self.fnsREL)) 1056 | 1057 | printer.print() --------------------------------------------------------------------------------
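A note on the shared scoring helpers: getPrecision, getRecall and getF1 implement the standard micro-averaged formulas over true positives, false positives and false negatives, and they are what the "Micro NER chunk" / "Micro REL chunk" rows aggregate. A quick standalone check with made-up counts (tps=8, fps=2, fns=4 are illustrative numbers, not results from this repo):

# Standalone restatement of the getPrecision / getRecall / getF1 helpers from eval.py,
# exercised with illustrative counts only.
def precision(tps, fps):
    return tps / (tps + fps) if tps else 0

def recall(tps, fns):
    return tps / (tps + fns) if tps else 0

def f1(tps, fps, fns):
    p, r = precision(tps, fps), recall(tps, fns)
    return 2 * p * r / (p + r) if tps else 0

print(precision(8, 2))  # 0.8
print(recall(8, 4))     # 0.666...
print(f1(8, 2, 4))      # 0.727...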
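In relaxedChunkEvaluator.add above, both the gold and the predicted token-level entity classes are projected onto the gold BIO chunk boundaries through classesToChunks, so only the entity type (not the boundary) is scored; when a chunk's label does not occur among the classes predicted inside its span, the majority class of the span is used instead. A minimal self-contained example of that projection (the helper functions are copied from eval.py; the token labels below are invented for illustration, not CoNLL04 data):

from collections import Counter

def getMaxOccurence(lst):
    # Most frequent element of lst (as defined in eval.py).
    most_common, _ = Counter(lst).most_common(1)[0]
    return most_common

def classesToChunks(tokenClasses, chunks):
    # Re-label each (class, start, end) chunk with a class actually present in
    # tokenClasses over its span, falling back to the majority class (as in eval.py).
    labeled_chunks = []
    for chunk in chunks:
        class_list = tokenClasses[chunk[1]:chunk[2] + 1]
        if chunk[0] in class_list:
            labeled_chunks.append((chunk[0], chunk[1], chunk[2]))
        else:
            labeled_chunks.append((getMaxOccurence(class_list), chunk[1], chunk[2]))
    return labeled_chunks

# Predicted per-token classes and gold chunk boundaries (invented example):
pred_token_classes = ['Peop', 'Peop', 'O', 'Loc', 'Loc', 'Loc']
gold_chunks = [('Peop', 0, 1), ('Org', 3, 5)]
print(classesToChunks(pred_token_classes, gold_chunks))
# [('Peop', 0, 1), ('Loc', 3, 5)] -> the second chunk keeps the gold boundaries
# but takes the majority predicted class inside the span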