├── config.py
├── dmcnn.py
├── loader.py
├── test.py
└── train.py

/config.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.optim as optim
from sklearn.metrics import f1_score, precision_score, recall_score

from dmcnn import dmcnn_t
from loader import load_word2vec
from loader import Batch_tri, Batch_arg
from loader import load_tri_sentences, load_arg_sentences


def to_var(x):
    # torch.autograd.Variable has been a no-op wrapper since PyTorch 0.4,
    # so a CUDA LongTensor is returned directly.
    return torch.from_numpy(x).long().cuda()


class Config(object):
    def __init__(self):
        self.gpu = "4"
        self.path_t = 'data/tri.train'
        self.path_a = 'data/arg.train'
        self.path_test_t = 'data/tri.test'
        self.path_test_a = 'data/arg.test'
        self.path_modelt = 'data/modelt'
        self.path_debug = 'data/debug'
        self.lr = 1
        self.weight_decay = 1e-5
        self.epoch = 75         # number of training epochs
        self.epoch_save = 1     # save the model every epoch_save epochs
        self.sen = 80           # maximum sentence length
        self.char_dim = 100     # dimension of the word embeddings
        self.num_char = 20136   # vocabulary size of the word2vec model
        self.batch_t = 170      # sentences per batch, trigger model
        self.batch_a = 20       # sentences per batch, argument model
        self.num_t = 34         # number of trigger classes (0 = not a trigger)
        self.num_a = 36         # number of argument roles
        self.pf_t = 5           # position-feature dim in trigger classification
        self.pf_a = 5           # position-feature dim in argument classification
        self.ef_a = 5           # event-feature dim in argument classification
        self.window_t = 3       # CNN window size, trigger model
        self.window_a = 3       # CNN window size, argument model
        self.feature_t = 200    # number of CNN feature maps, trigger model
        self.feature_a = 300    # number of CNN feature maps, argument model

    def load_traint_data(self):
        print("Reading training data...")
        train_t = load_tri_sentences(self.path_t)
        self.train_t_b = Batch_tri(train_t, self.batch_t, self.sen)
        self.emb_weights = load_word2vec("data/100.utf8", 100, self.num_char, self.char_dim)
        print("finish reading")

    def load_testt_data(self):
        print("Reading testing data...")
        test_t = load_tri_sentences(self.path_test_t)
        self.test_t_b = Batch_tri(test_t, self.batch_t, self.sen)
        print("finish reading")

    def set_traint_model(self):
        print("Initializing training model...")
        self.modelt = dmcnn_t(config=self)
        self.optimizer_t = optim.Adadelta(self.modelt.parameters(), lr=self.lr, rho=0.95,
                                          eps=1e-6, weight_decay=self.weight_decay)
        self.modelt.cuda()
        for param_tensor in self.modelt.state_dict():
            print(param_tensor, "\t", self.modelt.state_dict()[param_tensor].size())
        print("Finish initializing")

    def set_testt_model(self):
        print("Initializing testing model...")
        self.model_test_t = dmcnn_t(config=self)
        self.model_test_t.cuda()
        self.model_test_t.eval()
        print("finish initializing")

    def train_one_step(self, batch):
        # The model reads its inputs from attributes rather than from
        # forward() arguments.
        self.modelt.char_inputs = to_var(np.array(batch[0]))
        self.modelt.trigger_inputs = to_var(np.array(batch[1]))
        self.modelt.pf_inputs = to_var(np.array(batch[2]))
        self.modelt.lxl_inputs = to_var(np.array(batch[3]))
        self.modelt.masks = to_var(np.array(batch[4]))
        self.modelt.cuts = to_var(np.array(batch[5]))

        self.optimizer_t.zero_grad()
        loss, maxes = self.modelt()
        loss.backward()
        self.optimizer_t.step()
        return loss.item(), maxes

    def test_one_step(self, batch):
        self.model_test_t.char_inputs = to_var(np.array(batch[0]))
        self.model_test_t.trigger_inputs = to_var(np.array(batch[1]))
        self.model_test_t.pf_inputs = to_var(np.array(batch[2]))
        self.model_test_t.lxl_inputs = to_var(np.array(batch[3]))
        self.model_test_t.masks = to_var(np.array(batch[4]))
        self.model_test_t.cuts = to_var(np.array(batch[5]))

        with torch.no_grad():
            loss, maxes = self.model_test_t()
        return loss.item(), maxes

    def train(self):
        for epoch in range(self.epoch):
            losses = 0
            tru = pre = None
            i = 0
            print("epoch: ", epoch)
            for batch in self.train_t_b.iter_batch():
                loss, maxes = self.train_one_step(batch)
                losses += loss
                if i == 0:
                    tru = self.modelt.trigger_inputs
                    pre = maxes
                else:
                    tru = torch.cat((tru, self.modelt.trigger_inputs), dim=0)
                    pre = torch.cat((pre, maxes), dim=0)
                i += 1
            tru = tru.cpu().numpy()
            pre = pre.cpu().numpy()
            # label 0 ("not a trigger") is excluded from the micro-averaged scores
            prec = precision_score(tru, pre, labels=list(range(1, 34)), average='micro')
            rec = recall_score(tru, pre, labels=list(range(1, 34)), average='micro')
            f1 = f1_score(tru, pre, labels=list(range(1, 34)), average='micro')
            if epoch % self.epoch_save == 0:
                torch.save(self.modelt.state_dict(), self.path_modelt)
            print("loss_average:", losses / i)
            print("Precision: ", prec)
            print("Recall: ", rec)
            print("FMeasure: ", f1)

    def test(self):
        self.model_test_t.load_state_dict(torch.load(self.path_modelt))
        tru = pre = None
        i = 0
        with open(self.path_debug, 'w') as f:
            losses = 0
            for batch in self.test_t_b.iter_batch():
                loss, maxes = self.test_one_step(batch)
                losses += loss
                if i == 0:
                    tru = self.model_test_t.trigger_inputs
                    pre = maxes
                else:
                    tru = torch.cat((tru, self.model_test_t.trigger_inputs), dim=0)
                    pre = torch.cat((pre, maxes), dim=0)
                i += 1
            tru_n = tru.cpu().numpy()
            pre_n = pre.cpu().numpy()
            # log every misclassified instance as "gold:predicted"
            for p in range(self.batch_t * self.test_t_b.len_data):
                if tru_n[p] != pre_n[p]:
                    f.write(str(tru_n[p]) + ':' + str(pre_n[p]) + '\n')
            prec = precision_score(tru_n, pre_n, labels=list(range(1, 34)), average='micro')
            rec = recall_score(tru_n, pre_n, labels=list(range(1, 34)), average='micro')
            f1 = f1_score(tru_n, pre_n, labels=list(range(1, 34)), average='micro')
            print("loss_average: ", losses / i)
            print("Precision: ", prec)
            print("Recall: ", rec)
            print("FMeasure: ", f1)
--------------------------------------------------------------------------------
/dmcnn.py:
--------------------------------------------------------------------------------
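# Overview (summarising the code below): this module is the trigger-
# classification half of a DMCNN-style event extractor. Word and
# position-feature embeddings feed a 1-D convolution; the feature maps are
# max-pooled separately on each side of the candidate trigger (dynamic
# multi-pooling, see pooling()); the pooled features are concatenated with
# a lexical window around the candidate before a final linear classifier.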
import numpy as np
import torch
import torch.nn as nn


class dmcnn_t(nn.Module):
    def __init__(self, config):
        super(dmcnn_t, self).__init__()
        self.config = config
        self.drop_prob = 0.5  # nn.Dropout takes the drop probability, not the keep probability

        # Inputs are assigned by Config before each forward pass.
        self.char_inputs = None     # [batch, sen] character ids of the sentence
        self.trigger_inputs = None  # [batch] gold trigger class
        self.pf_inputs = None       # [batch, sen] position of each token relative to the candidate trigger, shifted to be non-negative
        self.lxl_inputs = None      # [batch, 3] lexical feature: the candidate trigger and its two neighbours
        self.masks = None           # [batch, sen-2] pooling mask: 1 left of the trigger, 2 from the trigger to the end of the sentence, 0 on padding
        self.cuts = None            # [batch] candidate trigger position

        self.char_lookup = nn.Embedding(self.config.num_char, self.config.char_dim)  # [20136, 100] word2vec
        # Position-feature table; its first dimension only needs to cover the
        # at most 2*sen-1 = 159 distinct shifted offsets, and batch_t = 170
        # happens to be large enough.
        self.pf_lookup = nn.Embedding(self.config.batch_t, self.config.pf_t)
        # self.init_word_weights()
        # self.init_pf_weights()

        self.conv = nn.Conv1d(self.config.char_dim + self.config.pf_t, self.config.feature_t,
                              self.config.window_t, bias=True)
        self.L = nn.Linear(2 * self.config.feature_t + 3 * self.config.char_dim,
                           self.config.num_t, bias=True)
        self.dropout = nn.Dropout(p=self.drop_prob)
        self.loss = nn.CrossEntropyLoss()

    def init_word_weights(self):
        self.char_lookup.weight.data.copy_(torch.from_numpy(self.config.emb_weights))

    def init_pf_weights(self):
        nn.init.xavier_uniform_(self.pf_lookup.weight.data)

    def pooling(self, conv):
        # Dynamic multi-pooling: max-pool the feature maps separately over
        # the segments left and right of the candidate trigger.
        mask = np.array([[0, 0], [0, 1], [1, 0]])  # rows indexed by mask value: padding, left segment, right segment
        mask_emb = nn.Embedding(3, 2).cuda()
        mask_emb.weight.data.copy_(torch.from_numpy(mask))
        mask = mask_emb(self.masks)  # conv: [batch, sen-2, feature], mask: [batch, sen-2, 2]
        pooled, _ = torch.max(torch.unsqueeze(mask * 100, dim=2) + torch.unsqueeze(conv, dim=3), dim=1)
        pooled -= 100
        pooled = pooled.view(self.config.batch_t, -1)  # [batch, 2*feature]
        return pooled
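    # A toy illustration of the offset trick above (hypothetical values, one
    # feature map of width 5 after convolution, trigger candidate at
    # position 2, last position padding):
    #   conv row: [ 0.1,  0.9, -0.3, 0.4, 0.2]   tanh outputs, so |v| < 1
    #   mask row: [ 1,    1,    2,   2,   0  ]
    # mask_emb maps 1 -> [0, 1], 2 -> [1, 0], 0 -> [0, 0], so after adding
    # 100 * mask the two channels become
    #   channel 0 (right segment): [  0.1,   0.9, 99.7, 100.4, 0.2] -> max 100.4
    #   channel 1 (left segment):  [100.1, 100.9, -0.3,   0.4, 0.2] -> max 100.9
    # Subtracting 100 recovers the true per-segment maxima 0.4 and 0.9;
    # since |tanh| < 1 << 100, out-of-segment and padding positions can
    # never win the max.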
    def forward(self):
        x = torch.cat((self.char_lookup(self.char_inputs), self.pf_lookup(self.pf_inputs)), dim=-1)
        y = self.char_lookup(self.lxl_inputs).view(self.config.batch_t, -1)  # lexical feature, [batch, 3*char_dim]
        x = torch.tanh(self.conv(x.permute(0, 2, 1)))  # [batch, feature, sen-2]
        x = x.permute(0, 2, 1)                         # [batch, sen-2, feature]
        x = self.pooling(x)                            # [batch, 2*feature]
        x = torch.cat((x, y), dim=-1)                  # [batch, 2*feature + 3*char_dim]
        # x = self.dropout(x)
        x = self.L(x)                                  # [batch, num_t]
        loss = self.loss(x, self.trigger_inputs)
        _, maxes = torch.max(x, dim=1)                 # predicted trigger classes
        return loss, maxes
--------------------------------------------------------------------------------
/loader.py:
--------------------------------------------------------------------------------
import re
import pickle
import codecs
import random
import numpy as np
import torch
import torch.nn as nn


def load_word2vec(emb_path, word_dim, num_char, char_dim):
    with open("data/maps.pkl", 'rb') as f:
        _, id_to_word, __, ___ = pickle.load(f)
    # Start from Xavier-initialised weights, then overwrite every row whose
    # word (or a lowercased / digit-normalised variant) has a pre-trained vector.
    old_weights = nn.init.xavier_uniform_(torch.zeros(num_char, char_dim))
    new_weights = old_weights.numpy()
    pre_trained = {}
    emb_invalid = 0
    for i, line in enumerate(codecs.open(emb_path, 'r', 'utf-8')):
        line = line.rstrip().split()
        if len(line) == word_dim + 1:
            pre_trained[line[0]] = np.array([float(x) for x in line[1:]]).astype(np.float32)
        else:
            emb_invalid += 1
    c_found = 0
    c_lower = 0
    c_zeros = 0
    n_words = len(id_to_word)
    for i in range(n_words):
        word = id_to_word[i]
        if word in pre_trained:
            new_weights[i] = pre_trained[word]
            c_found += 1
        elif word.lower() in pre_trained:
            new_weights[i] = pre_trained[word.lower()]
            c_lower += 1
        elif re.sub(r'\d', '0', word.lower()) in pre_trained:
            new_weights[i] = pre_trained[re.sub(r'\d', '0', word.lower())]
            c_zeros += 1
    # weights = nn.Embedding(num_char, char_dim)
    # weights.weight.data.copy_(torch.from_numpy(new_weights))
    return new_weights
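# The loaders below assume (inferred from the parsing code; no sample data
# ships with this snippet) a CoNLL-style layout: one token per line, columns
# of whitespace-separated integer ids, sentences separated by blank lines.
#   tri.* files: <char id> <second id, kept as l> <trigger label>
#   arg.* files: <char id> <second id, kept as l> <trigger label> <argument role>
# A hypothetical three-token sentence in tri.train could look like:
#   17 4 0
#   981 4 12
#   55 1 0
#   <blank line>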
def load_tri_sentences(path):
    expand, sens = list(), list()
    c, l, t = list(), list(), list()
    for line in codecs.open(path, 'r', 'utf8'):
        line = line.rstrip()
        if line:
            word = line.split()
            c.append(int(word[0]))
            l.append(int(word[1]))
            t.append(int(word[2]))
        else:
            if len(c) > 0:
                sens.append([c, l, t])
                c, l, t = [], [], []
    # Expand each sentence into one instance per candidate trigger position y
    # (position 0 is skipped, so the lexical window always has a left neighbour).
    for x in range(len(sens)):
        c, l, t = sens[x]  # one sentence
        for y in range(len(c)):
            if y > 0:
                mask = [1 for i in range(y)]            # positions 0 .. y-1, left of the candidate
                mask += [2 for i in range(len(c) - y)]  # positions y .. len(c)-1, so len(mask) == len(c)
                cut = y
                tri_in = t[y]
                expand.append([c, tri_in, mask, cut])
    return expand


def load_arg_sentences(path):
    expand, sens = list(), list()
    c, l, t, a = list(), list(), list(), list()
    for line in codecs.open(path, 'r', 'utf8'):
        line = line.rstrip()
        if line:
            word = line.split()
            c.append(int(word[0]))
            l.append(int(word[1]))
            t.append(int(word[2]))
            a.append(int(word[3]))
        else:
            if len(c) > 0:
                sens.append([c, l, t, a])
                c, l, t, a = [], [], [], []
    for x in range(len(sens)):
        sen = sens[x]
        tri_f = 0
        for i in range(len(sen[0])):  # position of the first trigger in the sentence
            if sen[2][i] != 0:
                tri_f = i
                break
        for y in range(len(sen[0])):  # one instance per candidate argument position y
            if y > 0 and y != tri_f:
                fir = min(tri_f, y)
                sec = max(tri_f, y)
                # pooling mask: 1 before the first of (trigger, candidate),
                # 2 between them, 3 after the second
                mask = [1 for i in range(fir)]
                mask += [2 for i in range(sec - fir)]
                mask += [3 for i in range(len(sen[0]) - sec)]
                cut = [tri_f, y]
                tri_loc, arg_loc = [], []
                for i in range(len(sen[0])):
                    tri_loc.append(i - cut[0])  # offset of each token from the trigger
                    arg_loc.append(i - cut[1])  # offset of each token from the candidate argument
                arg_in = [0 for i in range(36)]  # one-hot argument-role target
                arg_in[sen[3][y]] = 1
                expand.append([sen[0], sen[1], sen[2], sen[3], arg_in, tri_loc, arg_loc, mask, cut])
    print("load finished!")
    return expand


class Batch_tri(object):
    def __init__(self, data, batch_size, sen_len):
        self.batch_data = self.sort_pad(data, batch_size, sen_len)
        self.len_data = len(self.batch_data)
        self.length = int(sen_len)

    def sort_pad(self, data, batch_size, sen_len):
        num_batch = int(len(data) / batch_size)
        sort_data = sorted(data, key=lambda x: len(x[0]))
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad(sort_data[i * batch_size: (i + 1) * batch_size], sen_len))
        return batch_data

    @staticmethod
    def pad(data, length):
        chars, tri_in, pf_in, mask, cut, lxl_in = list(), list(), list(), list(), list(), list()
        for line in data:
            c, t, m, cu = line
            padding = [0] * (length - len(c))
            chars.append(c + padding)
            tri_in.append(t)
            pf_in.append([(i - cu + length - 1) for i in range(length)])  # shift offsets into [0, 2*length-2] for the embedding lookup
            lxl_in.append((c + padding)[cu - 1:cu + 2])                   # candidate trigger and its two neighbours
            mask.append(m + [0] * (length - len(c) - 2))                  # conv output is length-2 wide
            cut.append(cu)
        return [chars, tri_in, pf_in, lxl_in, mask, cut]

    def iter_batch(self):
        random.shuffle(self.batch_data)
        for i in range(self.len_data):
            yield self.batch_data[i]
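# A toy walk-through of Batch_tri.pad (hypothetical ids) with length=6 and
# one instance c=[7, 8, 9], cut cu=1:
#   chars  -> [7, 8, 9, 0, 0, 0]
#   pf_in  -> [4, 5, 6, 7, 8, 9]   # i - cu + length - 1, always in [0, 2*length-2]
#   lxl_in -> [7, 8, 9]            # (c + padding)[cu-1 : cu+2]
#   mask   -> [1, 2, 2, 0]         # padded to length - 2 to match the conv output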
class Batch_arg(object):
    def __init__(self, data, batch_size, sen_len):
        self.batch_data = self.sort_pad(data, batch_size, sen_len)
        self.len_data = len(self.batch_data)
        self.length = int(sen_len)

    def sort_pad(self, data, batch_size, sen_len):
        num_batch = int(len(data) / batch_size)
        sort_data = sorted(data, key=lambda x: len(x[0]))
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad(sort_data[i * batch_size: (i + 1) * batch_size], sen_len))
        return batch_data

    @staticmethod
    def pad(data, length):
        chars, ls, tri, arg, arg_in, tri_loc, arg_loc, mask, cut = \
            list(), list(), list(), list(), list(), list(), list(), list(), list()
        for line in data:
            c, l, t, a, a_i, t_l, a_l, m, cu = line
            padding = [0] * (length - len(c))
            chars.append(c + padding)
            ls.append(l + padding)
            tri.append(t + padding)
            arg.append(a + padding)
            arg_in.append(a_i)
            mask.append(m + [0] * (length - len(c) - 2))
            cut.append(cu)
            for i in range(length - len(c)):  # extend the offsets across the padding
                t_l.append(t_l[len(c) - 1] + i + 1)
                a_l.append(a_l[len(c) - 1] + i + 1)
            for i in range(length):           # shift all offsets into [0, 2*length-2]
                t_l[i] += length - 1
                a_l[i] += length - 1
            tri_loc.append(t_l)
            arg_loc.append(a_l)
        return [chars, ls, tri, arg, arg_in, tri_loc, arg_loc, mask, cut]

    def iter_batch(self):
        random.shuffle(self.batch_data)
        for i in range(self.len_data):
            yield self.batch_data[i]
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import os
from config import Config


os.environ["CUDA_VISIBLE_DEVICES"] = "1"

con = Config()
con.load_testt_data()
con.set_testt_model()
con.test()
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import os
from config import Config


os.environ["CUDA_VISIBLE_DEVICES"] = "1"

con = Config()
con.load_traint_data()
con.set_traint_model()
con.train()
--------------------------------------------------------------------------------