├── config.py
├── dmcnn.py
├── loader.py
├── test.py
└── train.py

/config.py:
--------------------------------------------------------------------------------
import numpy as np
import torch
import torch.optim as optim
from sklearn.metrics import f1_score, precision_score, recall_score

from dmcnn import dmcnn_t
from loader import load_word2vec
from loader import Batch_tri, Batch_arg
from loader import load_tri_sentences, load_arg_sentences


def to_var(x):
    # torch.autograd.Variable has been a no-op wrapper since PyTorch 0.4,
    # so a CUDA LongTensor is returned directly.
    return torch.from_numpy(x).long().cuda()


class Config(object):
    def __init__(self):
        self.gpu = "4"
        self.path_t = 'data/tri.train'
        self.path_a = 'data/arg.train'
        self.path_test_t = 'data/tri.test'
        self.path_test_a = 'data/arg.test'
        self.path_modelt = 'data/modelt'
        self.path_debug = 'data/debug'
        self.lr = 1
        self.weight_decay = 1e-5
        self.epoch = 75         # number of training epochs
        self.epoch_save = 1     # save the model every epoch_save epochs
        self.sen = 80           # maximum sentence length
        self.char_dim = 100     # dimension of the word embeddings
        self.num_char = 20136   # vocabulary size of the word2vec model
        self.batch_t = 170      # sentences per batch, trigger model
        self.batch_a = 20       # sentences per batch, argument model
        self.num_t = 34         # number of trigger classes (0 = not a trigger)
        self.num_a = 36         # number of argument roles
        self.pf_t = 5           # position-feature dim in trigger classification
        self.pf_a = 5           # position-feature dim in argument classification
        self.ef_a = 5           # event-feature dim in argument classification
        self.window_t = 3       # CNN window size, trigger model
        self.window_a = 3       # CNN window size, argument model
        self.feature_t = 200    # number of CNN feature maps, trigger model
        self.feature_a = 300    # number of CNN feature maps, argument model

    def load_traint_data(self):
        print("Reading training data...")
        train_t = load_tri_sentences(self.path_t)
        self.train_t_b = Batch_tri(train_t, self.batch_t, self.sen)
        self.emb_weights = load_word2vec("data/100.utf8", 100, self.num_char, self.char_dim)
        print("finish reading")

    def load_testt_data(self):
        print("Reading testing data...")
        test_t = load_tri_sentences(self.path_test_t)
        self.test_t_b = Batch_tri(test_t, self.batch_t, self.sen)
        print("finish reading")

    def set_traint_model(self):
        print("Initializing training model...")
        self.modelt = dmcnn_t(config=self)
        self.optimizer_t = optim.Adadelta(self.modelt.parameters(), lr=self.lr, rho=0.95,
                                          eps=1e-6, weight_decay=self.weight_decay)
        self.modelt.cuda()
        for param_tensor in self.modelt.state_dict():
            print(param_tensor, "\t", self.modelt.state_dict()[param_tensor].size())
        print("Finish initializing")

    def set_testt_model(self):
        print("Initializing testing model...")
        self.model_test_t = dmcnn_t(config=self)
        self.model_test_t.cuda()
        self.model_test_t.eval()
        print("finish initializing")

    def train_one_step(self, batch):
        # The model reads its inputs from attributes rather than from
        # forward() arguments.
        self.modelt.char_inputs = to_var(np.array(batch[0]))
        self.modelt.trigger_inputs = to_var(np.array(batch[1]))
        self.modelt.pf_inputs = to_var(np.array(batch[2]))
        self.modelt.lxl_inputs = to_var(np.array(batch[3]))
        self.modelt.masks = to_var(np.array(batch[4]))
        self.modelt.cuts = to_var(np.array(batch[5]))

        self.optimizer_t.zero_grad()
        loss, maxes = self.modelt()
        loss.backward()
        self.optimizer_t.step()
        return loss.item(), maxes

    def test_one_step(self, batch):
        self.model_test_t.char_inputs = to_var(np.array(batch[0]))
        self.model_test_t.trigger_inputs = to_var(np.array(batch[1]))
        self.model_test_t.pf_inputs = to_var(np.array(batch[2]))
        self.model_test_t.lxl_inputs = to_var(np.array(batch[3]))
        self.model_test_t.masks = to_var(np.array(batch[4]))
        self.model_test_t.cuts = to_var(np.array(batch[5]))

        with torch.no_grad():
            loss, maxes = self.model_test_t()
        return loss.item(), maxes

    def train(self):
        for epoch in range(self.epoch):
            losses = 0
            tru = pre = None
            i = 0
            print("epoch: ", epoch)
            for batch in self.train_t_b.iter_batch():
                loss, maxes = self.train_one_step(batch)
                losses += loss
                if i == 0:
                    tru = self.modelt.trigger_inputs
                    pre = maxes
                else:
                    tru = torch.cat((tru, self.modelt.trigger_inputs), dim=0)
                    pre = torch.cat((pre, maxes), dim=0)
                i += 1
            tru = tru.cpu().numpy()
            pre = pre.cpu().numpy()
            # label 0 ("not a trigger") is excluded from the micro-averaged scores
            prec = precision_score(tru, pre, labels=list(range(1, 34)), average='micro')
            rec = recall_score(tru, pre, labels=list(range(1, 34)), average='micro')
            f1 = f1_score(tru, pre, labels=list(range(1, 34)), average='micro')
            if epoch % self.epoch_save == 0:
                torch.save(self.modelt.state_dict(), self.path_modelt)
            print("loss_average:", losses / i)
            print("Precision: ", prec)
            print("Recall: ", rec)
            print("FMeasure: ", f1)

    def test(self):
        self.model_test_t.load_state_dict(torch.load(self.path_modelt))
        tru = pre = None
        i = 0
        with open(self.path_debug, 'w') as f:
            losses = 0
            for batch in self.test_t_b.iter_batch():
                loss, maxes = self.test_one_step(batch)
                losses += loss
                if i == 0:
                    tru = self.model_test_t.trigger_inputs
                    pre = maxes
                else:
                    tru = torch.cat((tru, self.model_test_t.trigger_inputs), dim=0)
                    pre = torch.cat((pre, maxes), dim=0)
                i += 1
            tru_n = tru.cpu().numpy()
            pre_n = pre.cpu().numpy()
            # log every misclassified instance as "gold:predicted"
            for p in range(self.batch_t * self.test_t_b.len_data):
                if tru_n[p] != pre_n[p]:
                    f.write(str(tru_n[p]) + ':' + str(pre_n[p]) + '\n')
            prec = precision_score(tru_n, pre_n, labels=list(range(1, 34)), average='micro')
            rec = recall_score(tru_n, pre_n, labels=list(range(1, 34)), average='micro')
            f1 = f1_score(tru_n, pre_n, labels=list(range(1, 34)), average='micro')
            print("loss_average: ", losses / i)
            print("Precision: ", prec)
            print("Recall: ", rec)
            print("FMeasure: ", f1)
--------------------------------------------------------------------------------
/dmcnn.py:
--------------------------------------------------------------------------------
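# Overview (summarising the code below): this module is the trigger-
# classification half of a DMCNN-style event extractor. Word and
# position-feature embeddings feed a 1-D convolution; the feature maps are
# max-pooled separately on each side of the candidate trigger (dynamic
# multi-pooling, see pooling()); the pooled features are concatenated with
# a lexical window around the candidate before a final linear classifier.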
import numpy as np
import torch
import torch.nn as nn


class dmcnn_t(nn.Module):
    def __init__(self, config):
        super(dmcnn_t, self).__init__()
        self.config = config
        self.drop_prob = 0.5  # nn.Dropout takes the drop probability, not the keep probability

        # Inputs are assigned by Config before each forward pass.
        self.char_inputs = None     # [batch, sen] character ids of the sentence
        self.trigger_inputs = None  # [batch] gold trigger class
        self.pf_inputs = None       # [batch, sen] position of each token relative to the candidate trigger, shifted to be non-negative
        self.lxl_inputs = None      # [batch, 3] lexical feature: the candidate trigger and its two neighbours
        self.masks = None           # [batch, sen-2] pooling mask: 1 left of the trigger, 2 from the trigger to the end of the sentence, 0 on padding
        self.cuts = None            # [batch] candidate trigger position

        self.char_lookup = nn.Embedding(self.config.num_char, self.config.char_dim)  # [20136, 100] word2vec
        # Position-feature table; its first dimension only needs to cover the
        # at most 2*sen-1 = 159 distinct shifted offsets, and batch_t = 170
        # happens to be large enough.
        self.pf_lookup = nn.Embedding(self.config.batch_t, self.config.pf_t)
        # self.init_word_weights()
        # self.init_pf_weights()

        self.conv = nn.Conv1d(self.config.char_dim + self.config.pf_t, self.config.feature_t,
                              self.config.window_t, bias=True)
        self.L = nn.Linear(2 * self.config.feature_t + 3 * self.config.char_dim,
                           self.config.num_t, bias=True)
        self.dropout = nn.Dropout(p=self.drop_prob)
        self.loss = nn.CrossEntropyLoss()

    def init_word_weights(self):
        self.char_lookup.weight.data.copy_(torch.from_numpy(self.config.emb_weights))

    def init_pf_weights(self):
        nn.init.xavier_uniform_(self.pf_lookup.weight.data)

    def pooling(self, conv):
        # Dynamic multi-pooling: max-pool the feature maps separately over
        # the segments left and right of the candidate trigger.
        mask = np.array([[0, 0], [0, 1], [1, 0]])  # rows indexed by mask value: padding, left segment, right segment
        mask_emb = nn.Embedding(3, 2).cuda()
        mask_emb.weight.data.copy_(torch.from_numpy(mask))
        mask = mask_emb(self.masks)  # conv: [batch, sen-2, feature], mask: [batch, sen-2, 2]
        pooled, _ = torch.max(torch.unsqueeze(mask * 100, dim=2) + torch.unsqueeze(conv, dim=3), dim=1)
        pooled -= 100
        pooled = pooled.view(self.config.batch_t, -1)  # [batch, 2*feature]
        return pooled
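    # A toy illustration of the offset trick above (hypothetical values, one
    # feature map of width 5 after convolution, trigger candidate at
    # position 2, last position padding):
    #   conv row: [ 0.1,  0.9, -0.3, 0.4, 0.2]   tanh outputs, so |v| < 1
    #   mask row: [ 1,    1,    2,   2,   0  ]
    # mask_emb maps 1 -> [0, 1], 2 -> [1, 0], 0 -> [0, 0], so after adding
    # 100 * mask the two channels become
    #   channel 0 (right segment): [  0.1,   0.9, 99.7, 100.4, 0.2] -> max 100.4
    #   channel 1 (left segment):  [100.1, 100.9, -0.3,   0.4, 0.2] -> max 100.9
    # Subtracting 100 recovers the true per-segment maxima 0.4 and 0.9;
    # since |tanh| < 1 << 100, out-of-segment and padding positions can
    # never win the max.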
    def forward(self):
        x = torch.cat((self.char_lookup(self.char_inputs), self.pf_lookup(self.pf_inputs)), dim=-1)
        y = self.char_lookup(self.lxl_inputs).view(self.config.batch_t, -1)  # lexical feature, [batch, 3*char_dim]
        x = torch.tanh(self.conv(x.permute(0, 2, 1)))  # [batch, feature, sen-2]
        x = x.permute(0, 2, 1)                         # [batch, sen-2, feature]
        x = self.pooling(x)                            # [batch, 2*feature]
        x = torch.cat((x, y), dim=-1)                  # [batch, 2*feature + 3*char_dim]
        # x = self.dropout(x)
        x = self.L(x)                                  # [batch, num_t]
        loss = self.loss(x, self.trigger_inputs)
        _, maxes = torch.max(x, dim=1)                 # predicted trigger classes
        return loss, maxes
--------------------------------------------------------------------------------
/loader.py:
--------------------------------------------------------------------------------
import re
import pickle
import codecs
import random
import numpy as np
import torch
import torch.nn as nn


def load_word2vec(emb_path, word_dim, num_char, char_dim):
    with open("data/maps.pkl", 'rb') as f:
        _, id_to_word, __, ___ = pickle.load(f)
    # Start from Xavier-initialised weights, then overwrite every row whose
    # word (or a lowercased / digit-normalised variant) has a pre-trained vector.
    old_weights = nn.init.xavier_uniform_(torch.zeros(num_char, char_dim))
    new_weights = old_weights.numpy()
    pre_trained = {}
    emb_invalid = 0
    for i, line in enumerate(codecs.open(emb_path, 'r', 'utf-8')):
        line = line.rstrip().split()
        if len(line) == word_dim + 1:
            pre_trained[line[0]] = np.array([float(x) for x in line[1:]]).astype(np.float32)
        else:
            emb_invalid += 1
    c_found = 0
    c_lower = 0
    c_zeros = 0
    n_words = len(id_to_word)
    for i in range(n_words):
        word = id_to_word[i]
        if word in pre_trained:
            new_weights[i] = pre_trained[word]
            c_found += 1
        elif word.lower() in pre_trained:
            new_weights[i] = pre_trained[word.lower()]
            c_lower += 1
        elif re.sub(r'\d', '0', word.lower()) in pre_trained:
            new_weights[i] = pre_trained[re.sub(r'\d', '0', word.lower())]
            c_zeros += 1
    # weights = nn.Embedding(num_char, char_dim)
    # weights.weight.data.copy_(torch.from_numpy(new_weights))
    return new_weights
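# The loaders below assume (inferred from the parsing code; no sample data
# ships with this snippet) a CoNLL-style layout: one token per line, columns
# of whitespace-separated integer ids, sentences separated by blank lines.
#   tri.* files: <char id> <second id, kept as l> <trigger label>
#   arg.* files: <char id> <second id, kept as l> <trigger label> <argument role>
# A hypothetical three-token sentence in tri.train could look like:
#   17 4 0
#   981 4 12
#   55 1 0
#   <blank line>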
def load_tri_sentences(path):
    expand, sens = list(), list()
    c, l, t = list(), list(), list()
    for line in codecs.open(path, 'r', 'utf8'):
        line = line.rstrip()
        if line:
            word = line.split()
            c.append(int(word[0]))
            l.append(int(word[1]))
            t.append(int(word[2]))
        else:
            if len(c) > 0:
                sens.append([c, l, t])
                c, l, t = [], [], []
    # Expand each sentence into one instance per candidate trigger position y
    # (position 0 is skipped, so the lexical window always has a left neighbour).
    for x in range(len(sens)):
        c, l, t = sens[x]  # one sentence
        for y in range(len(c)):
            if y > 0:
                mask = [1 for i in range(y)]            # positions 0 .. y-1, left of the candidate
                mask += [2 for i in range(len(c) - y)]  # positions y .. len(c)-1, so len(mask) == len(c)
                cut = y
                tri_in = t[y]
                expand.append([c, tri_in, mask, cut])
    return expand


def load_arg_sentences(path):
    expand, sens = list(), list()
    c, l, t, a = list(), list(), list(), list()
    for line in codecs.open(path, 'r', 'utf8'):
        line = line.rstrip()
        if line:
            word = line.split()
            c.append(int(word[0]))
            l.append(int(word[1]))
            t.append(int(word[2]))
            a.append(int(word[3]))
        else:
            if len(c) > 0:
                sens.append([c, l, t, a])
                c, l, t, a = [], [], [], []
    for x in range(len(sens)):
        sen = sens[x]
        tri_f = 0
        for i in range(len(sen[0])):  # position of the first trigger in the sentence
            if sen[2][i] != 0:
                tri_f = i
                break
        for y in range(len(sen[0])):  # one instance per candidate argument position y
            if y > 0 and y != tri_f:
                fir = min(tri_f, y)
                sec = max(tri_f, y)
                # pooling mask: 1 before the first of (trigger, candidate),
                # 2 between them, 3 after the second
                mask = [1 for i in range(fir)]
                mask += [2 for i in range(sec - fir)]
                mask += [3 for i in range(len(sen[0]) - sec)]
                cut = [tri_f, y]
                tri_loc, arg_loc = [], []
                for i in range(len(sen[0])):
                    tri_loc.append(i - cut[0])  # offset of each token from the trigger
                    arg_loc.append(i - cut[1])  # offset of each token from the candidate argument
                arg_in = [0 for i in range(36)]  # one-hot argument-role target
                arg_in[sen[3][y]] = 1
                expand.append([sen[0], sen[1], sen[2], sen[3], arg_in, tri_loc, arg_loc, mask, cut])
    print("load finished!")
    return expand


class Batch_tri(object):
    def __init__(self, data, batch_size, sen_len):
        self.batch_data = self.sort_pad(data, batch_size, sen_len)
        self.len_data = len(self.batch_data)
        self.length = int(sen_len)

    def sort_pad(self, data, batch_size, sen_len):
        num_batch = int(len(data) / batch_size)
        sort_data = sorted(data, key=lambda x: len(x[0]))
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad(sort_data[i * batch_size: (i + 1) * batch_size], sen_len))
        return batch_data

    @staticmethod
    def pad(data, length):
        chars, tri_in, pf_in, mask, cut, lxl_in = list(), list(), list(), list(), list(), list()
        for line in data:
            c, t, m, cu = line
            padding = [0] * (length - len(c))
            chars.append(c + padding)
            tri_in.append(t)
            pf_in.append([(i - cu + length - 1) for i in range(length)])  # shift offsets into [0, 2*length-2] for the embedding lookup
            lxl_in.append((c + padding)[cu - 1:cu + 2])                   # candidate trigger and its two neighbours
            mask.append(m + [0] * (length - len(c) - 2))                  # conv output is length-2 wide
            cut.append(cu)
        return [chars, tri_in, pf_in, lxl_in, mask, cut]

    def iter_batch(self):
        random.shuffle(self.batch_data)
        for i in range(self.len_data):
            yield self.batch_data[i]
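# A toy walk-through of Batch_tri.pad (hypothetical ids) with length=6 and
# one instance c=[7, 8, 9], cut cu=1:
#   chars  -> [7, 8, 9, 0, 0, 0]
#   pf_in  -> [4, 5, 6, 7, 8, 9]   # i - cu + length - 1, always in [0, 2*length-2]
#   lxl_in -> [7, 8, 9]            # (c + padding)[cu-1 : cu+2]
#   mask   -> [1, 2, 2, 0]         # padded to length - 2 to match the conv output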
class Batch_arg(object):
    def __init__(self, data, batch_size, sen_len):
        self.batch_data = self.sort_pad(data, batch_size, sen_len)
        self.len_data = len(self.batch_data)
        self.length = int(sen_len)

    def sort_pad(self, data, batch_size, sen_len):
        num_batch = int(len(data) / batch_size)
        sort_data = sorted(data, key=lambda x: len(x[0]))
        batch_data = list()
        for i in range(num_batch):
            batch_data.append(self.pad(sort_data[i * batch_size: (i + 1) * batch_size], sen_len))
        return batch_data

    @staticmethod
    def pad(data, length):
        chars, ls, tri, arg, arg_in, tri_loc, arg_loc, mask, cut = \
            list(), list(), list(), list(), list(), list(), list(), list(), list()
        for line in data:
            c, l, t, a, a_i, t_l, a_l, m, cu = line
            padding = [0] * (length - len(c))
            chars.append(c + padding)
            ls.append(l + padding)
            tri.append(t + padding)
            arg.append(a + padding)
            arg_in.append(a_i)
            mask.append(m + [0] * (length - len(c) - 2))
            cut.append(cu)
            for i in range(length - len(c)):  # extend the offsets across the padding
                t_l.append(t_l[len(c) - 1] + i + 1)
                a_l.append(a_l[len(c) - 1] + i + 1)
            for i in range(length):           # shift all offsets into [0, 2*length-2]
                t_l[i] += length - 1
                a_l[i] += length - 1
            tri_loc.append(t_l)
            arg_loc.append(a_l)
        return [chars, ls, tri, arg, arg_in, tri_loc, arg_loc, mask, cut]

    def iter_batch(self):
        random.shuffle(self.batch_data)
        for i in range(self.len_data):
            yield self.batch_data[i]
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
import os
from config import Config


os.environ["CUDA_VISIBLE_DEVICES"] = "1"

con = Config()
con.load_testt_data()
con.set_testt_model()
con.test()
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
import os
from config import Config


os.environ["CUDA_VISIBLE_DEVICES"] = "1"

con = Config()
con.load_traint_data()
con.set_traint_model()
con.train()
--------------------------------------------------------------------------------