├── LICENSE
├── README.md
├── configs
│   ├── config.crf.setup1
│   ├── config.crf.setup2
│   ├── config.crf.setup3
│   ├── config.softmax.setup1
│   ├── config.softmax.setup2
│   └── config.softmax.setup3
├── createDataStream_setup1.py
├── createDataStream_setup2.py
├── createDataStream_setup3.py
├── evaluation.py
├── iterationSchemes.py
├── layers.py
├── train.py
├── train_setup3.py
└── utils.py

/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Heike Adel
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Description
2 | 
3 | This code was written in 2017 for the experiments presented in the
4 | paper "Global Normalization of Convolutional Neural Networks for
5 | Joint Entity and Relation Classification",
6 | which was published at EMNLP 2017.
7 | 
8 | The author of the code is Heike Adel. Some parts of the code are
9 | based on the Theano tutorials (http://deeplearning.net/tutorial/),
10 | and the CRF layer implementation is based on https://github.com/glample/tagger.
11 | 
12 | # Usage
13 | 
14 | To use the code, please follow these steps:
15 | 
16 | 1. Create a Fuel dataset:
17 | 
18 |        python createDataStream_setupX.py config
19 | 
20 | Please refer to the paper for the different setups.
21 | The config files used in the paper can be found in the folder configs.
22 | 
23 | 2. Train and evaluate the model:
24 | 
25 |        python train.py config
26 | 
27 | Use the same config file as above. train.py is used for setups 1 and 2;
28 | train_setup3.py is used for setup 3.
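For example, to run setup 1 with the globally normalized (CRF) model using the provided
configuration files (paths relative to the repository root):

       python createDataStream_setup1.py configs/config.crf.setup1
       python train.py configs/config.crf.setup1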
29 | 
30 | # Contact
31 | If you have questions, please contact heike.adel@ims.uni-stuttgart.de
32 | 
33 | # Citation
34 | 
35 | If you use the code for your work, please cite the following paper:
36 | 
37 | ```
38 | @inproceedings{globalAdel2017,
39 |   author = {Heike Adel and Hinrich Sch\"{u}tze},
40 |   title = {Global Normalization of Convolutional Neural Networks for
41 |            Joint Entity and Relation Classification},
42 |   booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
43 |   month = {September},
44 |   year = {2017},
45 |   address = {Copenhagen, Denmark},
46 |   publisher = {Association for Computational Linguistics}
47 | }
48 | ```
49 | 
--------------------------------------------------------------------------------
/configs/config.crf.setup1:
--------------------------------------------------------------------------------
1 | datafile=data/corpus_prepared.pickled
2 | file=data/fueldataset_CoNNL_setup1
3 | wordvectors=data/vecs.lc.over100freq.txt.gz
4 | batchsize=10
5 | lrate=0.1
6 | numPerBag=1
7 | numClasses=6
8 | filtersizeContext=3
9 | filtersizeEntities=2
10 | kmaxContext=3
11 | kmaxEntities=3
12 | nkernsContext=200
13 | nkernsEntities=50
14 | contextsize=120
15 | entitysize=20
16 | lambda2=0.0001
17 | net=nets_setup1/net.crf.hC100.hE50.nkC200.nkE50
18 | n_epochs=20
19 | hidden=100
20 | hiddenET=50
21 | sgd=True
22 | crf=True
23 | 
--------------------------------------------------------------------------------
/configs/config.crf.setup2:
--------------------------------------------------------------------------------
1 | datafile=data/corpus_prepared.pickled
2 | file=data/fueldataset_CoNNL_setup2
3 | wordvectors=data/vecs.lc.over100freq.txt.gz
4 | batchsize=10
5 | lrate=0.1
6 | numPerBag=1
7 | numClasses=6
8 | filtersizeContext=3
9 | filtersizeEntities=2
10 | kmaxContext=3
11 | kmaxEntities=3
12 | nkernsContext=500
13 | nkernsEntities=100
14 | contextsize=120
15 | entitysize=20
16 | lambda2=0.0001
17 | net=nets_setup2/net.crf.hC200.hE50
18 | n_epochs=20
19 | hidden=200
20 | hiddenET=50
21 | sgd=True
22 | crf=True
23 | 
--------------------------------------------------------------------------------
/configs/config.crf.setup3:
--------------------------------------------------------------------------------
1 | datafile=data/corpus_prepared.pickled
2 | file=data/fueldataset_CoNNL_setup3
3 | wordvectors=data/vecs.lc.over100freq.txt.gz
4 | batchsize=10
5 | lrate=0.1
6 | numPerBag=1
7 | numClasses=6
8 | filtersizeContext=3
9 | filtersizeEntities=2
10 | kmaxContext=3
11 | kmaxEntities=3
12 | nkernsContext=500
13 | nkernsEntities=100
14 | contextsize=120
15 | entitysize=20
16 | lambda2=0.0001
17 | net=nets_setup3/net.crf.hC100.hE50
18 | n_epochs=20
19 | hidden=100
20 | hiddenET=50
21 | sgd=True
22 | crf=True
23 | relationEvaluationMethod=relaxed
24 | 
--------------------------------------------------------------------------------
/configs/config.softmax.setup1:
--------------------------------------------------------------------------------
1 | datafile=data/corpus_prepared.pickled
2 | file=data/fueldataset_CoNNL_setup1
3 | wordvectors=data/vecs.lc.over100freq.txt.gz
4 | batchsize=10
5 | lrate=0.1
6 | numPerBag=1
7 | numClasses=6
8 | filtersizeContext=3
9 | filtersizeEntities=2
10 | kmaxContext=3
11 | kmaxEntities=3
12 | nkernsContext=500
13 | nkernsEntities=100
14 | contextsize=120
15 | entitysize=20
16 | lambda2=0.0001
17 | net=nets_setup1/net.entropy.hC100.hE50.nkC500.nkE100
18 | n_epochs=20
19 | hidden=100
20 | hiddenET=50
21 | sgd=True
22 | 
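All configuration files above use the same plain key=value format, one option per line. They are parsed by readConfig from utils.py, which is not included in this excerpt; the following minimal sketch (an assumption about that interface, not the actual implementation) shows how such a file can be turned into the string dictionary the scripts index into:

```
def read_config(path):
    """Parse a key=value config file into a dict of strings (illustrative sketch only)."""
    config = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue  # skip blank lines and comments
            key, _, value = line.partition("=")
            config[key.strip()] = value.strip()
    return config

# e.g. read_config("configs/config.crf.setup1")["nkernsContext"] == "200";
# numeric options are cast by the callers, e.g. int(config["entitysize"])
```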
-------------------------------------------------------------------------------- /configs/config.softmax.setup2: -------------------------------------------------------------------------------- 1 | datafile=data/corpus_prepared.pickled 2 | file=data/fueldataset_CoNNL_setup2 3 | wordvectors=data/vecs.lc.over100freq.txt.gz 4 | batchsize=10 5 | lrate=0.1 6 | numPerBag=1 7 | numClasses=6 8 | filtersizeContext=3 9 | filtersizeEntities=2 10 | kmaxContext=3 11 | kmaxEntities=3 12 | nkernsContext=500 13 | nkernsEntities=100 14 | contextsize=120 15 | entitysize=20 16 | lambda2=0.0001 17 | net=nets_setup2/net.entropy.hC100.hE50 18 | n_epochs=20 19 | hidden=100 20 | hiddenET=50 21 | sgd=True 22 | -------------------------------------------------------------------------------- /configs/config.softmax.setup3: -------------------------------------------------------------------------------- 1 | datafile=data/corpus_prepared.pickled 2 | file=data/fueldataset_CoNNL_setup3 3 | wordvectors=data/vecs.lc.over100freq.txt.gz 4 | batchsize=10 5 | lrate=0.1 6 | numPerBag=1 7 | numClasses=6 8 | filtersizeContext=3 9 | filtersizeEntities=2 10 | kmaxContext=3 11 | kmaxEntities=3 12 | nkernsContext=500 13 | nkernsEntities=100 14 | contextsize=120 15 | entitysize=20 16 | lambda2=0.0001 17 | net=nets_setup3/net.entropy.hC100.hE50 18 | n_epochs=20 19 | hidden=100 20 | hiddenET=50 21 | sgd=True 22 | relationEvaluationMethod=relaxed 23 | -------------------------------------------------------------------------------- /createDataStream_setup1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import re 4 | import sys 5 | import time 6 | import numpy 7 | from utils import readConfig, readIndices, getCoNNL_label2int, getMatrixForContext, adaptNumSamplesTrain, getRelID, getNerID, cleanContext, reverse 8 | import theano 9 | import pickle 10 | 11 | 12 | if len(sys.argv) != 2: 13 | print "please pass the config file as parameters" 14 | exit(0) 15 | 16 | time1 = time.time() 17 | 18 | configfile = sys.argv[1] 19 | config = readConfig(configfile) 20 | 21 | datafile = config["datafile"] 22 | 23 | if "wordvectors" in config: 24 | wordvectorfile = config["wordvectors"] 25 | print "wordvector file ", wordvectorfile 26 | wordindices = readIndices(wordvectorfile, isWord2vec = True) 27 | else: 28 | print "you have to either specify a wordvector file" 29 | exit() 30 | contextsize = 120 # maximum sentence length is 118 31 | print "contextsize ", contextsize 32 | entitysize = int(config["entitysize"]) 33 | filename = config["file"] 34 | print "filename for storing data ", filename 35 | 36 | label2int = getCoNNL_label2int() 37 | 38 | time1 = time.time() 39 | 40 | # read pickled file 41 | data_in = open(datafile, 'rb') 42 | train_id2sent = pickle.load(data_in) 43 | train_id2pos = pickle.load(data_in) 44 | train_id2ner = pickle.load(data_in) 45 | train_id2nerBILOU = pickle.load(data_in) 46 | train_id2arg2rel = pickle.load(data_in) 47 | 48 | test_id2sent = pickle.load(data_in) 49 | test_id2pos = pickle.load(data_in) 50 | test_id2ner = pickle.load(data_in) 51 | test_id2nerBILOU = pickle.load(data_in) 52 | test_id2arg2rel = pickle.load(data_in) 53 | data_in.close() 54 | 55 | def splitContext(context, curId, id2ner, id2arg2rel): 56 | contextList = context.split() 57 | curNers = id2ner[curId].split() 58 | entities = [] 59 | x1List = [] 60 | x2List = [] 61 | x3List = [] 62 | x4List = [] 63 | e1List = [] 64 | e2List = [] 65 | yList = [] 66 | yE1List = [] 67 | yE2List = [] 68 | e1IdList = 
[] 69 | e2IdList = [] 70 | i = 0 71 | while i < len(curNers): 72 | j = i + 1 73 | while j < len(curNers) and curNers[i] == curNers[j]: 74 | j += 1 75 | if curNers[i] != "O": 76 | entities.append((i, j-1)) 77 | i = j 78 | for e1Ind in range(len(entities)): 79 | for e2Ind in range(e1Ind+1, len(entities)): 80 | # entity pair (e1, e2) 81 | ent1 = entities[e1Ind] 82 | ent2 = entities[e2Ind] 83 | x1 = contextList[:ent1[0]] 84 | e1 = contextList[ent1[0]:ent1[1]+1] 85 | x2 = contextList[ent1[1]+1:] 86 | x3 = contextList[:ent2[0]] 87 | e2 = contextList[ent2[0]:ent2[1]+1] 88 | x4 = contextList[ent2[1]+1:] 89 | y = 0 90 | if (ent1[1],ent2[1]) in id2arg2rel[curId]: 91 | y = getRelID(id2arg2rel[curId][(ent1[1],ent2[1])]) 92 | yE1 = getNerID(curNers[ent1[1]]) 93 | yE2 = getNerID(curNers[ent2[1]]) 94 | x1List.append(x1) 95 | x2List.append(x2) 96 | x3List.append(x3) 97 | x4List.append(x4) 98 | e1List.append(e1) 99 | e2List.append(e2) 100 | yList.append(y) 101 | yE1List.append(yE1) 102 | yE2List.append(yE2) 103 | e1IdList.append(e1Ind) 104 | e2IdList.append(e2Ind) 105 | 106 | # entity pair (e2, e1): 107 | x1 = contextList[:ent2[0]] 108 | e1 = contextList[ent2[0]:ent2[1]+1] 109 | x2 = contextList[ent2[1]+1:] 110 | x3 = contextList[:ent1[0]] 111 | e2 = contextList[ent1[0]:ent1[1]+1] 112 | x4 = contextList[ent1[1]+1:] 113 | y = 0 114 | if (ent2[1],ent1[1]) in id2arg2rel[curId]: 115 | y = getRelID(id2arg2rel[curId][(ent2[1],ent1[1])]) 116 | yE1 = getNerID(curNers[ent2[1]]) 117 | yE2 = getNerID(curNers[ent1[1]]) 118 | x1List.append(x1) 119 | x2List.append(x2) 120 | x3List.append(x3) 121 | x4List.append(x4) 122 | e1List.append(e1) 123 | e2List.append(e2) 124 | yList.append(y) 125 | yE1List.append(yE1) 126 | yE2List.append(yE2) 127 | e1IdList.append(e2Ind) 128 | e2IdList.append(e1Ind) 129 | return x1List, x2List, x3List, x4List, e1List, e2List, yList, yE1List, yE2List, e1IdList, e2IdList 130 | 131 | def processSamples(id2sent, id2ner, id2arg2rel, wordindices): 132 | x1List = [] 133 | x2List = [] 134 | x3List = [] 135 | x4List = [] 136 | e1List = [] 137 | e2List = [] 138 | yList = [] 139 | yE1List = [] 140 | yE2List = [] 141 | idList = [] 142 | e1IdList = [] 143 | e2IdList = [] 144 | 145 | for curId in id2sent: 146 | context = id2sent[curId] 147 | curX1, curX2, curX3, curX4, curE1, curE2, curYrel, curY1et, curY2et, curE1Id, curE2Id = splitContext(context, curId, id2ner, id2arg2rel) 148 | 149 | for ex in range(len(curX1)): 150 | curX1[ex] = cleanContext(curX1[ex]) 151 | curX2[ex] = cleanContext(curX2[ex]) 152 | curX3[ex] = cleanContext(curX3[ex]) 153 | curX4[ex] = cleanContext(curX4[ex]) 154 | 155 | matrixX1 = getMatrixForContext(curX1[ex], contextsize, wordindices) 156 | matrixX1 = numpy.reshape(matrixX1, contextsize) 157 | matrixX2 = getMatrixForContext(curX2[ex], contextsize, wordindices) 158 | matrixX2 = numpy.reshape(matrixX2, contextsize) 159 | matrixX3 = getMatrixForContext(curX3[ex], contextsize, wordindices) 160 | matrixX3 = numpy.reshape(matrixX3, contextsize) 161 | matrixX4 = getMatrixForContext(curX4[ex], contextsize, wordindices) 162 | matrixX4 = numpy.reshape(matrixX4, contextsize) 163 | 164 | matrixE1 = getMatrixForContext(curE1[ex], entitysize, wordindices) 165 | matrixE1 = numpy.reshape(matrixE1, entitysize) 166 | matrixE2 = getMatrixForContext(curE2[ex], entitysize, wordindices) 167 | matrixE2 = numpy.reshape(matrixE2, entitysize) 168 | 169 | x1List.append(matrixX1) 170 | x2List.append(matrixX2) 171 | x3List.append(matrixX3) 172 | x4List.append(matrixX4) 173 | e1List.append(matrixE1) 174 | 
e2List.append(matrixE2) 175 | yList.append(curYrel[ex]) 176 | yE1List.append(curY1et[ex]) 177 | yE2List.append(curY2et[ex]) 178 | idList.append(curId) 179 | e1IdList.append(curE1Id[ex]) 180 | e2IdList.append(curE2Id[ex]) 181 | 182 | x1_numpy = numpy.array(x1List) 183 | x2_numpy = numpy.array(x2List) 184 | x3_numpy = numpy.array(x3List) 185 | x4_numpy = numpy.array(x4List) 186 | e1_numpy = numpy.array(e1List) 187 | e2_numpy = numpy.array(e2List) 188 | y_numpy = numpy.array(yList) 189 | yE1_numpy = numpy.array(yE1List) 190 | yE2_numpy = numpy.array(yE2List) 191 | id_numpy = numpy.array(idList) 192 | e1Id_numpy = numpy.array(e1IdList) 193 | e2Id_numpy = numpy.array(e2IdList) 194 | 195 | return x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, y_numpy, yE1_numpy, yE2_numpy, id_numpy, e1Id_numpy, e2Id_numpy 196 | 197 | 198 | x1Train, x2Train, x3Train, x4Train, e1Train, e2Train, yTrain, yE1Train, yE2Train, idTrain, e1IdTrain, e2IdTrain = processSamples(train_id2sent, train_id2ner, train_id2arg2rel, wordindices) 199 | numSamples = x1Train.shape[0] 200 | 201 | x1Test, x2Test, x3Test, x4Test, e1Test, e2Test, yTest, yE1Test, yE2Test, idTest, e1IdTest, e2IdTest = processSamples(test_id2sent, test_id2ner, test_id2arg2rel, wordindices) 202 | numSamplesTest = x1Test.shape[0] 203 | 204 | time2 = time.time() 205 | print "time for reading data: " + str(time2 - time1) 206 | 207 | dt = theano.config.floatX 208 | 209 | # split train into train and dev 210 | numSamplesTrain = int(0.8 * numSamples) 211 | # don't split same sentence id into train and dev 212 | numSamplesTrain = adaptNumSamplesTrain(numSamplesTrain, idTrain) 213 | print "samples for training: ", numSamplesTrain 214 | numSamplesDev = numSamples - numSamplesTrain 215 | print "samples for development: ", numSamplesDev 216 | numSamplesTotal = numSamplesTrain + numSamplesDev + numSamplesTest 217 | 218 | x1Dev = x1Train[numSamplesTrain:] 219 | x1Train = x1Train[:numSamplesTrain] 220 | x2Dev = x2Train[numSamplesTrain:] 221 | x2Train = x2Train[:numSamplesTrain] 222 | x3Dev = x3Train[numSamplesTrain:] 223 | x3Train = x3Train[:numSamplesTrain] 224 | x4Dev = x4Train[numSamplesTrain:] 225 | x4Train = x4Train[:numSamplesTrain] 226 | yDev = yTrain[numSamplesTrain:] 227 | yTrain = yTrain[:numSamplesTrain] 228 | yE1Dev = yE1Train[numSamplesTrain:] 229 | yE1Train = yE1Train[:numSamplesTrain] 230 | yE2Dev = yE2Train[numSamplesTrain:] 231 | yE2Train = yE2Train[:numSamplesTrain] 232 | e1Dev = e1Train[numSamplesTrain:] 233 | e1Train = e1Train[:numSamplesTrain] 234 | e2Dev = e2Train[numSamplesTrain:] 235 | e2Train = e2Train[:numSamplesTrain] 236 | idDev = idTrain[numSamplesTrain:] 237 | idTrain = idTrain[:numSamplesTrain] 238 | e1IdDev = e1IdTrain[numSamplesTrain:] 239 | e1IdTrain = e1IdTrain[:numSamplesTrain] 240 | e2IdDev = e2IdTrain[numSamplesTrain:] 241 | e2IdTrain = e2IdTrain[:numSamplesTrain] 242 | 243 | 244 | ################ FUEL ################# 245 | import h5py 246 | from fuel.datasets.hdf5 import H5PYDataset 247 | 248 | f = h5py.File(filename, mode='w') 249 | 250 | feat_x1 = f.create_dataset('x1', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 251 | feat_x2 = f.create_dataset('x2', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 252 | feat_x3 = f.create_dataset('x3', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 253 | feat_x4 = f.create_dataset('x4', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), 
compression='gzip') 254 | feat_e1 = f.create_dataset('e1', (numSamplesTotal, entitysize), dtype=numpy.dtype(numpy.int32), compression='gzip') 255 | feat_e2 = f.create_dataset('e2', (numSamplesTotal, entitysize), dtype=numpy.dtype(numpy.int32), compression='gzip') 256 | label_y = f.create_dataset('y', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 257 | label_y1ET = f.create_dataset('y1ET', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 258 | label_y2ET = f.create_dataset('y2ET', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 259 | sent_id = f.create_dataset('sent_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 260 | e1_id = f.create_dataset('e1_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 261 | e2_id = f.create_dataset('e2_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 262 | 263 | feat_x1[...] = numpy.vstack([x1Train, x1Dev, x1Test]).reshape(numSamplesTotal, contextsize) 264 | feat_x2[...] = numpy.vstack([x2Train, x2Dev, x2Test]).reshape(numSamplesTotal, contextsize) 265 | feat_x3[...] = numpy.vstack([x3Train, x3Dev, x3Test]).reshape(numSamplesTotal, contextsize) 266 | feat_x4[...] = numpy.vstack([x4Train, x4Dev, x4Test]).reshape(numSamplesTotal, contextsize) 267 | feat_e1[...] = numpy.vstack([e1Train, e1Dev, e1Test]).reshape(numSamplesTotal, entitysize) 268 | feat_e2[...] = numpy.vstack([e2Train, e2Dev, e2Test]).reshape(numSamplesTotal, entitysize) 269 | label_y[...] = numpy.vstack([yTrain.reshape(numSamplesTrain, 1), yDev.reshape(numSamplesDev, 1), yTest.reshape(numSamplesTest, 1)]) #.reshape(numSamplesTotal, 1) 270 | label_y1ET[...] = numpy.vstack([yE1Train.reshape(numSamplesTrain, 1), yE1Dev.reshape(numSamplesDev, 1), yE1Test.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 271 | label_y2ET[...] = numpy.vstack([yE2Train.reshape(numSamplesTrain, 1), yE2Dev.reshape(numSamplesDev, 1), yE2Test.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 272 | sent_id[...] = numpy.vstack([idTrain.reshape(numSamplesTrain, 1), idDev.reshape(numSamplesDev, 1), idTest.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 273 | e1_id[...] = numpy.vstack([e1IdTrain.reshape(numSamplesTrain, 1), e1IdDev.reshape(numSamplesDev, 1), e1IdTest.reshape(numSamplesTest, 1)]) 274 | e2_id[...] 
= numpy.vstack([e2IdTrain.reshape(numSamplesTrain, 1), e2IdDev.reshape(numSamplesDev, 1), e2IdTest.reshape(numSamplesTest, 1)]) 275 | 276 | start_train = 0 277 | end_train = start_train + numSamplesTrain 278 | start_dev = end_train 279 | end_dev = start_dev + numSamplesDev 280 | start_test = end_dev 281 | end_test = start_test + numSamplesTest 282 | 283 | split_dict = {'train' : 284 | {'x1':(start_train,end_train), 'x2':(start_train,end_train), 285 | 'x3':(start_train,end_train), 'x4':(start_train,end_train), 286 | 'e1':(start_train,end_train), 'e2':(start_train,end_train), 287 | 'y':(start_train,end_train), 'y1ET': (start_train,end_train), 288 | 'y2ET':(start_train,end_train), 'sent_id':(start_train,end_train), 289 | 'e1_id':(start_train,end_train), 'e2_id':(start_train,end_train)}, 290 | 'dev' : 291 | {'x1':(start_dev,end_dev), 'x2':(start_dev,end_dev), 292 | 'x3':(start_dev,end_dev), 'x4':(start_dev,end_dev), 293 | 'e1':(start_dev,end_dev), 'e2': (start_dev,end_dev), 294 | 'y':(start_dev,end_dev), 'y1ET':(start_dev,end_dev), 295 | 'y2ET': (start_dev,end_dev), 'sent_id':(start_dev,end_dev), 296 | 'e1_id':(start_dev,end_dev), 'e2_id':(start_dev,end_dev)}, 297 | 'test' : 298 | {'x1':(start_test,end_test), 'x2':(start_test,end_test), 299 | 'x3':(start_test,end_test), 'x4':(start_test,end_test), 300 | 'e1':(start_test,end_test), 'e2': (start_test,end_test), 301 | 'y':(start_test,end_test), 'y1ET':(start_test,end_test), 302 | 'y2ET': (start_test,end_test), 'sent_id':(start_test,end_test), 303 | 'e1_id':(start_test,end_test), 'e2_id':(start_test,end_test)}} 304 | 305 | f.attrs['split'] = H5PYDataset.create_split_array(split_dict) 306 | 307 | f.flush() 308 | f.close() 309 | -------------------------------------------------------------------------------- /createDataStream_setup2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import re 4 | import sys 5 | import time 6 | import numpy 7 | from utils import readConfig, readIndices, getCoNNL_label2int, getMatrixForContext, adaptNumSamplesTrain, getRelID, getNerID, cleanContext, reverse 8 | import theano 9 | import pickle 10 | import random 11 | 12 | random.seed(123455) 13 | 14 | def doSubsampling(): 15 | return random.sample([0] + [1] * 9, 1)[0] 16 | 17 | 18 | if len(sys.argv) != 2: 19 | print "please pass the config file as parameters" 20 | exit(0) 21 | 22 | time1 = time.time() 23 | 24 | configfile = sys.argv[1] 25 | config = readConfig(configfile) 26 | 27 | datafile = config["datafile"] 28 | 29 | if "wordvectors" in config: 30 | wordvectorfile = config["wordvectors"] 31 | print "wordvector file ", wordvectorfile 32 | wordindices = readIndices(wordvectorfile, isWord2vec = True) 33 | else: 34 | print "you have to either specify a wordvector file" 35 | exit() 36 | contextsize = 120 # maximum sentence length is 118 37 | print "contextsize ", contextsize 38 | entitysize = int(config["entitysize"]) 39 | filename = config["file"] 40 | print "filename for storing data ", filename 41 | 42 | label2int = getCoNNL_label2int() 43 | 44 | time1 = time.time() 45 | 46 | # read pickled file 47 | data_in = open(datafile, 'rb') 48 | train_id2sent = pickle.load(data_in) 49 | train_id2pos = pickle.load(data_in) 50 | train_id2ner = pickle.load(data_in) 51 | train_id2nerBILOU = pickle.load(data_in) 52 | train_id2arg2rel = pickle.load(data_in) 53 | 54 | test_id2sent = pickle.load(data_in) 55 | test_id2pos = pickle.load(data_in) 56 | test_id2ner = pickle.load(data_in) 57 | test_id2nerBILOU = 
pickle.load(data_in) 58 | test_id2arg2rel = pickle.load(data_in) 59 | data_in.close() 60 | 61 | def splitContext(context, curId, id2ner, id2arg2rel): 62 | contextList = context.split() 63 | curNers = id2ner[curId].split() 64 | entities = [] 65 | x1List = [] 66 | x2List = [] 67 | x3List = [] 68 | x4List = [] 69 | e1List = [] 70 | e2List = [] 71 | yList = [] 72 | yE1List = [] 73 | yE2List = [] 74 | e1IdList = [] 75 | e2IdList = [] 76 | i = 0 77 | while i < len(curNers): 78 | j = i + 1 79 | while j < len(curNers) and curNers[i] == curNers[j] and curNers[i] != "O": 80 | j += 1 81 | entities.append((i, j-1)) 82 | i = j 83 | for e1Ind in range(len(entities)): 84 | for e2Ind in range(e1Ind+1, len(entities)): 85 | ent1 = entities[e1Ind] 86 | ent2 = entities[e2Ind] 87 | x1 = contextList[:ent1[0]] 88 | e1 = contextList[ent1[0]:ent1[1]+1] 89 | x2 = contextList[ent1[1]+1:] 90 | x3 = contextList[:ent2[0]] 91 | e2 = contextList[ent2[0]:ent2[1]+1] 92 | x4 = contextList[ent2[1]+1:] 93 | y = 0 94 | if (ent1[1],ent2[1]) in id2arg2rel[curId]: 95 | y = getRelID(id2arg2rel[curId][(ent1[1],ent2[1])]) 96 | elif (ent2[1],ent1[1]) in id2arg2rel[curId]: 97 | y = getRelID(id2arg2rel[curId][(ent2[1],ent1[1])]) 98 | yE1 = getNerID(curNers[ent1[1]]) 99 | yE2 = getNerID(curNers[ent2[1]]) 100 | x1List.append(x1) 101 | x2List.append(x2) 102 | x3List.append(x3) 103 | x4List.append(x4) 104 | e1List.append(e1) 105 | e2List.append(e2) 106 | yList.append(y) 107 | yE1List.append(yE1) 108 | yE2List.append(yE2) 109 | e1IdList.append(e1Ind) 110 | e2IdList.append(e2Ind) 111 | return x1List, x2List, x3List, x4List, e1List, e2List, yList, yE1List, yE2List, e1IdList, e2IdList 112 | 113 | def processSamples(id2sent, id2ner, id2arg2rel, wordindices, subsampling = False): 114 | x1List = [] 115 | x2List = [] 116 | x3List = [] 117 | x4List = [] 118 | e1List = [] 119 | e2List = [] 120 | yList = [] 121 | yE1List = [] 122 | yE2List = [] 123 | idList = [] 124 | e1IdList = [] 125 | e2IdList = [] 126 | 127 | for curId in id2sent: 128 | context = id2sent[curId] 129 | curX1, curX2, curX3, curX4, curE1, curE2, curYrel, curY1et, curY2et, curE1Id, curE2Id = splitContext(context, curId, id2ner, id2arg2rel) 130 | 131 | for ex in range(len(curX1)): 132 | curX1[ex] = cleanContext(curX1[ex]) 133 | curX2[ex] = cleanContext(curX2[ex]) 134 | curX3[ex] = cleanContext(curX3[ex]) 135 | curX4[ex] = cleanContext(curX4[ex]) 136 | 137 | matrixX1 = getMatrixForContext(curX1[ex], contextsize, wordindices) 138 | matrixX1 = numpy.reshape(matrixX1, contextsize) 139 | matrixX2 = getMatrixForContext(curX2[ex], contextsize, wordindices) 140 | matrixX2 = numpy.reshape(matrixX2, contextsize) 141 | matrixX3 = getMatrixForContext(curX3[ex], contextsize, wordindices) 142 | matrixX3 = numpy.reshape(matrixX3, contextsize) 143 | matrixX4 = getMatrixForContext(curX4[ex], contextsize, wordindices) 144 | matrixX4 = numpy.reshape(matrixX4, contextsize) 145 | 146 | matrixE1 = getMatrixForContext(curE1[ex], entitysize, wordindices) 147 | matrixE1 = numpy.reshape(matrixE1, entitysize) 148 | matrixE2 = getMatrixForContext(curE2[ex], entitysize, wordindices) 149 | matrixE2 = numpy.reshape(matrixE2, entitysize) 150 | 151 | addExample = True 152 | if subsampling: 153 | if curYrel[ex] == 0 and curY1et[ex] == 0 and curY2et[ex] == 0: 154 | subs = doSubsampling() 155 | if subs == 1: 156 | addExample = False 157 | 158 | if addExample: 159 | x1List.append(matrixX1) 160 | x2List.append(matrixX2) 161 | x3List.append(matrixX3) 162 | x4List.append(matrixX4) 163 | e1List.append(matrixE1) 164 | 
e2List.append(matrixE2) 165 | yList.append(curYrel[ex]) 166 | yE1List.append(curY1et[ex]) 167 | yE2List.append(curY2et[ex]) 168 | idList.append(curId) 169 | e1IdList.append(curE1Id[ex]) 170 | e2IdList.append(curE2Id[ex]) 171 | 172 | x1_numpy = numpy.array(x1List) 173 | x2_numpy = numpy.array(x2List) 174 | x3_numpy = numpy.array(x3List) 175 | x4_numpy = numpy.array(x4List) 176 | e1_numpy = numpy.array(e1List) 177 | e2_numpy = numpy.array(e2List) 178 | y_numpy = numpy.array(yList) 179 | yE1_numpy = numpy.array(yE1List) 180 | yE2_numpy = numpy.array(yE2List) 181 | id_numpy = numpy.array(idList) 182 | e1Id_numpy = numpy.array(e1IdList) 183 | e2Id_numpy = numpy.array(e2IdList) 184 | 185 | return x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, y_numpy, yE1_numpy, yE2_numpy, id_numpy, e1Id_numpy, e2Id_numpy 186 | 187 | x1Train, x2Train, x3Train, x4Train, e1Train, e2Train, yTrain, yE1Train, yE2Train, idTrain, e1IdTrain, e2IdTrain = processSamples(train_id2sent, train_id2ner, train_id2arg2rel, wordindices, subsampling = True) 188 | numSamples = x1Train.shape[0] 189 | 190 | x1Test, x2Test, x3Test, x4Test, e1Test, e2Test, yTest, yE1Test, yE2Test, idTest, e1IdTest, e2IdTest = processSamples(test_id2sent, test_id2ner, test_id2arg2rel, wordindices) 191 | numSamplesTest = x1Test.shape[0] 192 | 193 | time2 = time.time() 194 | print "time for reading data: " + str(time2 - time1) 195 | 196 | dt = theano.config.floatX 197 | 198 | # split train into train and dev 199 | numSamplesTrain = int(0.8 * numSamples) 200 | # don't split same sentence id into train and dev 201 | numSamplesTrain = adaptNumSamplesTrain(numSamplesTrain, idTrain) 202 | print "samples for training: ", numSamplesTrain 203 | numSamplesDev = numSamples - numSamplesTrain 204 | print "samples for development: ", numSamplesDev 205 | numSamplesTotal = numSamplesTrain + numSamplesDev + numSamplesTest 206 | 207 | x1Dev = x1Train[numSamplesTrain:] 208 | x1Train = x1Train[:numSamplesTrain] 209 | x2Dev = x2Train[numSamplesTrain:] 210 | x2Train = x2Train[:numSamplesTrain] 211 | x3Dev = x3Train[numSamplesTrain:] 212 | x3Train = x3Train[:numSamplesTrain] 213 | x4Dev = x4Train[numSamplesTrain:] 214 | x4Train = x4Train[:numSamplesTrain] 215 | yDev = yTrain[numSamplesTrain:] 216 | yTrain = yTrain[:numSamplesTrain] 217 | yE1Dev = yE1Train[numSamplesTrain:] 218 | yE1Train = yE1Train[:numSamplesTrain] 219 | yE2Dev = yE2Train[numSamplesTrain:] 220 | yE2Train = yE2Train[:numSamplesTrain] 221 | e1Dev = e1Train[numSamplesTrain:] 222 | e1Train = e1Train[:numSamplesTrain] 223 | e2Dev = e2Train[numSamplesTrain:] 224 | e2Train = e2Train[:numSamplesTrain] 225 | idDev = idTrain[numSamplesTrain:] 226 | idTrain = idTrain[:numSamplesTrain] 227 | e1IdDev = e1IdTrain[numSamplesTrain:] 228 | e1IdTrain = e1IdTrain[:numSamplesTrain] 229 | e2IdDev = e2IdTrain[numSamplesTrain:] 230 | e2IdTrain = e2IdTrain[:numSamplesTrain] 231 | 232 | ################ FUEL ################# 233 | import h5py 234 | from fuel.datasets.hdf5 import H5PYDataset 235 | 236 | f = h5py.File(filename, mode='w') 237 | 238 | feat_x1 = f.create_dataset('x1', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 239 | feat_x2 = f.create_dataset('x2', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 240 | feat_x3 = f.create_dataset('x3', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 241 | feat_x4 = f.create_dataset('x4', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), 
compression='gzip') 242 | feat_e1 = f.create_dataset('e1', (numSamplesTotal, entitysize), dtype=numpy.dtype(numpy.int32), compression='gzip') 243 | feat_e2 = f.create_dataset('e2', (numSamplesTotal, entitysize), dtype=numpy.dtype(numpy.int32), compression='gzip') 244 | label_y = f.create_dataset('y', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 245 | label_y1ET = f.create_dataset('y1ET', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 246 | label_y2ET = f.create_dataset('y2ET', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 247 | sent_id = f.create_dataset('sent_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 248 | e1_id = f.create_dataset('e1_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 249 | e2_id = f.create_dataset('e2_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 250 | 251 | feat_x1[...] = numpy.vstack([x1Train, x1Dev, x1Test]).reshape(numSamplesTotal, contextsize) 252 | feat_x2[...] = numpy.vstack([x2Train, x2Dev, x2Test]).reshape(numSamplesTotal, contextsize) 253 | feat_x3[...] = numpy.vstack([x3Train, x3Dev, x3Test]).reshape(numSamplesTotal, contextsize) 254 | feat_x4[...] = numpy.vstack([x4Train, x4Dev, x4Test]).reshape(numSamplesTotal, contextsize) 255 | feat_e1[...] = numpy.vstack([e1Train, e1Dev, e1Test]).reshape(numSamplesTotal, entitysize) 256 | feat_e2[...] = numpy.vstack([e2Train, e2Dev, e2Test]).reshape(numSamplesTotal, entitysize) 257 | label_y[...] = numpy.vstack([yTrain.reshape(numSamplesTrain, 1), yDev.reshape(numSamplesDev, 1), yTest.reshape(numSamplesTest, 1)]) #.reshape(numSamplesTotal, 1) 258 | label_y1ET[...] = numpy.vstack([yE1Train.reshape(numSamplesTrain, 1), yE1Dev.reshape(numSamplesDev, 1), yE1Test.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 259 | label_y2ET[...] = numpy.vstack([yE2Train.reshape(numSamplesTrain, 1), yE2Dev.reshape(numSamplesDev, 1), yE2Test.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 260 | sent_id[...] = numpy.vstack([idTrain.reshape(numSamplesTrain, 1), idDev.reshape(numSamplesDev, 1), idTest.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 261 | e1_id[...] = numpy.vstack([e1IdTrain.reshape(numSamplesTrain, 1), e1IdDev.reshape(numSamplesDev, 1), e1IdTest.reshape(numSamplesTest, 1)]) 262 | e2_id[...] 
= numpy.vstack([e2IdTrain.reshape(numSamplesTrain, 1), e2IdDev.reshape(numSamplesDev, 1), e2IdTest.reshape(numSamplesTest, 1)]) 263 | 264 | start_train = 0 265 | end_train = start_train + numSamplesTrain 266 | start_dev = end_train 267 | end_dev = start_dev + numSamplesDev 268 | start_test = end_dev 269 | end_test = start_test + numSamplesTest 270 | 271 | split_dict = {'train' : 272 | {'x1':(start_train,end_train), 'x2':(start_train,end_train), 273 | 'x3':(start_train,end_train), 'x4':(start_train,end_train), 274 | 'e1':(start_train,end_train), 'e2':(start_train,end_train), 275 | 'y':(start_train,end_train), 'y1ET': (start_train,end_train), 276 | 'y2ET':(start_train,end_train), 'sent_id':(start_train,end_train), 277 | 'e1_id':(start_train,end_train), 'e2_id':(start_train,end_train)}, 278 | 'dev' : 279 | {'x1':(start_dev,end_dev), 'x2':(start_dev,end_dev), 280 | 'x3':(start_dev,end_dev), 'x4':(start_dev,end_dev), 281 | 'e1':(start_dev,end_dev), 'e2': (start_dev,end_dev), 282 | 'y':(start_dev,end_dev), 'y1ET':(start_dev,end_dev), 283 | 'y2ET': (start_dev,end_dev), 'sent_id':(start_dev,end_dev), 284 | 'e1_id':(start_dev,end_dev), 'e2_id':(start_dev,end_dev)}, 285 | 'test' : 286 | {'x1':(start_test,end_test), 'x2':(start_test,end_test), 287 | 'x3':(start_test,end_test), 'x4':(start_test,end_test), 288 | 'e1':(start_test,end_test), 'e2': (start_test,end_test), 289 | 'y':(start_test,end_test), 'y1ET':(start_test,end_test), 290 | 'y2ET': (start_test,end_test), 'sent_id':(start_test,end_test), 291 | 'e1_id':(start_test,end_test), 'e2_id':(start_test,end_test)}} 292 | 293 | f.attrs['split'] = H5PYDataset.create_split_array(split_dict) 294 | 295 | f.flush() 296 | f.close() 297 | -------------------------------------------------------------------------------- /createDataStream_setup3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import re 4 | import sys 5 | import time 6 | import numpy 7 | from utils import readConfig, readIndices, getCoNNL_label2int, getMatrixForContext, adaptNumSamplesTrain, getRelID, getNerID, cleanContext, reverse 8 | import theano 9 | import pickle 10 | import random 11 | 12 | random.seed(123455) 13 | 14 | def doSubsampling(): 15 | return random.sample([0] + [1] * 9, 1)[0] 16 | 17 | if len(sys.argv) != 2: 18 | print "please pass the config file as parameters" 19 | exit(0) 20 | 21 | time1 = time.time() 22 | 23 | configfile = sys.argv[1] 24 | config = readConfig(configfile) 25 | 26 | datafile = config["datafile"] 27 | 28 | if "wordvectors" in config: 29 | wordvectorfile = config["wordvectors"] 30 | print "wordvector file ", wordvectorfile 31 | wordindices = readIndices(wordvectorfile, isWord2vec = True) 32 | else: 33 | print "you have to either specify a wordvector file" 34 | exit() 35 | contextsize = 120 # maximum sentence length is 118 36 | print "contextsize ", contextsize 37 | entitysize = int(config["entitysize"]) 38 | filename = config["file"] 39 | print "filename for storing data ", filename 40 | 41 | label2int = getCoNNL_label2int() 42 | 43 | time1 = time.time() 44 | 45 | # read pickled file 46 | data_in = open(datafile, 'rb') 47 | train_id2sent = pickle.load(data_in) 48 | train_id2pos = pickle.load(data_in) 49 | train_id2ner = pickle.load(data_in) 50 | train_id2nerBILOU = pickle.load(data_in) 51 | train_id2arg2rel = pickle.load(data_in) 52 | 53 | test_id2sent = pickle.load(data_in) 54 | test_id2pos = pickle.load(data_in) 55 | test_id2ner = pickle.load(data_in) 56 | test_id2nerBILOU = pickle.load(data_in) 
57 | test_id2arg2rel = pickle.load(data_in) 58 | data_in.close() 59 | 60 | sentId2newIndex2OldIndex = {} 61 | 62 | def splitContext(context, curId, id2ner, id2arg2rel): 63 | global sentId2newIndex2OldIndex 64 | contextList = context.split() 65 | curNers = id2ner[curId].split() 66 | entities = [] 67 | x1List = [] 68 | x2List = [] 69 | x3List = [] 70 | x4List = [] 71 | e1List = [] 72 | e2List = [] 73 | yList = [] 74 | yE1List = [] 75 | yE2List = [] 76 | e1IdListTmp = [] 77 | e2IdListTmp = [] 78 | e1IdList = [] 79 | e2IdList = [] 80 | processedMultiEntities = set() 81 | i = 0 82 | while i < len(curNers): 83 | j = i + 1 84 | while j < len(curNers) and curNers[i] == curNers[j] and curNers[i] != "O": 85 | j += 1 86 | entities.append((i, j-1)) 87 | i = j 88 | for e1Ind in range(len(entities)): 89 | for e2Ind in range(e1Ind+1, len(entities)): 90 | ent1 = entities[e1Ind] 91 | ent2 = entities[e2Ind] 92 | x1 = contextList[:ent1[0]] 93 | e1 = contextList[ent1[0]:ent1[1]+1] 94 | x2 = contextList[ent1[1]+1:] 95 | x3 = contextList[:ent2[0]] 96 | e2 = contextList[ent2[0]:ent2[1]+1] 97 | x4 = contextList[ent2[1]+1:] 98 | y = 0 99 | if (ent1[1],ent2[1]) in id2arg2rel[curId]: 100 | y = getRelID(id2arg2rel[curId][(ent1[1],ent2[1])]) 101 | elif (ent2[1],ent1[1]) in id2arg2rel[curId]: 102 | y = getRelID(id2arg2rel[curId][(ent2[1],ent1[1])]) 103 | yE1 = getNerID(curNers[ent1[1]]) 104 | yE2 = getNerID(curNers[ent2[1]]) 105 | # create different entries for the different words of each entity 106 | for e1index in range(len(e1)): 107 | for e2index in range(len(e2)): 108 | e1part = e1[e1index] 109 | e2part = e2[e2index] 110 | x1part = x1 + e1[:e1index] 111 | x2part = e1[e1index+1:] + x2 112 | x3part = x3 + e2[:e2index] 113 | x4part = e2[e2index+1:] + x4 114 | x1List.append(x1part) 115 | x2List.append(x2part) 116 | x3List.append(x3part) 117 | x4List.append(x4part) 118 | e1List.append([e1part]) 119 | e2List.append([e2part]) 120 | yList.append(y) # append all possible options for training 121 | yE1List.append(yE1) 122 | yE2List.append(yE2) 123 | e1IdListTmp.append(str(e1Ind) + "_" + str(e1index)) 124 | e2IdListTmp.append(str(e2Ind) + "_" + str(e2index)) 125 | if not e1Ind in processedMultiEntities: 126 | for e1index in range(len(e1)): 127 | for e2index in range(e1index + 1, len(e1)): 128 | e1part = e1[e1index] 129 | e2part = e1[e2index] 130 | x1part = x1 + e1[:e1index] 131 | x2part = e1[e1index+1:] + x2 132 | x3part = x1 + e1[:e2index] 133 | x4part = e1[e2index+1:] + x2 134 | x1List.append(x1part) 135 | x2List.append(x2part) 136 | x3List.append(x3part) 137 | x4List.append(x4part) 138 | e1List.append([e1part]) 139 | e2List.append([e2part]) 140 | yList.append(y) # append all possible options for training 141 | yE1List.append(yE1) 142 | yE2List.append(yE1) 143 | e1IdListTmp.append(str(e1Ind) + "_" + str(e1index)) 144 | e2IdListTmp.append(str(e1Ind) + "_" + str(e2index)) 145 | processedMultiEntities.add(e1Ind) 146 | entitiesAll = e1IdListTmp + e2IdListTmp 147 | entitiesAllSorted = sorted(entitiesAll) 148 | ent2newIndex = {} # Attention: new entity indices will not be correlated with word indices in the sentence! they are just for scoring correctly! 
149 | newIndex = 0 150 | for ea in entitiesAllSorted: 151 | if not ea in ent2newIndex: 152 | ent2newIndex[ea] = newIndex 153 | if not curId in sentId2newIndex2OldIndex: 154 | sentId2newIndex2OldIndex[curId] = {} 155 | sentId2newIndex2OldIndex[curId][newIndex] = ea 156 | newIndex += 1 157 | for ei1 in e1IdListTmp: 158 | e1IdList.append(ent2newIndex[ei1]) 159 | for ei2 in e2IdListTmp: 160 | e2IdList.append(ent2newIndex[ei2]) 161 | return x1List, x2List, x3List, x4List, e1List, e2List, yList, yE1List, yE2List, e1IdList, e2IdList 162 | 163 | def processSamples(id2sent, id2ner, id2arg2rel, wordindices, subsampling = False): 164 | x1List = [] 165 | x2List = [] 166 | x3List = [] 167 | x4List = [] 168 | e1List = [] 169 | e2List = [] 170 | yList = [] 171 | yE1List = [] 172 | yE2List = [] 173 | idList = [] 174 | e1IdList = [] 175 | e2IdList = [] 176 | 177 | for curId in id2sent: 178 | context = id2sent[curId] 179 | curX1, curX2, curX3, curX4, curE1, curE2, curYrel, curY1et, curY2et, curE1Id, curE2Id = splitContext(context, curId, id2ner, id2arg2rel) 180 | 181 | for ex in range(len(curX1)): 182 | curX1[ex] = cleanContext(curX1[ex]) 183 | curX2[ex] = cleanContext(curX2[ex]) 184 | curX3[ex] = cleanContext(curX3[ex]) 185 | curX4[ex] = cleanContext(curX4[ex]) 186 | 187 | matrixX1 = getMatrixForContext(curX1[ex], contextsize, wordindices) 188 | matrixX1 = numpy.reshape(matrixX1, contextsize) 189 | matrixX2 = getMatrixForContext(curX2[ex], contextsize, wordindices) 190 | matrixX2 = numpy.reshape(matrixX2, contextsize) 191 | matrixX3 = getMatrixForContext(curX3[ex], contextsize, wordindices) 192 | matrixX3 = numpy.reshape(matrixX3, contextsize) 193 | matrixX4 = getMatrixForContext(curX4[ex], contextsize, wordindices) 194 | matrixX4 = numpy.reshape(matrixX4, contextsize) 195 | 196 | matrixE1 = getMatrixForContext(curE1[ex], entitysize, wordindices) 197 | matrixE1 = numpy.reshape(matrixE1, entitysize) 198 | matrixE2 = getMatrixForContext(curE2[ex], entitysize, wordindices) 199 | matrixE2 = numpy.reshape(matrixE2, entitysize) 200 | 201 | addExample = True 202 | if subsampling: 203 | if curYrel[ex] == 0 and curY1et[ex] == 0 and curY2et[ex] == 0: 204 | subs = doSubsampling() 205 | if subs == 1: 206 | addExample = False 207 | 208 | if addExample: 209 | x1List.append(matrixX1) 210 | x2List.append(matrixX2) 211 | x3List.append(matrixX3) 212 | x4List.append(matrixX4) 213 | e1List.append(matrixE1) 214 | e2List.append(matrixE2) 215 | yList.append(curYrel[ex]) 216 | yE1List.append(curY1et[ex]) 217 | yE2List.append(curY2et[ex]) 218 | idList.append(curId) 219 | e1IdList.append(curE1Id[ex]) 220 | e2IdList.append(curE2Id[ex]) 221 | 222 | x1_numpy = numpy.array(x1List) 223 | x2_numpy = numpy.array(x2List) 224 | x3_numpy = numpy.array(x3List) 225 | x4_numpy = numpy.array(x4List) 226 | e1_numpy = numpy.array(e1List) 227 | e2_numpy = numpy.array(e2List) 228 | y_numpy = numpy.array(yList) 229 | yE1_numpy = numpy.array(yE1List) 230 | yE2_numpy = numpy.array(yE2List) 231 | id_numpy = numpy.array(idList) 232 | e1Id_numpy = numpy.array(e1IdList) 233 | e2Id_numpy = numpy.array(e2IdList) 234 | 235 | return x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, y_numpy, yE1_numpy, yE2_numpy, id_numpy, e1Id_numpy, e2Id_numpy 236 | 237 | 238 | x1Train, x2Train, x3Train, x4Train, e1Train, e2Train, yTrain, yE1Train, yE2Train, idTrain, e1IdTrain, e2IdTrain = processSamples(train_id2sent, train_id2ner, train_id2arg2rel, wordindices, subsampling = True) 239 | numSamples = x1Train.shape[0] 240 | 241 | x1Test, x2Test, x3Test, x4Test, 
e1Test, e2Test, yTest, yE1Test, yE2Test, idTest, e1IdTest, e2IdTest = processSamples(test_id2sent, test_id2ner, test_id2arg2rel, wordindices) 242 | numSamplesTest = x1Test.shape[0] 243 | 244 | time2 = time.time() 245 | print "time for reading data: " + str(time2 - time1) 246 | 247 | dt = theano.config.floatX 248 | 249 | # split train into train and dev 250 | numSamplesTrain = int(0.8 * numSamples) 251 | # don't split same sentence id into train and dev 252 | numSamplesTrain = adaptNumSamplesTrain(numSamplesTrain, idTrain) 253 | print "samples for training: ", numSamplesTrain 254 | numSamplesDev = numSamples - numSamplesTrain 255 | print "samples for development: ", numSamplesDev 256 | numSamplesTotal = numSamplesTrain + numSamplesDev + numSamplesTest 257 | 258 | x1Dev = x1Train[numSamplesTrain:] 259 | x1Train = x1Train[:numSamplesTrain] 260 | x2Dev = x2Train[numSamplesTrain:] 261 | x2Train = x2Train[:numSamplesTrain] 262 | x3Dev = x3Train[numSamplesTrain:] 263 | x3Train = x3Train[:numSamplesTrain] 264 | x4Dev = x4Train[numSamplesTrain:] 265 | x4Train = x4Train[:numSamplesTrain] 266 | yDev = yTrain[numSamplesTrain:] 267 | yTrain = yTrain[:numSamplesTrain] 268 | yE1Dev = yE1Train[numSamplesTrain:] 269 | yE1Train = yE1Train[:numSamplesTrain] 270 | yE2Dev = yE2Train[numSamplesTrain:] 271 | yE2Train = yE2Train[:numSamplesTrain] 272 | e1Dev = e1Train[numSamplesTrain:] 273 | e1Train = e1Train[:numSamplesTrain] 274 | e2Dev = e2Train[numSamplesTrain:] 275 | e2Train = e2Train[:numSamplesTrain] 276 | idDev = idTrain[numSamplesTrain:] 277 | idTrain = idTrain[:numSamplesTrain] 278 | e1IdDev = e1IdTrain[numSamplesTrain:] 279 | e1IdTrain = e1IdTrain[:numSamplesTrain] 280 | e2IdDev = e2IdTrain[numSamplesTrain:] 281 | e2IdTrain = e2IdTrain[:numSamplesTrain] 282 | 283 | # store sentId2newIndex2OldIndex: 284 | fp = open(filename + "_indexMapping", 'wb') 285 | pickle.dump(sentId2newIndex2OldIndex, fp) 286 | fp.close() 287 | 288 | 289 | ################ FUEL ################# 290 | import h5py 291 | from fuel.datasets.hdf5 import H5PYDataset 292 | 293 | f = h5py.File(filename, mode='w') 294 | 295 | feat_x1 = f.create_dataset('x1', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 296 | feat_x2 = f.create_dataset('x2', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 297 | feat_x3 = f.create_dataset('x3', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 298 | feat_x4 = f.create_dataset('x4', (numSamplesTotal, contextsize), dtype = numpy.dtype(numpy.int32), compression='gzip') 299 | feat_e1 = f.create_dataset('e1', (numSamplesTotal, entitysize), dtype=numpy.dtype(numpy.int32), compression='gzip') 300 | feat_e2 = f.create_dataset('e2', (numSamplesTotal, entitysize), dtype=numpy.dtype(numpy.int32), compression='gzip') 301 | label_y = f.create_dataset('y', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 302 | label_y1ET = f.create_dataset('y1ET', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 303 | label_y2ET = f.create_dataset('y2ET', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 304 | sent_id = f.create_dataset('sent_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 305 | e1_id = f.create_dataset('e1_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 306 | e2_id = f.create_dataset('e2_id', (numSamplesTotal, 1), dtype=numpy.dtype(numpy.int32), compression='gzip') 307 | 
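# note: the rows below are written in the order [train | dev | test];
# the corresponding split boundaries are recorded in f.attrs['split'] further down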
308 | feat_x1[...] = numpy.vstack([x1Train, x1Dev, x1Test]).reshape(numSamplesTotal, contextsize) 309 | feat_x2[...] = numpy.vstack([x2Train, x2Dev, x2Test]).reshape(numSamplesTotal, contextsize) 310 | feat_x3[...] = numpy.vstack([x3Train, x3Dev, x3Test]).reshape(numSamplesTotal, contextsize) 311 | feat_x4[...] = numpy.vstack([x4Train, x4Dev, x4Test]).reshape(numSamplesTotal, contextsize) 312 | feat_e1[...] = numpy.vstack([e1Train, e1Dev, e1Test]).reshape(numSamplesTotal, entitysize) 313 | feat_e2[...] = numpy.vstack([e2Train, e2Dev, e2Test]).reshape(numSamplesTotal, entitysize) 314 | label_y[...] = numpy.vstack([yTrain.reshape(numSamplesTrain, 1), yDev.reshape(numSamplesDev, 1), yTest.reshape(numSamplesTest, 1)]) #.reshape(numSamplesTotal, 1) 315 | label_y1ET[...] = numpy.vstack([yE1Train.reshape(numSamplesTrain, 1), yE1Dev.reshape(numSamplesDev, 1), yE1Test.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 316 | label_y2ET[...] = numpy.vstack([yE2Train.reshape(numSamplesTrain, 1), yE2Dev.reshape(numSamplesDev, 1), yE2Test.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 317 | sent_id[...] = numpy.vstack([idTrain.reshape(numSamplesTrain, 1), idDev.reshape(numSamplesDev, 1), idTest.reshape(numSamplesTest, 1)]) #.reshape((numSamplesTotal, 1)) 318 | e1_id[...] = numpy.vstack([e1IdTrain.reshape(numSamplesTrain, 1), e1IdDev.reshape(numSamplesDev, 1), e1IdTest.reshape(numSamplesTest, 1)]) 319 | e2_id[...] = numpy.vstack([e2IdTrain.reshape(numSamplesTrain, 1), e2IdDev.reshape(numSamplesDev, 1), e2IdTest.reshape(numSamplesTest, 1)]) 320 | 321 | start_train = 0 322 | end_train = start_train + numSamplesTrain 323 | start_dev = end_train 324 | end_dev = start_dev + numSamplesDev 325 | start_test = end_dev 326 | end_test = start_test + numSamplesTest 327 | 328 | split_dict = {'train' : 329 | {'x1':(start_train,end_train), 'x2':(start_train,end_train), 330 | 'x3':(start_train,end_train), 'x4':(start_train,end_train), 331 | 'e1':(start_train,end_train), 'e2':(start_train,end_train), 332 | 'y':(start_train,end_train), 'y1ET': (start_train,end_train), 333 | 'y2ET':(start_train,end_train), 'sent_id':(start_train,end_train), 334 | 'e1_id':(start_train,end_train), 'e2_id':(start_train,end_train)}, 335 | 'dev' : 336 | {'x1':(start_dev,end_dev), 'x2':(start_dev,end_dev), 337 | 'x3':(start_dev,end_dev), 'x4':(start_dev,end_dev), 338 | 'e1':(start_dev,end_dev), 'e2': (start_dev,end_dev), 339 | 'y':(start_dev,end_dev), 'y1ET':(start_dev,end_dev), 340 | 'y2ET': (start_dev,end_dev), 'sent_id':(start_dev,end_dev), 341 | 'e1_id':(start_dev,end_dev), 'e2_id':(start_dev,end_dev)}, 342 | 'test' : 343 | {'x1':(start_test,end_test), 'x2':(start_test,end_test), 344 | 'x3':(start_test,end_test), 'x4':(start_test,end_test), 345 | 'e1':(start_test,end_test), 'e2': (start_test,end_test), 346 | 'y':(start_test,end_test), 'y1ET':(start_test,end_test), 347 | 'y2ET': (start_test,end_test), 'sent_id':(start_test,end_test), 348 | 'e1_id':(start_test,end_test), 'e2_id':(start_test,end_test)}} 349 | 350 | f.attrs['split'] = H5PYDataset.create_split_array(split_dict) 351 | 352 | f.flush() 353 | f.close() 354 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import numpy 4 | from utils import getF1, processPredictions, getMajorityPrediction, getRelaxedPredictionEntityType, getPredictionRelation, mergeREPredictionsWithOldIndices, 
mergeETPredictionsWithOldIndices 5 | 6 | def evaluateModel(datastream, epoch, doCRF, numClasses, numClassesET, getPredictions, getPredictionsR1, getPredictionsET1, getPredictionsET2): 7 | # validate on dev data 8 | allHyposRE = [] 9 | allRefsRE = [] 10 | allHyposET = [] 11 | allRefsET = [] 12 | curSentId = -1 13 | curSentence_entity2types = {} 14 | for d in datastream.get_epoch_iterator(as_dict = True): 15 | x1_numpy = d['x1'] 16 | x2_numpy = d['x2'] 17 | x3_numpy = d['x3'] 18 | x4_numpy = d['x4'] 19 | e1_numpy = d['e1'] 20 | e2_numpy = d['e2'] 21 | y_numpy = d['y'] 22 | y1ET_numpy = d['y1ET'] 23 | y2ET_numpy = d['y2ET'] 24 | sent_id_numpy = d['sent_id'] 25 | e1_id_numpy = d['e1_id'] 26 | e2_id_numpy = d['e2_id'] 27 | numSamples_numpy = numpy.ones_like(y_numpy) 28 | if doCRF: 29 | predictions, probs = getPredictions(x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, numSamples_numpy) 30 | predictions_rel = predictions[:,2::2] # cut off begin and end padding 31 | for b in range(predictions_rel.shape[0]): 32 | allHyposRE.append(predictions_rel[b][0]) 33 | allRefsRE.append(y_numpy[b][0]) 34 | predictions_et = predictions[:,1::2] - numClasses # cut off begin and end padding and account for vector concatenation with RE scores 35 | for b in range(predictions.shape[0]): 36 | if curSentId == -1: 37 | curSentId = sent_id_numpy[b][0] 38 | if sent_id_numpy[b][0] == curSentId: 39 | pass # only append below 40 | else: 41 | for ent in curSentence_entity2types: 42 | majorityPrediction = getMajorityPrediction(curSentence_entity2types[ent]) 43 | allHyposET.append(majorityPrediction[0]) 44 | allRefsET.append(majorityPrediction[1]) 45 | curSentence_entity2types = {} 46 | curSentId = sent_id_numpy[b][0] 47 | key1 = e1_id_numpy[b][0] 48 | key2 = e2_id_numpy[b][0] 49 | if not key1 in curSentence_entity2types: 50 | curSentence_entity2types[key1] = [] 51 | if not key2 in curSentence_entity2types: 52 | curSentence_entity2types[key2] = [] 53 | curSentence_entity2types[key1].append((predictions_et[b][0],y1ET_numpy[b][0])) 54 | curSentence_entity2types[key2].append((predictions_et[b][1],y2ET_numpy[b][0])) 55 | else: 56 | predictionsR1, probsR1 = getPredictionsR1(x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, numSamples_numpy) 57 | curBatchPredictionsR1 = processPredictions(predictionsR1, probsR1) 58 | allHyposRE.extend(curBatchPredictionsR1) 59 | allRefsRE.extend(y_numpy.flatten().tolist()) 60 | predictionsET1, probsET1 = getPredictionsET1(x1_numpy, x2_numpy, e1_numpy, numSamples_numpy) 61 | curBatchPredictionsET1 = processPredictions(predictionsET1, probsET1) 62 | predictionsET2, probsET2 = getPredictionsET2(x3_numpy, x4_numpy, e2_numpy, numSamples_numpy) 63 | curBatchPredictionsET2 = processPredictions(predictionsET2, probsET2) 64 | for b in range(len(curBatchPredictionsET1)): 65 | if curSentId == -1: 66 | curSentId = sent_id_numpy[b][0] 67 | if sent_id_numpy[b][0] == curSentId: 68 | pass # only append below 69 | else: 70 | for ent in curSentence_entity2types: 71 | majorityPrediction = getMajorityPrediction(curSentence_entity2types[ent]) 72 | allHyposET.append(majorityPrediction[0]) 73 | allRefsET.append(majorityPrediction[1]) 74 | curSentence_entity2types = {} 75 | curSentId = sent_id_numpy[b][0] 76 | key1 = e1_id_numpy[b][0] 77 | key2 = e2_id_numpy[b][0] 78 | if not key1 in curSentence_entity2types: 79 | curSentence_entity2types[key1] = [] 80 | if not key2 in curSentence_entity2types: 81 | curSentence_entity2types[key2] = [] 82 | 
curSentence_entity2types[key1].append((curBatchPredictionsET1[b],y1ET_numpy[b][0])) 83 | curSentence_entity2types[key2].append((curBatchPredictionsET2[b],y2ET_numpy[b][0])) 84 | # also include predictions from last sentence 85 | for ent in curSentence_entity2types: 86 | majorityPrediction = getMajorityPrediction(curSentence_entity2types[ent]) 87 | allHyposET.append(majorityPrediction[0]) 88 | allRefsET.append(majorityPrediction[1]) 89 | print "Validation after epoch " + str(epoch) + ":" 90 | f1_rel = getF1(allHyposRE, allRefsRE, numClasses, name="RE") 91 | f1_et = getF1(allHyposET, allRefsET, numClassesET, name = "ET") 92 | f1 = 0.5 * (f1_rel + f1_et) 93 | return f1 94 | 95 | def evaluateModel_setup3(datastream, epoch, doCRF, relationEvaluationMethod, sentId2newIndex2oldIndex, numClasses, numClassesET, getPredictions, getPredictionsR1, getPredictionsET1, getPredictionsET2): 96 | # validate on dev data 97 | allHyposRE = [] 98 | allRefsRE = [] 99 | allHyposET = [] 100 | allRefsET = [] 101 | curSentIdET = -1 102 | curSentIdRE = -1 103 | curSentence_entity2types = {} 104 | curSentence_entityPair2relations = {} 105 | for d in datastream.get_epoch_iterator(as_dict = True): 106 | x1_numpy = d['x1'] 107 | x2_numpy = d['x2'] 108 | x3_numpy = d['x3'] 109 | x4_numpy = d['x4'] 110 | e1_numpy = d['e1'] 111 | e2_numpy = d['e2'] 112 | y_numpy = d['y'] 113 | y1ET_numpy = d['y1ET'] 114 | y2ET_numpy = d['y2ET'] 115 | sent_id_numpy = d['sent_id'] 116 | e1_id_numpy = d['e1_id'] 117 | e2_id_numpy = d['e2_id'] 118 | numSamples_numpy = numpy.ones_like(y_numpy) 119 | if doCRF: 120 | predictions, probs = getPredictions(x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, numSamples_numpy) 121 | predictions_rel = predictions[:,2::2] # cut off begin and end padding 122 | predictions_et = predictions[:,1::2] - numClasses # cut off begin and end padding and account for vector concatenation with RE scores 123 | else: 124 | predictionsR1, probsR1 = getPredictionsR1(x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, numSamples_numpy) 125 | predictions_rel = processPredictions(predictionsR1, probsR1) 126 | predictions_rel = numpy.array(predictions_rel) 127 | predictions_rel = predictions_rel.reshape(predictions_rel.shape[0], 1) 128 | predictionsET1, probsET1 = getPredictionsET1(x1_numpy, x2_numpy, e1_numpy, numSamples_numpy) 129 | curBatchPredictionsET1 = processPredictions(predictionsET1, probsET1) 130 | curBatchPredictionsET1 = numpy.array(curBatchPredictionsET1) 131 | predictionsET2, probsET2 = getPredictionsET2(x3_numpy, x4_numpy, e2_numpy, numSamples_numpy) 132 | curBatchPredictionsET2 = processPredictions(predictionsET2, probsET2) 133 | curBatchPredictionsET2 = numpy.array(curBatchPredictionsET2) 134 | predictions_et_list = [] 135 | for b in range(len(curBatchPredictionsET1)): 136 | predictions_et_list.append([curBatchPredictionsET1[b], curBatchPredictionsET2[b]]) 137 | predictions_et = numpy.array(predictions_et_list) 138 | 139 | # process relation predictions 140 | for b in range(predictions_rel.shape[0]): 141 | if curSentIdRE == -1: 142 | curSentIdRE = sent_id_numpy[b][0] 143 | if sent_id_numpy[b][0] == curSentIdRE: 144 | pass # only append below 145 | else: 146 | newIndex2oldIndex = sentId2newIndex2oldIndex[curSentIdRE] 147 | curSentence_pair2predictions, curSentence_pair2refs = mergeREPredictionsWithOldIndices(curSentence_entityPair2relations, newIndex2oldIndex) 148 | for ent1, ent2 in curSentence_pair2predictions: 149 | bestHypo, curRef = 
getPredictionRelation(curSentence_pair2predictions[(ent1,ent2)], curSentence_pair2refs[(ent1,ent2)], relationEvaluationMethod) 150 | allHyposRE.append(bestHypo) 151 | allRefsRE.append(curRef) 152 | curSentence_entityPair2relations = {} 153 | curSentIdRE = sent_id_numpy[b][0] 154 | key1 = e1_id_numpy[b][0] 155 | key2 = e2_id_numpy[b][0] 156 | if not (key1, key2) in curSentence_entityPair2relations: 157 | curSentence_entityPair2relations[(key1, key2)] = [] 158 | curSentence_entityPair2relations[(key1, key2)].append((predictions_rel[b][0], y_numpy[b][0])) 159 | 160 | # process entity type predictions 161 | for b in range(predictions_et.shape[0]): 162 | if curSentIdET == -1: 163 | curSentIdET = sent_id_numpy[b][0] 164 | if sent_id_numpy[b][0] == curSentIdET: 165 | pass # only append below 166 | else: 167 | newIndex2oldIndex = sentId2newIndex2oldIndex[curSentIdET] 168 | curSentence_ent2majorityPredictions, curSentence_ent2refs = mergeETPredictionsWithOldIndices(curSentence_entity2types, newIndex2oldIndex) 169 | # entity is correct if at least one of its component is correct 170 | for ent in curSentence_ent2majorityPredictions: 171 | bestHypo, curRef = getRelaxedPredictionEntityType(curSentence_ent2majorityPredictions[ent], curSentence_ent2refs[ent]) 172 | allHyposET.append(bestHypo) 173 | allRefsET.append(curRef) 174 | curSentence_entity2types = {} 175 | curSentIdET = sent_id_numpy[b][0] 176 | key1 = e1_id_numpy[b][0] 177 | key2 = e2_id_numpy[b][0] 178 | if not key1 in curSentence_entity2types: 179 | curSentence_entity2types[key1] = [] 180 | if not key2 in curSentence_entity2types: 181 | curSentence_entity2types[key2] = [] 182 | curSentence_entity2types[key1].append((predictions_et[b][0],y1ET_numpy[b][0])) 183 | curSentence_entity2types[key2].append((predictions_et[b][1],y2ET_numpy[b][0])) 184 | 185 | # also include predictions from last sentence 186 | newIndex2oldIndex = sentId2newIndex2oldIndex[curSentIdRE] 187 | # RE prediction 188 | curSentence_pair2predictions, curSentence_pair2refs = mergeREPredictionsWithOldIndices(curSentence_entityPair2relations, newIndex2oldIndex) 189 | for ent1, ent2 in curSentence_pair2predictions: 190 | bestHypo, curRef = getPredictionRelation(curSentence_pair2predictions[(ent1,ent2)], curSentence_pair2refs[(ent1,ent2)], relationEvaluationMethod) 191 | allHyposRE.append(bestHypo) 192 | allRefsRE.append(curRef) 193 | # ET prediction 194 | newIndex2oldIndex = sentId2newIndex2oldIndex[curSentIdET] 195 | curSentence_ent2majorityPredictions, curSentence_ent2refs = mergeETPredictionsWithOldIndices(curSentence_entity2types, newIndex2oldIndex) 196 | # entity is correct if at least one of its component is correct 197 | for ent in curSentence_ent2majorityPredictions: 198 | bestHypo, curRef = getRelaxedPredictionEntityType(curSentence_ent2majorityPredictions[ent], curSentence_ent2refs[ent]) 199 | allHyposET.append(bestHypo) 200 | allRefsET.append(curRef) 201 | 202 | # get F1 score 203 | print "Validation after epoch " + str(epoch) + ":" 204 | f1_rel = getF1(allHyposRE, allRefsRE, numClasses, name="RE") 205 | f1_et = getF1(allHyposET, allRefsET, numClassesET, name = "ET") 206 | f1 = 0.5 * (f1_rel + f1_et) 207 | return f1 208 | 209 | -------------------------------------------------------------------------------- /iterationSchemes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from fuel.schemes import BatchScheme 4 | from picklable_itertools import iter_, imap 5 | from picklable_itertools.extras import 
partition_all 6 | import numpy 7 | 8 | class ShuffledExampleSchemeBatch(BatchScheme): 9 | def __init__(self, examples, batch_size, seed = 987654): 10 | super(ShuffledExampleSchemeBatch, self).__init__(examples, batch_size) 11 | self.batch_size = batch_size 12 | numpy.random.seed(seed) 13 | 14 | def get_request_iterator(self): 15 | indices = list(self.indices) 16 | # shuffle indices 17 | indicesShuffled = [] 18 | permutation = numpy.random.permutation(len(indices)) 19 | return imap(list, partition_all(self.batch_size, permutation)) 20 | 21 | -------------------------------------------------------------------------------- /layers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import numpy 4 | import theano 5 | import theano.tensor as T 6 | from theano.tensor.nnet import conv 7 | import theano.sandbox.neighbours as TSN 8 | 9 | class HiddenLayer(object): 10 | def __init__(self, rng, n_in, n_out, W=None, b=None, 11 | activation=T.tanh, name=""): 12 | """ 13 | Typical hidden layer of a MLP: units are fully-connected and have 14 | sigmoidal activation function. Weight matrix W is of shape (n_in,n_out) 15 | and the bias vector b is of shape (n_out,). 16 | 17 | NOTE : The nonlinearity used here is tanh 18 | 19 | Hidden unit activation is given by: tanh(dot(input,W) + b) 20 | 21 | :type rng: numpy.random.RandomState 22 | :param rng: a random number generator used to initialize weights 23 | 24 | :type input: theano.tensor.dmatrix 25 | :param input: a symbolic tensor of shape (n_examples, n_in) 26 | 27 | :type n_in: int 28 | :param n_in: dimensionality of input 29 | 30 | :type n_out: int 31 | :param n_out: number of hidden units 32 | 33 | :type activation: theano.Op or function 34 | :param activation: Non linearity to be applied in the hidden 35 | layer 36 | """ 37 | 38 | self.activation = activation 39 | 40 | if name != "": 41 | prefix = name 42 | else: 43 | prefix = "mlp_" 44 | 45 | if W is None: 46 | W_values = numpy.asarray(rng.uniform( 47 | low=-numpy.sqrt(6. / (n_in + n_out)), 48 | high=numpy.sqrt(6. 
/ (n_in + n_out)), 49 | size=(n_in, n_out)), dtype=theano.config.floatX) 50 | if activation == theano.tensor.nnet.sigmoid: 51 | W_values *= 4 52 | 53 | W = theano.shared(value=W_values, name=prefix+'W', borrow=True) 54 | 55 | if b is None: 56 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 57 | b = theano.shared(value=b_values, name=prefix+'b', borrow=True) 58 | 59 | self.W = W 60 | self.b = b 61 | 62 | # parameters of the model 63 | self.params = [self.W, self.b] 64 | 65 | def getOutput(self, input): 66 | lin_output = T.dot(input, self.W) + self.b 67 | output = (lin_output if self.activation is None 68 | else self.activation(lin_output)) 69 | return output 70 | 71 | 72 | ######################################################################################### 73 | 74 | class LogisticRegression(object): 75 | 76 | def __init__(self, n_in, n_out, W = None, b = None, rng = None, randomInit = False): 77 | """ Initialize the parameters of the logistic regression 78 | 79 | :type input: theano.tensor.TensorType 80 | :param input: symbolic variable that describes the input of the 81 | architecture (one minibatch) 82 | 83 | :type n_in: int 84 | :param n_in: number of input units, the dimension of the space in 85 | which the datapoints lie 86 | 87 | :type n_out: int 88 | :param n_out: number of output units, the dimension of the space in 89 | which the labels lie 90 | 91 | """ 92 | 93 | self.numClasses = n_out 94 | 95 | if W == None: 96 | if randomInit: 97 | name = 'softmax_random_W' 98 | fan_in = n_in 99 | fan_out = n_out 100 | W_bound = numpy.sqrt(6. / (fan_in + fan_out)) 101 | self.W = theano.shared(value=numpy.asarray( 102 | rng.uniform(low=-W_bound, high=W_bound, size=(n_in, n_out)), 103 | dtype=theano.config.floatX), 104 | name=name, borrow=True) 105 | else: 106 | # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 107 | self.W = theano.shared(value=numpy.zeros((n_in, n_out), 108 | dtype=theano.config.floatX), 109 | name='softmax_W', borrow=True) 110 | else: 111 | self.W = W 112 | 113 | self.params = [self.W] 114 | 115 | if b == None: 116 | # initialize the baises b as a vector of n_out 0s 117 | self.b = theano.shared(value=numpy.zeros((n_out,), 118 | dtype=theano.config.floatX), 119 | name='softmax_b', borrow=True) 120 | else: 121 | self.b = b 122 | self.params.append(self.b) 123 | 124 | def getMask(self, batchsize, maxSamplesInBag, samplesInBags): 125 | # mask entries outside of bags 126 | mask = T.zeros((batchsize, maxSamplesInBag)) 127 | maskAcc, _ = theano.scan(fn = lambda b, m: T.set_subtensor(m[b,:samplesInBags[b,0]], 1), 128 | outputs_info=mask, sequences=T.arange(batchsize)) 129 | mask = maskAcc[-1] 130 | mask2 = mask.repeat(self.numClasses, axis = 1).reshape((batchsize, maxSamplesInBag, self.numClasses)) 131 | return mask2 132 | 133 | def nll_mi(self, x, y, samplesInBags, batchsize): 134 | self.p_y_given_x = T.nnet.softmax(T.dot(x, self.W) + self.b) 135 | maxSamplesInBag = self.p_y_given_x.shape[0] / batchsize 136 | self.p_y_given_x = self.p_y_given_x.reshape((batchsize, maxSamplesInBag, self.p_y_given_x.shape[1])) 137 | mask = self.getMask(batchsize, maxSamplesInBag, samplesInBags) 138 | 139 | self.p_y_given_x_masked = self.p_y_given_x * T.cast(mask, theano.config.floatX) 140 | maxpredvec = T.max(self.p_y_given_x_masked, axis = 1) 141 | batch_cost_log = T.log(maxpredvec)[T.arange(y.shape[0]), y] 142 | 143 | numberOfValidExamples = T.sum(T.cast(mask[:,:,0], theano.config.floatX)) 144 | return -T.sum(batch_cost_log) / numberOfValidExamples 145 | 146 | 
def getCostMI(self, x, y, samplesInBags, batchsize, rankingParam=2, m_minus = 0.5, m_plus = 2.5): 147 | return self.nll_mi(x, y, samplesInBags, batchsize) 148 | 149 | def getScores(self, x, samplesInBags, batchsize): 150 | return self.getScores_softmax(x, samplesInBags, batchsize) 151 | 152 | def getOutput(self, x, samplesInBags, batchsize): 153 | return self.getOutput_softmax(x, samplesInBags, batchsize) 154 | 155 | def getScores_softmax(self, x, samplesInBags, batchsize): 156 | predictions = T.dot(x, self.W) + self.b 157 | maxSamplesInBag = predictions.shape[0] / batchsize 158 | predictions = predictions.reshape((batchsize, maxSamplesInBag, predictions.shape[1])) 159 | mask = self.getMask(batchsize, maxSamplesInBag, samplesInBags) 160 | predictions_masked = predictions * T.cast(mask, theano.config.floatX) 161 | maxpredvec = T.max(predictions_masked, axis = 1) 162 | return maxpredvec 163 | 164 | def getOutput_softmax(self, x, samplesInBags, batchsize): 165 | self.p_y_given_x = T.nnet.softmax(T.dot(x, self.W) + self.b) 166 | maxSamplesInBag = self.p_y_given_x.shape[0] / batchsize 167 | self.p_y_given_x = self.p_y_given_x.reshape((batchsize, maxSamplesInBag, self.p_y_given_x.shape[1])) 168 | mask = self.getMask(batchsize, maxSamplesInBag, samplesInBags) 169 | self.p_y_given_x_masked = self.p_y_given_x * T.cast(mask, theano.config.floatX) 170 | argmaxpredvec = T.argmax(self.p_y_given_x_masked, axis = 2) 171 | maxpredvec = T.max(self.p_y_given_x_masked, axis = 2) 172 | return [argmaxpredvec, maxpredvec] 173 | 174 | 175 | #################################################################################### 176 | 177 | class LeNetConvPoolLayer(object): 178 | """Pool Layer of a convolutional network """ 179 | 180 | def preparePooling(self, conv_out): 181 | neighborsForPooling = TSN.images2neibs(ten4=conv_out, neib_shape=(1,conv_out.shape[3]), mode='ignore_borders') 182 | self.neighbors = neighborsForPooling 183 | neighborsArgSorted = T.argsort(neighborsForPooling, axis=1) 184 | neighborsArgSorted = neighborsArgSorted 185 | return neighborsForPooling, neighborsArgSorted 186 | 187 | def kmaxPooling(self, conv_out, k): 188 | neighborsForPooling, neighborsArgSorted = self.preparePooling(conv_out) 189 | kNeighborsArg = neighborsArgSorted[:,-k:] 190 | self.neigborsSorted = kNeighborsArg 191 | kNeighborsArgSorted = T.sort(kNeighborsArg, axis=1) 192 | ii = T.repeat(T.arange(neighborsForPooling.shape[0]), k) 193 | jj = kNeighborsArgSorted.flatten() 194 | self.ii = ii 195 | self.jj = jj 196 | pooledkmaxTmp = neighborsForPooling[ii, jj] 197 | 198 | self.pooled = pooledkmaxTmp 199 | 200 | # reshape pooled_out 201 | new_shape = T.cast(T.join(0, conv_out.shape[:-2], 202 | T.as_tensor([conv_out.shape[2]]), 203 | T.as_tensor([k])), 204 | 'int64') 205 | pooledkmax = T.reshape(pooledkmaxTmp, new_shape, ndim=4) 206 | return pooledkmax 207 | 208 | def convStep(self, curInput, curFilter): 209 | return conv.conv2d(input=curInput, filters=curFilter, 210 | filter_shape=self.filter_shape, 211 | image_shape=None) 212 | 213 | def __init__(self, rng, filter_shape, image_shape = None, W = None, b = None, poolsize=(2, 2)): 214 | """ 215 | Allocate a LeNetConvPoolLayer with shared variable internal parameters. 
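
The kmaxPooling method above selects, for every feature map, the k highest convolution activations while keeping them in their original left-to-right order (argsort to find the top k positions, then sort those positions back into sequence order). A minimal NumPy sketch of the same idea, assuming a 2-D matrix with one row per feature map; the function name kmax_pooling_rows is ours and not part of this repository:

```
import numpy

def kmax_pooling_rows(mat, k):
    # illustrative sketch, not repository code
    # indices of the k largest values per row (ordered by value)
    top_k = numpy.argsort(mat, axis=1)[:, -k:]
    # re-sort those indices so the kept values stay in their original order
    top_k = numpy.sort(top_k, axis=1)
    rows = numpy.repeat(numpy.arange(mat.shape[0]), k)
    cols = top_k.flatten()
    return mat[rows, cols].reshape(mat.shape[0], k)

example = numpy.array([[1.0, 5.0, 3.0, 2.0, 4.0]])
print(kmax_pooling_rows(example, 3))  # -> [[5. 3. 4.]]
```

The Theano code above does the same thing, batched over the 4-D convolution output, via images2neibs, T.argsort, T.sort and fancy indexing with repeated row indices.
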
216 | 217 | :type rng: numpy.random.RandomState 218 | :param rng: a random number generator used to initialize weights 219 | 220 | :type W: theano.matrix 221 | :param W: the weight matrix used for convolution 222 | 223 | :type b: theano vector 224 | :param b: the bias used for convolution 225 | 226 | :type input: theano.tensor.dtensor4 227 | :param input: symbolic image tensor, of shape image_shape 228 | 229 | :type filter_shape: tuple or list of length 4 230 | :param filter_shape: (number of filters, num input feature maps, 231 | filter height,filter width) 232 | 233 | :type image_shape: tuple or list of length 4 234 | :param image_shape: (batch size, num input feature maps, 235 | image height, image width) 236 | 237 | :type poolsize: tuple or list of length 2 238 | :param poolsize: the downsampling (pooling) factor (#rows,#cols) 239 | """ 240 | 241 | self.filter_shape = filter_shape 242 | self.poolsize = poolsize 243 | 244 | if W == None: 245 | fan_in = numpy.prod(self.filter_shape[1:]) 246 | fan_out = (self.filter_shape[0] * numpy.prod(self.filter_shape[2:]) / 247 | numpy.prod(self.poolsize)) 248 | 249 | W_bound = numpy.sqrt(6. / (fan_in + fan_out)) 250 | # the convolution weight matrix 251 | self.W = theano.shared(numpy.asarray( 252 | rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 253 | dtype=theano.config.floatX), name='conv_W', 254 | borrow=True) 255 | else: 256 | self.W = W 257 | 258 | if b == None: 259 | # the bias is a 1D tensor -- one bias per output feature map 260 | b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) 261 | self.b = theano.shared(value=b_values, name='conv_b', borrow=True) 262 | else: 263 | self.b = b 264 | 265 | # store parameters of this layer 266 | self.params = [self.W, self.b] 267 | 268 | def getOutput(self, input): 269 | 270 | # convolve input feature maps with filters 271 | conv_out = self.convStep(input, self.W) 272 | 273 | #self.conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 274 | 275 | k = self.poolsize[1] 276 | self.pooledkmax = self.kmaxPooling(conv_out, k) 277 | 278 | # add the bias term. Since the bias is a vector (1D array), we first 279 | # reshape it to a tensor of shape (1,n_filters,1,1). Each bias will 280 | # thus be broadcasted across mini-batches and feature map 281 | # width & height 282 | output = T.tanh(self.pooledkmax + self.b.dimshuffle('x', 0, 'x', 'x')) 283 | return output 284 | 285 | ################################################################################### 286 | 287 | class CRF: 288 | # Code from https://github.com/glample/tagger/blob/master/model.py 289 | # but extended to support mini-batches 290 | 291 | def log_sum_exp(self, x, axis=None): 292 | """ 293 | Sum probabilities in the log-space. 
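
To spell the trick out: log_sum_exp computes log(sum_i exp(x_i)) without overflow by subtracting the maximum before exponentiating. The following standalone NumPy version of the same formula is our sketch rather than repository code:

```
import numpy

def log_sum_exp(x, axis=None):
    # illustrative sketch: log(sum(exp(x))) = max(x) + log(sum(exp(x - max(x))))
    xmax = x.max(axis=axis, keepdims=True)
    return x.max(axis=axis) + numpy.log(numpy.exp(x - xmax).sum(axis=axis))

scores = numpy.array([1000.0, 1001.0, 1002.0])
print(log_sum_exp(scores))   # ~1002.41
# the naive numpy.log(numpy.exp(scores).sum()) overflows to inf here
```

The CRF layer relies on this both inside the forward recurrence and for the final partition function over all label paths.
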
294 | """ 295 | xmax = x.max(axis=axis, keepdims=True) 296 | xmax_ = x.max(axis=axis) 297 | return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis)) 298 | 299 | def recurrence(self, obs, previous): 300 | previous = previous.dimshuffle(0, 1, 'x') 301 | obs = obs.dimshuffle(0, 'x', 1) 302 | return self.log_sum_exp(previous + obs + self.transitions.dimshuffle('x', 0, 1), axis=1) 303 | 304 | def recurrence_viterbi(self, obs, previous): 305 | previous = previous.dimshuffle(0, 1, 'x') 306 | obs = obs.dimshuffle(0, 'x', 1) 307 | scores = previous + obs + self.transitions.dimshuffle('x', 0, 1) 308 | out = scores.max(axis=1) 309 | return out 310 | 311 | def recurrence_viterbi_returnBest(self, obs, previous): 312 | previous = previous.dimshuffle(0, 1, 'x') 313 | obs = obs.dimshuffle(0, 'x', 1) 314 | scores = previous + obs + self.transitions.dimshuffle('x', 0, 1) 315 | out = scores.max(axis=1) 316 | out2 = scores.argmax(axis=1) 317 | return out, out2 318 | 319 | def forward(self, observations, viterbi=False, return_alpha=False, return_best_sequence=False): 320 | """ 321 | Takes as input: 322 | - observations, sequence of shape (batch_size, n_steps, n_classes) 323 | Probabilities must be given in the log space. 324 | Compute alpha, matrix of size (batch_size, n_steps, n_classes), such that 325 | alpha[:, i, j] represents one of these 2 values: 326 | - the probability that the real path at node i ends in j 327 | - the maximum probability of a path finishing in j at node i (Viterbi) 328 | Returns one of these 2 values: 329 | - alpha 330 | - the final probability, which can be: 331 | - the sum of the probabilities of all paths 332 | - the probability of the best path (Viterbi) 333 | """ 334 | assert not return_best_sequence or (viterbi and not return_alpha) 335 | 336 | def recurrence_bestSequence(b): 337 | sequence_b, _ = theano.scan( 338 | fn=lambda beta_i, previous: beta_i[previous], 339 | outputs_info=T.cast(T.argmax(alpha[0][b][-1]), 'int32'), 340 | sequences=T.cast(alpha[1][b,::-1], 'int32') 341 | ) 342 | return sequence_b 343 | 344 | initial = observations[:,0] 345 | 346 | if viterbi: 347 | if return_best_sequence: 348 | alpha, _ = theano.scan( 349 | fn=self.recurrence_viterbi_returnBest, 350 | outputs_info=(initial, None), 351 | sequences=[observations[:,1:].dimshuffle(1,0,2)] # shuffle to get a sequence over time, not over batches 352 | ) 353 | alpha[0] = alpha[0].dimshuffle(1,0,2) # shuffle back 354 | alpha[1] = alpha[1].dimshuffle(1,0,2) 355 | else: 356 | alpha, _ = theano.scan( 357 | fn=self.recurrence_viterbi, 358 | outputs_info=initial, 359 | sequences=[observations[:,1:].dimshuffle(1,0,2)] # shuffle to get a sequence over time, not over batches 360 | ) 361 | alpha = alpha.dimshuffle(1,0,2) # shuffle back 362 | else: 363 | alpha, _ = theano.scan( 364 | fn=self.recurrence, 365 | outputs_info=initial, 366 | sequences=[observations[:,1:].dimshuffle(1,0,2)] # shuffle to get a sequence over time, not over batches 367 | ) 368 | alpha = alpha.dimshuffle(1,0,2) # shuffle back 369 | 370 | if return_alpha: 371 | return alpha 372 | elif return_best_sequence: 373 | batchsizeVar = alpha[0].shape[0] 374 | sequence, _ = theano.scan( 375 | fn=recurrence_bestSequence, 376 | outputs_info = None, 377 | sequences=T.arange(batchsizeVar) 378 | ) 379 | sequence = T.concatenate([sequence[:,::-1], T.argmax(alpha[0][:,-1], axis = 1).reshape((batchsizeVar, 1))], axis = 1) 380 | return sequence, alpha[0] 381 | else: 382 | if viterbi: 383 | return alpha[:,-1,:].max(axis=1) 384 | else: 385 | return 
self.log_sum_exp(alpha[:,-1,:], axis=1) 386 | 387 | def __init__(self, numClasses, rng, batchsizeVar, sequenceLength = 3): 388 | self.numClasses = numClasses 389 | 390 | shape_transitions = (numClasses + 2, numClasses + 2) # +2 because of start id and end id 391 | drange = numpy.sqrt(6.0 / numpy.sum(shape_transitions)) 392 | self.transitions = theano.shared(value = numpy.asarray(rng.uniform(low = -drange, high = drange, size = shape_transitions), dtype = theano.config.floatX), name = 'transitions') 393 | 394 | self.small = -1000 # log for very small probability 395 | b_s = numpy.array([[self.small] * numClasses + [0, self.small]]).astype(theano.config.floatX) 396 | e_s = numpy.array([[self.small] * numClasses + [self.small, 0]]).astype(theano.config.floatX) 397 | self.b_s_theano = theano.shared(value = b_s).dimshuffle('x', 0, 1) 398 | self.e_s_theano = theano.shared(value = e_s).dimshuffle('x', 0, 1) 399 | 400 | self.b_s_theano = self.b_s_theano.repeat(batchsizeVar, axis = 0) 401 | self.e_s_theano = self.e_s_theano.repeat(batchsizeVar, axis = 0) 402 | 403 | self.s_len = sequenceLength 404 | 405 | self.debug1 = self.e_s_theano 406 | 407 | self.params = [self.transitions] 408 | 409 | def getObservations(self, scores): 410 | batchsizeVar = scores.shape[0] 411 | observations = T.concatenate([scores, self.small * T.cast(T.ones((batchsizeVar, self.s_len, 2)), theano.config.floatX)], axis = 2) 412 | observations = T.concatenate([self.b_s_theano, observations, self.e_s_theano], axis = 1) 413 | return observations 414 | 415 | def getPrediction(self, scores): 416 | observations = self.getObservations(scores) 417 | prediction = self.forward(observations, viterbi=True, return_best_sequence=True) 418 | return prediction 419 | 420 | def getCost(self, scores, y_conc): 421 | batchsizeVar = scores.shape[0] 422 | observations = self.getObservations(scores) 423 | 424 | # score from classes 425 | scores_flattened = scores.reshape((scores.shape[0] * scores.shape[1], scores.shape[2])) 426 | y_flattened = y_conc.flatten(1) 427 | 428 | real_path_score = scores_flattened[T.arange(batchsizeVar * self.s_len), y_flattened] 429 | real_path_score = real_path_score.reshape((batchsizeVar, self.s_len)).sum(axis = 1) 430 | 431 | # score from transitions 432 | b_id = theano.shared(value=numpy.array([self.numClasses], dtype=numpy.int32)) # id for begin 433 | e_id = theano.shared(value=numpy.array([self.numClasses + 1], dtype=numpy.int32)) # id for end 434 | b_id = b_id.dimshuffle('x', 0).repeat(batchsizeVar, axis = 0) 435 | e_id = e_id.dimshuffle('x', 0).repeat(batchsizeVar, axis = 0) 436 | 437 | padded_tags_ids = T.concatenate([b_id, y_conc, e_id], axis=1) 438 | 439 | real_path_score2, _ = theano.scan(fn = lambda m: self.transitions[padded_tags_ids[m,T.arange(self.s_len+1)], padded_tags_ids[m,T.arange(self.s_len + 1) + 1]].sum(), sequences = T.arange(batchsizeVar), outputs_info = None) 440 | 441 | real_path_score += real_path_score2 442 | all_paths_scores = self.forward(observations) 443 | self.debug1 = real_path_score 444 | cost = - T.mean(real_path_score - all_paths_scores) 445 | return cost 446 | 447 | def getCostAddLogWeights(self, scores, y_conc): 448 | batchsizeVar = scores.shape[0] 449 | observations = self.getObservations(scores) 450 | 451 | # score from classes 452 | scores_flattened = scores.reshape((scores.shape[0] * scores.shape[1], scores.shape[2])) 453 | y_flattened = y_conc.flatten(1) 454 | 455 | real_path_score = scores_flattened[T.arange(batchsizeVar * self.s_len), y_flattened] 456 | real_path_score = 
real_path_score.reshape((batchsizeVar, self.s_len)).sum(axis = 1) 457 | 458 | # score from transitions 459 | b_id = theano.shared(value=numpy.array([self.numClasses], dtype=numpy.int32)) # id for begin 460 | e_id = theano.shared(value=numpy.array([self.numClasses + 1], dtype=numpy.int32)) # id for end 461 | b_id = b_id.dimshuffle('x', 0).repeat(batchsizeVar, axis = 0) 462 | e_id = e_id.dimshuffle('x', 0).repeat(batchsizeVar, axis = 0) 463 | 464 | padded_tags_ids = T.concatenate([b_id, y_conc, e_id], axis=1) 465 | 466 | real_path_score2, _ = theano.scan(fn = lambda m: self.transitions[padded_tags_ids[m,T.arange(self.s_len+1)], padded_tags_ids[m,T.arange(self.s_len + 1) + 1]].sum(), sequences = T.arange(batchsizeVar), outputs_info = None) 467 | 468 | real_path_score += real_path_score2 469 | all_paths_scores = self.forward(observations) 470 | self.debug1 = real_path_score 471 | cost = - T.mean(real_path_score - all_paths_scores) 472 | return cost 473 | 474 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import time 5 | from collections import defaultdict, OrderedDict 6 | import numpy 7 | from utils import readConfig, readWordvectorsNumpy 8 | from evaluation import evaluateModel 9 | import random 10 | import theano 11 | import theano.tensor as T 12 | import cPickle 13 | from layers import LeNetConvPoolLayer, HiddenLayer, LogisticRegression, CRF 14 | from fuel.datasets.hdf5 import H5PYDataset 15 | from fuel.streams import DataStream 16 | from fuel.schemes import SequentialScheme, ShuffledExampleScheme 17 | from iterationSchemes import ShuffledExampleSchemeBatch 18 | 19 | def sgd_updates(params, cost, learning_rate, sqrt_norm_lim = 3): 20 | updates = [] 21 | for param in params: 22 | gp = T.grad(cost, param) 23 | step = -1.0 * learning_rate * gp 24 | stepped_param = param + step 25 | if (param.get_value(borrow=True).ndim == 2) and (param.name!='Words'): 26 | col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) 27 | desired_norms = T.clip(col_norms, 0, sqrt_norm_lim) 28 | scale = desired_norms / (1e-7 + col_norms) 29 | updates.append((param, stepped_param * scale)) 30 | else: 31 | updates.append((param, stepped_param)) 32 | return updates 33 | 34 | 35 | if len(sys.argv) != 2: 36 | print "please pass the config file as parameters" 37 | exit(0) 38 | 39 | time1 = time.time() 40 | 41 | configfile = sys.argv[1] 42 | config = readConfig(configfile) 43 | 44 | print "config:" 45 | for c in config: 46 | print str(c) + "\t" + str(config[c]) 47 | 48 | datafile = config["file"] 49 | iterationSeed = -1 50 | if "iterationSeed" in config: 51 | iterationSeed = int(config["iterationSeed"]) 52 | print "using " + str(iterationSeed) + " as seed for iteration scheme" 53 | pretrainedEmbeddings = False 54 | if "wordvectors" in config: 55 | wordvectorfile = config["wordvectors"] 56 | wordvectors, representationsize, words = readWordvectorsNumpy(wordvectorfile, isWord2vec = True) 57 | vocabsize = wordvectors.shape[0] 58 | pretrainedEmbeddings = True 59 | else: 60 | print "you have to specify a wordvector file in the config" 61 | exit() 62 | networkfile = config["net"] 63 | lrateOrig = float(config["lrate"]) 64 | print "using sgd with learning rate ", lrateOrig 65 | batch_size = int(config["batchsize"]) 66 | contextsize = int(config["contextsize"]) 67 | entitysize = int(config["entitysize"]) 68 | myLambda1 = 0 69 | if "lambda1" in config: 70 | 
myLambda1 = float(config["lambda1"]) 71 | myLambda2 = 0 72 | if "lambda2" in config: 73 | myLambda2 = float(config["lambda2"]) 74 | addInputSize = 1 # extra feature for hidden layer after convolution: name before filler? 75 | loss = "entropy" 76 | doCRF = False 77 | if "crf" in config: 78 | loss = "crf" 79 | doCRF = True 80 | print "using loss function: ", loss 81 | examplesPerUpdate = None 82 | if "examplesPerUpdate" in config: 83 | examplesPerUpdate = int(config["examplesPerUpdate"]) 84 | numPerBag = int(config["numPerBag"]) 85 | numClasses = int(config["numClasses"]) 86 | numClassesET = 5 87 | 88 | nkernsContext = int(config["nkernsContext"]) 89 | nkernsEntities = int(config["nkernsEntities"]) 90 | hiddenUnits = int(config["hidden"]) 91 | hiddenUnitsET = int(config["hiddenET"]) 92 | filtersizeContext = int(config["filtersizeContext"]) 93 | filtersizeEntities = int(config["filtersizeEntities"]) 94 | kmaxContext = int(config["kmaxContext"]) 95 | kmaxEntities = int(config["kmaxEntities"]) 96 | 97 | time1 = time.time() 98 | 99 | ######## FUEL ################# 100 | # Define "data_stream" 101 | # The names here (e.g. 'name1') need to match the names of the variables which 102 | # are the roots of the computational graph for the cost. 103 | 104 | train_set = H5PYDataset(datafile, which_sets = ('train',), load_in_memory=True) 105 | dev_set = H5PYDataset(datafile, which_sets = ('dev',), load_in_memory=True) 106 | test_set = H5PYDataset(datafile, which_sets = ('test',), load_in_memory=True) 107 | numSamplesDev = dev_set.num_examples 108 | numSamplesTrain = train_set.num_examples 109 | numSamplesTest = test_set.num_examples 110 | 111 | print "got " + str(numSamplesTrain) + " training examples" 112 | numTrainingBatches = numSamplesTrain / batch_size 113 | print "got " + str(numSamplesDev) + " dev examples" 114 | print "got " + str(numSamplesTest) + " test examples" 115 | 116 | if iterationSeed != -1: 117 | data_stream = DataStream(train_set, iteration_scheme = ShuffledExampleSchemeBatch(numSamplesTrain, batch_size, iterationSeed)) 118 | else: 119 | data_stream = DataStream(train_set, iteration_scheme = ShuffledExampleSchemeBatch(train_set.num_examples, batch_size)) 120 | data_stream_dev = DataStream(dev_set, iteration_scheme=SequentialScheme( 121 | dev_set.num_examples, 1)) 122 | data_stream_test = DataStream(test_set, iteration_scheme=SequentialScheme( 123 | test_set.num_examples, 1)) 124 | numSamplesDev = dev_set.num_examples 125 | numSamplesTest = test_set.num_examples 126 | numSamplesTrain = (train_set.num_examples / batch_size) * batch_size 127 | ################################ 128 | 129 | time2 = time.time() 130 | print "time for reading data: " + str(time2 - time1) 131 | 132 | # train network 133 | curSeed = 23455 134 | if "seed" in config: 135 | curSeed = int(config["seed"]) 136 | rng = numpy.random.RandomState(curSeed) 137 | seed = rng.get_state()[1][0] 138 | print "seed: " + str(seed) 139 | 140 | x1 = T.imatrix('x1') # shape: (batchsize, numPerBag * contextsize) # left of e1 141 | x2 = T.imatrix('x2') # shape: (batchsize, numPerBag * contextsize) # right of e1 142 | x3 = T.imatrix('x3') # shape: (batchsize, numPerBag * contextsize) # left of e2 143 | x4 = T.imatrix('x4') # shape: (batchsize, numPerBag * contextsize) # right of e3 144 | y = T.imatrix('y') # shape: (batchsize, 1) 145 | y1ET = T.imatrix('y1ET') # shape: (batchsize, 1) 146 | y2ET = T.imatrix('y2ET') # shape: (batchsize, 1) 147 | e1 = T.imatrix('e1') # shape: (batchsize, entitysize) 148 | e2 = T.imatrix('e2') # shape: 
(batchsize, entitysize) 149 | numSamples = T.imatrix('numSamples') # shape: (batchsize, 1) 150 | lr = T.scalar('lr') # learning rate 151 | 152 | embeddings = theano.shared(numpy.array(wordvectors, dtype = theano.config.floatX)).dimshuffle(1,0) 153 | 154 | batchsizeVar = numSamples.shape[0] 155 | y_resh = y.reshape((batchsizeVar,)) # rel:e1->e2 156 | y1ET_resh = y1ET.reshape((batchsizeVar,)) 157 | y2ET_resh = y2ET.reshape((batchsizeVar,)) 158 | 159 | numSamples_resh = numSamples.reshape((batchsizeVar,)) 160 | 161 | layers = [] 162 | 163 | cnnContext = LeNetConvPoolLayer(rng = rng, filter_shape = (nkernsContext, 1, representationsize, filtersizeContext), poolsize = (1, kmaxContext)) 164 | layers.append(cnnContext) 165 | if "middleContext" in config: 166 | hidden_in = nkernsContext * kmaxContext 167 | else: 168 | cnnEntities = LeNetConvPoolLayer(rng = rng, filter_shape = (nkernsEntities, 1, representationsize, filtersizeEntities), poolsize = (1, kmaxEntities)) 169 | layers.append(cnnEntities) 170 | hidden_in = 2 * (2 * nkernsContext * kmaxContext + nkernsEntities * kmaxEntities) 171 | hiddenLayer = HiddenLayer(rng = rng, n_in = hidden_in, n_out = hiddenUnits) 172 | layers.append(hiddenLayer) 173 | hiddenLayerET = HiddenLayer(rng = rng, n_in = 2 * nkernsContext * kmaxContext + nkernsEntities * kmaxEntities, n_out = hiddenUnitsET) 174 | layers.append(hiddenLayerET) 175 | randomInit = False 176 | if doCRF: 177 | randomInit = True 178 | outputLayer = LogisticRegression(n_in = hiddenUnits, n_out = numClasses, rng = rng, randomInit = randomInit) 179 | layers.append(outputLayer) 180 | outputLayerET = LogisticRegression(n_in = hiddenUnitsET, n_out = numClassesET, rng = rng, randomInit = randomInit) 181 | layers.append(outputLayerET) 182 | if doCRF: 183 | crfLayer = CRF(numClasses = numClasses + numClassesET, rng = rng, batchsizeVar = batchsizeVar, sequenceLength = 3) 184 | layers.append(crfLayer) 185 | 186 | x1_resh = x1.reshape((batchsizeVar * numPerBag, contextsize)) 187 | x1_emb = embeddings[:,x1_resh].dimshuffle(1, 0, 2) 188 | x1_emb = x1_emb.reshape((x1_emb.shape[0], 1, x1_emb.shape[1], x1_emb.shape[2])) 189 | x2_resh = x2.reshape((batchsizeVar * numPerBag, contextsize)) 190 | x2_emb = embeddings[:,x2_resh].dimshuffle(1, 0, 2) 191 | x2_emb = x2_emb.reshape((x2_emb.shape[0], 1, x2_emb.shape[1], x2_emb.shape[2])) 192 | x3_resh = x3.reshape((batchsizeVar * numPerBag, contextsize)) 193 | x3_emb = embeddings[:,x3_resh].dimshuffle(1, 0, 2) 194 | x3_emb = x3_emb.reshape((x3_emb.shape[0], 1, x3_emb.shape[1], x3_emb.shape[2])) 195 | x4_resh = x4.reshape((batchsizeVar * numPerBag, contextsize)) 196 | x4_emb = embeddings[:,x4_resh].dimshuffle(1, 0, 2) 197 | x4_emb = x4_emb.reshape((x4_emb.shape[0], 1, x4_emb.shape[1], x4_emb.shape[2])) 198 | 199 | e1_resh = e1.reshape((batchsizeVar, entitysize)) 200 | e1_emb = embeddings[:,e1_resh].dimshuffle(1, 0, 2) 201 | e1_emb = e1_emb.reshape((e1_emb.shape[0], 1, e1_emb.shape[1], e1_emb.shape[2])) 202 | e2_resh = e2.reshape((batchsizeVar, entitysize)) 203 | e2_emb = embeddings[:,e2_resh].dimshuffle(1, 0, 2) 204 | e2_emb = e2_emb.reshape((e2_emb.shape[0], 1, e2_emb.shape[1], e2_emb.shape[2])) 205 | 206 | x1_rep = cnnContext.getOutput(x1_emb) 207 | x2_rep = cnnContext.getOutput(x2_emb) 208 | x3_rep = cnnContext.getOutput(x3_emb) 209 | x4_rep = cnnContext.getOutput(x4_emb) 210 | e1_rep = cnnEntities.getOutput(e1_emb) 211 | e2_rep = cnnEntities.getOutput(e2_emb) 212 | 213 | e1_rep_repeated = e1_rep.flatten(2).repeat(numPerBag, axis = 0) 214 | e2_rep_repeated = 
e2_rep.flatten(2).repeat(numPerBag, axis = 0) 215 | 216 | aroundE1 = T.concatenate([x1_rep.flatten(2), e1_rep_repeated, x2_rep.flatten(2)], axis = 1) 217 | aroundE2 = T.concatenate([x3_rep.flatten(2), e2_rep_repeated, x4_rep.flatten(2)], axis = 1) 218 | 219 | # entity typing: 220 | hiddenForE1 = hiddenLayerET.getOutput(aroundE1) 221 | hiddenForE2 = hiddenLayerET.getOutput(aroundE2) 222 | 223 | # relation classification: 224 | if "middleContext" in config: 225 | e1_emb_repeated = e1_emb.repeat(numPerBag, axis = 0) 226 | e2_emb_repeated = e2_emb.repeat(numPerBag, axis = 0) 227 | 228 | betweenE1E2 = cnnContext.getOutput(T.concatenate([e1_emb_repeated, x2_emb, e2_emb_repeated], axis = 3)) 229 | 230 | betweenE1E2flatten = betweenE1E2.flatten(2) 231 | 232 | # to predict r1: between e1 and e2 233 | hiddenForR1 = hiddenLayer.getOutput(betweenE1E2flatten) 234 | 235 | else: 236 | # to predict r1: aroundE1 (x1 + e1 + x2) and aroundE2 (x2 + e2 + x3) 237 | hiddenForR1 = hiddenLayer.getOutput(T.concatenate([aroundE1,aroundE2], axis = 1)) 238 | 239 | if doCRF: 240 | # scores for different classes for r1, r2 and r3 241 | scoresForR1 = outputLayer.getScores(hiddenForR1, numSamples, batchsizeVar) 242 | scoresForE1 = outputLayerET.getScores(hiddenForE1, numSamples, batchsizeVar) 243 | scoresForE2 = outputLayerET.getScores(hiddenForE2, numSamples, batchsizeVar) 244 | 245 | scores = T.zeros((batchsizeVar, 3, numClasses + numClassesET)) 246 | scores = T.set_subtensor(scores[:,0,numClasses:], scoresForE1) 247 | scores = T.set_subtensor(scores[:,1,:numClasses], scoresForR1) 248 | scores = T.set_subtensor(scores[:,2,numClasses:], scoresForE2) 249 | y_conc = T.concatenate([y1ET + numClasses, y, y2ET + numClasses], axis = 1) 250 | cost = crfLayer.getCostAddLogWeights(scores, y_conc) 251 | else: 252 | cost = outputLayer.getCostMI(hiddenForR1, y_resh, numSamples, batchsizeVar) + outputLayerET.getCostMI(hiddenForE1, y1ET_resh, numSamples, batchsizeVar) + outputLayerET.getCostMI(hiddenForE2, y2ET_resh, numSamples, batchsizeVar) 253 | 254 | params = [] 255 | for l in layers: 256 | params += l.params 257 | 258 | reg2 = 0.0 259 | reg1 = 0.0 260 | for p in params: 261 | if ".W" in p.name or "_W" in p.name: 262 | print "found W", p 263 | reg2 += T.sum(p ** 2) 264 | reg1 += T.sum(abs(p)) 265 | cost += myLambda2 * reg2 266 | cost += myLambda1 * reg1 267 | 268 | updates = sgd_updates(params, cost, learning_rate = lr) 269 | 270 | if doCRF: 271 | predictions_global = crfLayer.getPrediction(scores) 272 | else: 273 | predictions_y1 = outputLayer.getOutput(hiddenForR1, numSamples, batchsizeVar) 274 | predictions_et1 = outputLayerET.getOutput(hiddenForE1, numSamples, batchsizeVar) 275 | predictions_et2 = outputLayerET.getOutput(hiddenForE2, numSamples, batchsizeVar) 276 | 277 | train = theano.function([x1, x2, x3, x4, e1, e2, y, y1ET, y2ET, numSamples, lr], cost, updates = updates, on_unused_input='warn') 278 | if doCRF: 279 | getPredictions = theano.function([x1, x2, x3, x4, e1, e2, numSamples], predictions_global, on_unused_input='warn') # cut of padded begin and end 280 | getPredictionsR1 = None 281 | getPredictionsET1 = None 282 | getPredictionsET2 = None 283 | else: 284 | getPredictions = None 285 | getPredictionsR1 = theano.function([x1, x2, x3, x4, e1, e2, numSamples], predictions_y1, on_unused_input='warn') 286 | getPredictionsET1 = theano.function([x1, x2, e1, numSamples], predictions_et1, on_unused_input='warn') 287 | getPredictionsET2 = theano.function([x3, x4, e2, numSamples], predictions_et2, on_unused_input='warn') 288 
| 289 | ########## start training ########################### 290 | n_epochs = 15 291 | if "n_epochs" in config: 292 | n_epochs = int(config["n_epochs"]) 293 | 294 | bestF1 = 0 295 | best_params = [] 296 | best_epoch = 0 297 | epoch = 0 298 | lrate = lrateOrig 299 | while epoch < n_epochs: 300 | time1 = time.time() 301 | # train 302 | 303 | time1Train = time.time() 304 | for d in data_stream.get_epoch_iterator(as_dict = True): 305 | x1_numpy = d['x1'] 306 | x2_numpy = d['x2'] 307 | x3_numpy = d['x3'] 308 | x4_numpy = d['x4'] 309 | e1_numpy = d['e1'] 310 | e2_numpy = d['e2'] 311 | y_numpy = d['y'] 312 | y1ET_numpy = d['y1ET'] 313 | y2ET_numpy = d['y2ET'] 314 | numSamples_numpy = numpy.ones_like(y1ET_numpy) 315 | 316 | cost_ij = train(x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, y_numpy, y1ET_numpy, y2ET_numpy, numSamples_numpy, lrate) 317 | if numpy.isnan(cost_ij): 318 | print "ERROR: NAN in cost" 319 | epoch = n_epochs 320 | break 321 | 322 | time2Train = time.time() 323 | print "time for training: " + str(time2Train - time1Train) 324 | if epoch < n_epochs: # don't evaluate if cost was NAN 325 | # validate with table filling 326 | time1Eval = time.time() 327 | curF1 = evaluateModel(data_stream_dev, epoch, doCRF, numClasses, numClassesET, getPredictions, getPredictionsR1, getPredictionsET1, getPredictionsET2) 328 | time2Eval = time.time() 329 | print "Average F1 over RE and ET: " + str(curF1) 330 | print "time for validation: " + str(time2Eval - time1Eval) 331 | if curF1 > bestF1: 332 | bestF1 = curF1 333 | best_epoch = epoch 334 | best_params = [] 335 | for p in params: 336 | best_params.append(p.get_value(borrow=False)) 337 | else: 338 | lrate = lrate * 0.5 339 | print "reducing learning rate to ", lrate 340 | if lrate < 0.00001: # early stopping 341 | epoch = n_epochs 342 | break 343 | epoch += 1 344 | 345 | time2 = time.time() 346 | print "time for epoch: " + str(time2 - time1) 347 | print "" 348 | 349 | print "FINAL: result on dev: " + str(bestF1) 350 | # re-storing best model and saving it 351 | save_file = open(networkfile, 'wb') 352 | for p, bp in zip(params, best_params): 353 | p.set_value(bp, borrow=False) 354 | cPickle.dump(bp, save_file, -1) 355 | save_file.close() 356 | 357 | # validate best model on test 358 | f1_test = evaluateModel(data_stream_test, best_epoch, doCRF, numClasses, numClassesET, getPredictions, getPredictionsR1, getPredictionsET1, getPredictionsET2) 359 | print "FINAL: result on test: " + str(f1_test) 360 | -------------------------------------------------------------------------------- /train_setup3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import time 5 | from collections import defaultdict, OrderedDict 6 | import numpy 7 | from utils import readConfig, readWordvectorsNumpy 8 | from evaluation import evaluateModel_setup3 9 | import random 10 | import theano 11 | import theano.tensor as T 12 | import cPickle 13 | import pickle 14 | from layers import LeNetConvPoolLayer, HiddenLayer, LogisticRegression, CRF 15 | from fuel.datasets.hdf5 import H5PYDataset 16 | from fuel.streams import DataStream 17 | from fuel.schemes import SequentialScheme, ShuffledExampleScheme 18 | from iterationSchemes import ShuffledExampleSchemeBatch 19 | 20 | def sgd_updates(params, cost, learning_rate, sqrt_norm_lim = 3): 21 | updates = [] 22 | for param in params: 23 | gp = T.grad(cost, param) 24 | step = -1.0 * learning_rate * gp 25 | stepped_param = param + step 26 | if 
(param.get_value(borrow=True).ndim == 2) and (param.name!='Words'): 27 | col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) 28 | desired_norms = T.clip(col_norms, 0, sqrt_norm_lim) 29 | scale = desired_norms / (1e-7 + col_norms) 30 | updates.append((param, stepped_param * scale)) 31 | else: 32 | updates.append((param, stepped_param)) 33 | return updates 34 | 35 | 36 | if len(sys.argv) != 2: 37 | print "please pass the config file as parameters" 38 | exit(0) 39 | 40 | time1 = time.time() 41 | 42 | configfile = sys.argv[1] 43 | config = readConfig(configfile) 44 | 45 | print "config:" 46 | for c in config: 47 | print str(c) + "\t" + str(config[c]) 48 | 49 | datafile = config["file"] 50 | fp = open(datafile + "_indexMapping", 'rb') 51 | sentId2newIndex2oldIndex = pickle.load(fp) 52 | fp.close() 53 | iterationSeed = -1 54 | if "iterationSeed" in config: 55 | iterationSeed = int(config["iterationSeed"]) 56 | print "using " + str(iterationSeed) + " as seed for iteration scheme" 57 | pretrainedEmbeddings = False 58 | if "wordvectors" in config: 59 | wordvectorfile = config["wordvectors"] 60 | wordvectors, representationsize, words = readWordvectorsNumpy(wordvectorfile, isWord2vec = True) 61 | vocabsize = wordvectors.shape[0] 62 | pretrainedEmbeddings = True 63 | else: 64 | print "you have to specify a wordvector file in the config" 65 | exit() 66 | networkfile = config["net"] 67 | lrateOrig = float(config["lrate"]) 68 | print "using sgd with learning rate ", lrateOrig 69 | batch_size = int(config["batchsize"]) 70 | contextsize = int(config["contextsize"]) 71 | entitysize = int(config["entitysize"]) 72 | myLambda1 = 0 73 | if "lambda1" in config: 74 | myLambda1 = float(config["lambda1"]) 75 | myLambda2 = 0 76 | if "lambda2" in config: 77 | myLambda2 = float(config["lambda2"]) 78 | addInputSize = 1 # extra feature for hidden layer after convolution: name before filler? 79 | loss = "entropy" 80 | doCRF = False 81 | if "crf" in config: 82 | loss = "crf" 83 | doCRF = True 84 | print "using loss function: ", loss 85 | examplesPerUpdate = None 86 | if "examplesPerUpdate" in config: 87 | examplesPerUpdate = int(config["examplesPerUpdate"]) 88 | numPerBag = int(config["numPerBag"]) 89 | numClasses = int(config["numClasses"]) 90 | numClassesET = 5 91 | 92 | relationEvaluationMethod = "last" 93 | if "relationEvaluationMethod" in config: 94 | relationEvaluationMethod = config["relationEvaluationMethod"] 95 | print "using relation evaluation method", relationEvaluationMethod 96 | 97 | nkernsContext = int(config["nkernsContext"]) 98 | nkernsEntities = int(config["nkernsEntities"]) 99 | hiddenUnits = int(config["hidden"]) 100 | hiddenUnitsET = int(config["hiddenET"]) 101 | filtersizeContext = int(config["filtersizeContext"]) 102 | filtersizeEntities = int(config["filtersizeEntities"]) 103 | kmaxContext = int(config["kmaxContext"]) 104 | kmaxEntities = int(config["kmaxEntities"]) 105 | 106 | time1 = time.time() 107 | 108 | ######## FUEL ################# 109 | # Define "data_stream" 110 | # The names here (e.g. 'name1') need to match the names of the variables which 111 | # are the roots of the computational graph for the cost. 
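
Before the Fuel datasets are built below, one detail of sgd_updates (defined at the top of this file and, identically, in train.py) is worth spelling out: after the plain SGD step, any 2-D parameter (except one named 'Words') is rescaled so that no column has an L2 norm above sqrt_norm_lim (3 by default). A minimal NumPy sketch of that rescaling, with names of our choosing:

```
import numpy

def clip_column_norms(stepped_param, sqrt_norm_lim=3.0):
    # illustrative sketch of the max-norm constraint in sgd_updates
    col_norms = numpy.sqrt((stepped_param ** 2).sum(axis=0))   # L2 norm per column
    desired = numpy.clip(col_norms, 0.0, sqrt_norm_lim)
    scale = desired / (1e-7 + col_norms)
    return stepped_param * scale          # scales each column, broadcasting over rows

W = numpy.array([[3.0, 0.1],
                 [4.0, 0.2]])             # column norms: 5.0 and ~0.22
print(numpy.sqrt((clip_column_norms(W) ** 2).sum(axis=0)))  # -> approximately [3.0, 0.22]
```

Columns that already satisfy the limit are left essentially unchanged; only oversized columns are shrunk back onto the norm ball, which keeps the weight matrices bounded during training.
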
112 | 113 | train_set = H5PYDataset(datafile, which_sets = ('train',), load_in_memory=True) 114 | dev_set = H5PYDataset(datafile, which_sets = ('dev',), load_in_memory=True) 115 | test_set = H5PYDataset(datafile, which_sets = ('test',), load_in_memory=True) 116 | numSamplesDev = dev_set.num_examples 117 | numSamplesTrain = train_set.num_examples 118 | numSamplesTest = test_set.num_examples 119 | 120 | print "got " + str(numSamplesTrain) + " training examples" 121 | numTrainingBatches = numSamplesTrain / batch_size 122 | print "got " + str(numSamplesDev) + " dev examples" 123 | print "got " + str(numSamplesTest) + " test examples" 124 | 125 | if iterationSeed != -1: 126 | data_stream = DataStream(train_set, iteration_scheme = ShuffledExampleSchemeBatch(numSamplesTrain, batch_size, iterationSeed)) 127 | else: 128 | data_stream = DataStream(train_set, iteration_scheme = ShuffledExampleSchemeBatch(train_set.num_examples, batch_size)) 129 | data_stream_dev = DataStream(dev_set, iteration_scheme=SequentialScheme( 130 | dev_set.num_examples, 1)) 131 | data_stream_test = DataStream(test_set, iteration_scheme=SequentialScheme( 132 | test_set.num_examples, 1)) 133 | numSamplesDev = dev_set.num_examples 134 | numSamplesTest = test_set.num_examples 135 | numSamplesTrain = (train_set.num_examples / batch_size) * batch_size 136 | ################################ 137 | 138 | time2 = time.time() 139 | print "time for reading data: " + str(time2 - time1) 140 | 141 | # train network 142 | curSeed = 23455 143 | if "seed" in config: 144 | curSeed = int(config["seed"]) 145 | rng = numpy.random.RandomState(curSeed) 146 | seed = rng.get_state()[1][0] 147 | print "seed: " + str(seed) 148 | 149 | x1 = T.imatrix('x1') # shape: (batchsize, numPerBag * contextsize) # left of e1 150 | x2 = T.imatrix('x2') # shape: (batchsize, numPerBag * contextsize) # right of e1 151 | x3 = T.imatrix('x3') # shape: (batchsize, numPerBag * contextsize) # left of e2 152 | x4 = T.imatrix('x4') # shape: (batchsize, numPerBag * contextsize) # right of e3 153 | y = T.imatrix('y') # shape: (batchsize, 1) 154 | y1ET = T.imatrix('y1ET') # shape: (batchsize, 1) 155 | y2ET = T.imatrix('y2ET') # shape: (batchsize, 1) 156 | e1 = T.imatrix('e1') # shape: (batchsize, entitysize) 157 | e2 = T.imatrix('e2') # shape: (batchsize, entitysize) 158 | numSamples = T.imatrix('numSamples') # shape: (batchsize, 1) 159 | lr = T.scalar('lr') # learning rate 160 | 161 | embeddings = theano.shared(numpy.array(wordvectors, dtype = theano.config.floatX)).dimshuffle(1,0) 162 | 163 | batchsizeVar = numSamples.shape[0] 164 | y_resh = y.reshape((batchsizeVar,)) # rel:e1->e2 165 | y1ET_resh = y1ET.reshape((batchsizeVar,)) 166 | y2ET_resh = y2ET.reshape((batchsizeVar,)) 167 | 168 | numSamples_resh = numSamples.reshape((batchsizeVar,)) 169 | 170 | layers = [] 171 | 172 | cnnContext = LeNetConvPoolLayer(rng = rng, filter_shape = (nkernsContext, 1, representationsize, filtersizeContext), poolsize = (1, kmaxContext)) 173 | layers.append(cnnContext) 174 | if "middleContext" in config: 175 | hidden_in = nkernsContext * kmaxContext 176 | else: 177 | cnnEntities = LeNetConvPoolLayer(rng = rng, filter_shape = (nkernsEntities, 1, representationsize, filtersizeEntities), poolsize = (1, kmaxEntities)) 178 | layers.append(cnnEntities) 179 | hidden_in = 2 * (2 * nkernsContext * kmaxContext + nkernsEntities * kmaxEntities) 180 | hiddenLayer = HiddenLayer(rng = rng, n_in = hidden_in, n_out = hiddenUnits) 181 | layers.append(hiddenLayer) 182 | hiddenLayerET = HiddenLayer(rng = rng, n_in 
= 2 * nkernsContext * kmaxContext + nkernsEntities * kmaxEntities, n_out = hiddenUnitsET) 183 | layers.append(hiddenLayerET) 184 | randomInit = False 185 | if doCRF: 186 | randomInit = True 187 | outputLayer = LogisticRegression(n_in = hiddenUnits, n_out = numClasses, rng = rng, randomInit = randomInit) 188 | layers.append(outputLayer) 189 | outputLayerET = LogisticRegression(n_in = hiddenUnitsET, n_out = numClassesET, rng = rng, randomInit = randomInit) 190 | layers.append(outputLayerET) 191 | if doCRF: 192 | crfLayer = CRF(numClasses = numClasses + numClassesET, rng = rng, batchsizeVar = batchsizeVar, sequenceLength = 3) 193 | layers.append(crfLayer) 194 | 195 | x1_resh = x1.reshape((batchsizeVar * numPerBag, contextsize)) 196 | x1_emb = embeddings[:,x1_resh].dimshuffle(1, 0, 2) 197 | x1_emb = x1_emb.reshape((x1_emb.shape[0], 1, x1_emb.shape[1], x1_emb.shape[2])) 198 | x2_resh = x2.reshape((batchsizeVar * numPerBag, contextsize)) 199 | x2_emb = embeddings[:,x2_resh].dimshuffle(1, 0, 2) 200 | x2_emb = x2_emb.reshape((x2_emb.shape[0], 1, x2_emb.shape[1], x2_emb.shape[2])) 201 | x3_resh = x3.reshape((batchsizeVar * numPerBag, contextsize)) 202 | x3_emb = embeddings[:,x3_resh].dimshuffle(1, 0, 2) 203 | x3_emb = x3_emb.reshape((x3_emb.shape[0], 1, x3_emb.shape[1], x3_emb.shape[2])) 204 | x4_resh = x4.reshape((batchsizeVar * numPerBag, contextsize)) 205 | x4_emb = embeddings[:,x4_resh].dimshuffle(1, 0, 2) 206 | x4_emb = x4_emb.reshape((x4_emb.shape[0], 1, x4_emb.shape[1], x4_emb.shape[2])) 207 | 208 | e1_resh = e1.reshape((batchsizeVar, entitysize)) 209 | e1_emb = embeddings[:,e1_resh].dimshuffle(1, 0, 2) 210 | e1_emb = e1_emb.reshape((e1_emb.shape[0], 1, e1_emb.shape[1], e1_emb.shape[2])) 211 | e2_resh = e2.reshape((batchsizeVar, entitysize)) 212 | e2_emb = embeddings[:,e2_resh].dimshuffle(1, 0, 2) 213 | e2_emb = e2_emb.reshape((e2_emb.shape[0], 1, e2_emb.shape[1], e2_emb.shape[2])) 214 | 215 | x1_rep = cnnContext.getOutput(x1_emb) 216 | x2_rep = cnnContext.getOutput(x2_emb) 217 | x3_rep = cnnContext.getOutput(x3_emb) 218 | x4_rep = cnnContext.getOutput(x4_emb) 219 | e1_rep = cnnEntities.getOutput(e1_emb) 220 | e2_rep = cnnEntities.getOutput(e2_emb) 221 | 222 | e1_rep_repeated = e1_rep.flatten(2).repeat(numPerBag, axis = 0) 223 | e2_rep_repeated = e2_rep.flatten(2).repeat(numPerBag, axis = 0) 224 | 225 | aroundE1 = T.concatenate([x1_rep.flatten(2), e1_rep_repeated, x2_rep.flatten(2)], axis = 1) 226 | aroundE2 = T.concatenate([x3_rep.flatten(2), e2_rep_repeated, x4_rep.flatten(2)], axis = 1) 227 | 228 | # entity typing: 229 | hiddenForE1 = hiddenLayerET.getOutput(aroundE1) 230 | hiddenForE2 = hiddenLayerET.getOutput(aroundE2) 231 | 232 | # relation classification: 233 | if "middleContext" in config: 234 | e1_emb_repeated = e1_emb.repeat(numPerBag, axis = 0) 235 | e2_emb_repeated = e2_emb.repeat(numPerBag, axis = 0) 236 | 237 | betweenE1E2 = cnnContext.getOutput(T.concatenate([e1_emb_repeated, x2_emb, e2_emb_repeated], axis = 3)) 238 | 239 | betweenE1E2flatten = betweenE1E2.flatten(2) 240 | 241 | # to predict r1: between e1 and e2 242 | hiddenForR1 = hiddenLayer.getOutput(betweenE1E2flatten) 243 | 244 | else: 245 | # to predict r1: aroundE1 (x1 + e1 + x2) and aroundE2 (x2 + e2 + x3) 246 | hiddenForR1 = hiddenLayer.getOutput(T.concatenate([aroundE1,aroundE2], axis = 1)) 247 | 248 | if doCRF: 249 | # scores for different classes for r1, r2 and r3 250 | scoresForR1 = outputLayer.getScores(hiddenForR1, numSamples, batchsizeVar) 251 | scoresForE1 = outputLayerET.getScores(hiddenForE1, numSamples, 
batchsizeVar) 252 | scoresForE2 = outputLayerET.getScores(hiddenForE2, numSamples, batchsizeVar) 253 | 254 | scores = T.zeros((batchsizeVar, 3, numClasses + numClassesET)) 255 | scores = T.set_subtensor(scores[:,0,numClasses:], scoresForE1) 256 | scores = T.set_subtensor(scores[:,1,:numClasses], scoresForR1) 257 | scores = T.set_subtensor(scores[:,2,numClasses:], scoresForE2) 258 | y_conc = T.concatenate([y1ET + numClasses, y, y2ET + numClasses], axis = 1) 259 | cost = crfLayer.getCostAddLogWeights(scores, y_conc) 260 | else: 261 | cost = outputLayer.getCostMI(hiddenForR1, y_resh, numSamples, batchsizeVar) + outputLayerET.getCostMI(hiddenForE1, y1ET_resh, numSamples, batchsizeVar) + outputLayerET.getCostMI(hiddenForE2, y2ET_resh, numSamples, batchsizeVar) 262 | 263 | params = [] 264 | for l in layers: 265 | params += l.params 266 | 267 | reg2 = 0.0 268 | reg1 = 0.0 269 | for p in params: 270 | if ".W" in p.name or "_W" in p.name: 271 | print "found W", p 272 | reg2 += T.sum(p ** 2) 273 | reg1 += T.sum(abs(p)) 274 | cost += myLambda2 * reg2 275 | cost += myLambda1 * reg1 276 | 277 | updates = sgd_updates(params, cost, learning_rate = lr) 278 | 279 | if doCRF: 280 | predictions_global = crfLayer.getPrediction(scores) 281 | else: 282 | predictions_y1 = outputLayer.getOutput(hiddenForR1, numSamples, batchsizeVar) 283 | predictions_et1 = outputLayerET.getOutput(hiddenForE1, numSamples, batchsizeVar) 284 | predictions_et2 = outputLayerET.getOutput(hiddenForE2, numSamples, batchsizeVar) 285 | 286 | train = theano.function([x1, x2, x3, x4, e1, e2, y, y1ET, y2ET, numSamples, lr], cost, updates = updates, on_unused_input='warn') 287 | if doCRF: 288 | getPredictions = theano.function([x1, x2, x3, x4, e1, e2, numSamples], predictions_global, on_unused_input='warn') # cut of padded begin and end 289 | getPredictionsR1 = None 290 | getPredictionsET1 = None 291 | getPredictionsET2 = None 292 | else: 293 | getPredictions = None 294 | getPredictionsR1 = theano.function([x1, x2, x3, x4, e1, e2, numSamples], predictions_y1, on_unused_input='warn') 295 | getPredictionsET1 = theano.function([x1, x2, e1, numSamples], predictions_et1, on_unused_input='warn') 296 | getPredictionsET2 = theano.function([x3, x4, e2, numSamples], predictions_et2, on_unused_input='warn') 297 | 298 | ########## start training ########################### 299 | n_epochs = 15 300 | if "n_epochs" in config: 301 | n_epochs = int(config["n_epochs"]) 302 | 303 | bestF1 = 0 304 | best_params = [] 305 | best_epoch = 0 306 | epoch = 0 307 | lrate = lrateOrig 308 | while epoch < n_epochs: 309 | time1 = time.time() 310 | # train 311 | 312 | time1Train = time.time() 313 | for d in data_stream.get_epoch_iterator(as_dict = True): 314 | x1_numpy = d['x1'] 315 | x2_numpy = d['x2'] 316 | x3_numpy = d['x3'] 317 | x4_numpy = d['x4'] 318 | e1_numpy = d['e1'] 319 | e2_numpy = d['e2'] 320 | y_numpy = d['y'] 321 | y1ET_numpy = d['y1ET'] 322 | y2ET_numpy = d['y2ET'] 323 | numSamples_numpy = numpy.ones_like(y1ET_numpy) 324 | 325 | cost_ij = train(x1_numpy, x2_numpy, x3_numpy, x4_numpy, e1_numpy, e2_numpy, y_numpy, y1ET_numpy, y2ET_numpy, numSamples_numpy, lrate) 326 | if numpy.isnan(cost_ij): 327 | print "ERROR: NAN in cost" 328 | epoch = n_epochs 329 | break 330 | 331 | time2Train = time.time() 332 | print "time for training: " + str(time2Train - time1Train) 333 | if epoch < n_epochs: # don't evaluate if cost was NAN 334 | # validate with table filling 335 | time1Eval = time.time() 336 | curF1 = evaluateModel_setup3(data_stream_dev, epoch, doCRF, 
relationEvaluationMethod, sentId2newIndex2oldIndex, numClasses, numClassesET, getPredictions, getPredictionsR1, getPredictionsET1, getPredictionsET2) 337 | time2Eval = time.time() 338 | print "Average F1 over RE and ET: " + str(curF1) 339 | print "time for validation: " + str(time2Eval - time1Eval) 340 | if curF1 > bestF1: 341 | bestF1 = curF1 342 | best_epoch = epoch 343 | best_params = [] 344 | for p in params: 345 | best_params.append(p.get_value(borrow=False)) 346 | else: 347 | lrate = lrate * 0.5 348 | print "reducing learning rate to ", lrate 349 | if lrate < 0.00001: # early stopping 350 | epoch = n_epochs 351 | break 352 | epoch += 1 353 | 354 | time2 = time.time() 355 | print "time for epoch: " + str(time2 - time1) 356 | print "" 357 | 358 | print "FINAL: result on dev: " + str(bestF1) 359 | # re-storing best model and saving it 360 | save_file = open(networkfile, 'wb') 361 | for p, bp in zip(params, best_params): 362 | p.set_value(bp, borrow=False) 363 | cPickle.dump(bp, save_file, -1) 364 | save_file.close() 365 | 366 | # validate best model on test 367 | f1_test = evaluateModel_setup3(data_stream_test, best_epoch, doCRF, relationEvaluationMethod, sentId2newIndex2oldIndex, numClasses, numClassesET, getPredictions, getPredictionsR1, getPredictionsET1, getPredictionsET2) 368 | print "FINAL: result on test: " + str(f1_test) 369 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import unicode_literals 5 | import re 6 | import random 7 | import numpy 8 | import codecs, sys 9 | reload(sys) 10 | sys.setdefaultencoding('utf-8') 11 | sys.stdout = codecs.getwriter('utf8')(sys.stdout) 12 | sys.stderr = codecs.getwriter('utf8')(sys.stderr) 13 | import io 14 | import gzip 15 | 16 | def readConfigBasic(configfile): 17 | config = {} 18 | # read config file 19 | f = open(configfile, 'r') 20 | for line in f: 21 | if "#" == line[0]: 22 | continue # skip commentars 23 | line = line.strip() 24 | parts = line.split('=') 25 | name = parts[0] 26 | value = parts[1] 27 | config[name] = value 28 | f.close() 29 | return config 30 | 31 | def readConfig(configfile): 32 | config = readConfigBasic(configfile) 33 | return config 34 | 35 | def readIndices(wordvectorfile, isWord2vec = True): 36 | indices = {} 37 | curIndex = 0 38 | indices[""] = curIndex 39 | curIndex += 1 40 | indices[""] = curIndex 41 | curIndex += 1 42 | if ".gz" in wordvectorfile: 43 | f = gzip.open(wordvectorfile, 'r') 44 | else: 45 | f = open(wordvectorfile, 'r') 46 | count = 0 47 | for line in f: 48 | if isWord2vec: 49 | if count == 0: 50 | print "omitting first embedding line because of word2vec" 51 | count += 1 52 | continue 53 | parts = line.split() 54 | word = parts[0] 55 | indices[word] = curIndex 56 | curIndex += 1 57 | f.close() 58 | return indices 59 | 60 | def readWordvectorsNumpy(wordvectorfile, isWord2vec = True): 61 | wordvectors = [] 62 | words = [] 63 | vectorsize = 0 64 | if ".gz" in wordvectorfile: 65 | f = gzip.open(wordvectorfile, 'r') 66 | else: 67 | f = open(wordvectorfile, 'r') 68 | count = 0 69 | for line in f: 70 | if isWord2vec: 71 | if count == 0: 72 | print "omitting first embedding line because of word2vec" 73 | count += 1 74 | continue 75 | parts = line.split() 76 | word = parts.pop(0) # ignore word string 77 | wordvectors.append([float(p) for p in parts]) 78 | words.append(word) 79 | vectorsize = len(parts) 80 | 
f.close() 81 | # first entry: (zero) vector 82 | # second entry: (random) vector 83 | zeroVec = [0 for i in range(vectorsize)] 84 | random.seed(123456) 85 | randomVec = [random.uniform(-numpy.sqrt(1./len(wordvectors)), numpy.sqrt(1./len(wordvectors))) for i in range(vectorsize)] 86 | wordvectors.insert(0,randomVec) 87 | words.insert(0, "") 88 | wordvectors.insert(0, zeroVec) 89 | words.insert(0, "") 90 | 91 | wordvectorsNumpy = numpy.array(wordvectors) 92 | return wordvectorsNumpy, vectorsize, words 93 | 94 | def getCoNNL_label2int(): 95 | label2int = {} 96 | relSet = ['OrgBased_In', 'Live_In', 'Kill', 'Located_In', 'Work_For'] 97 | nerSet = ['L-Org', 'U-Loc', 'U-Peop', 'U-Org', 'B-Org', 'B-Other', 'I-Org', 'B-Peop', 'I-Loc', 'I-Peop', 'I-Other', 'L-Loc', 'U-Other', 'L-Other', 'B-Loc', 'L-Peop'] 98 | index = 1 # index 0 = no ner / rel 99 | label2int['O'] = 0 100 | for n in nerSet: 101 | label2int[n] = index 102 | index += 1 103 | index = 1 # with two different softmax it's possible / even necessary to use the same integers again 104 | for r in relSet: 105 | label2int[r] = index 106 | index += 1 107 | return label2int 108 | 109 | def getMatrixForContext(context, contextsize, wordindices): 110 | matrix = numpy.zeros(shape = (contextsize)) 111 | i = 0 112 | nextIndex = 0 113 | while i < len(context): 114 | word = context[i] 115 | nextIndex = 0 116 | # current word 117 | if word != "": 118 | if not word in wordindices: 119 | if re.search(r'^\d+$', word): 120 | word = "0" 121 | if word.islower(): 122 | word = word.title() 123 | else: 124 | word = word.lower() 125 | if not word in wordindices: 126 | word = "" 127 | curIndex = wordindices[word] 128 | matrix[i] = curIndex 129 | i += 1 130 | 131 | return matrix 132 | 133 | def adaptNumSamplesTrain(numSamplesTrain, idTrain): 134 | while idTrain[numSamplesTrain] == idTrain[numSamplesTrain + 1]: 135 | numSamplesTrain += 1 136 | return numSamplesTrain + 1 # because we want the number of samples, not the index 137 | 138 | def getRelID(relName): 139 | relSet = ['O', 'OrgBased_In', 'Live_In', 'Kill', 'Located_In', 'Work_For'] 140 | return relSet.index(relName) 141 | 142 | def getNerID(nerName): 143 | nerSet = ['O', 'Org', 'Loc', 'Peop', 'Other'] 144 | return nerSet.index(nerName) 145 | 146 | def cleanContext(context): 147 | c = " ".join(context) 148 | c = re.sub(r'\-LRB\-', '(', c) 149 | c = re.sub(r'\-RRB\-', ')', c) 150 | c = re.sub(r' COMMA ', ' , ', c) 151 | c = re.sub(r'(\S)(\W)$', '\\1 \\2', c) 152 | return c.split() 153 | 154 | def reverse(x_in, x_len, numSamples, contentDim): 155 | x_rev = numpy.zeros(shape = (numSamples, contentDim)) 156 | for i in range(numSamples): 157 | if x_len[i,0] > 0: 158 | # reverse context: 159 | x_rev[i,:x_len[i,0]] = x_in[i,x_len[i,0]-1::-1] 160 | return x_rev 161 | 162 | def processPredictions(predictionsR1, probsR1): 163 | predictionsBatch = [] 164 | for b in range(predictionsR1.shape[0]): 165 | predR1_b = predictionsR1[b] 166 | probR1_b = probsR1[b] 167 | maxPositiveProb = 0 168 | bestPrediction = 0 169 | for curPred, curProb in zip(predR1_b, probR1_b): 170 | if curPred > 0 and curProb > maxPositiveProb: 171 | maxPositiveProb = curProb 172 | bestPrediction = curPred 173 | predictionsBatch.append(bestPrediction) 174 | return predictionsBatch 175 | 176 | def getReversedRel(rel): 177 | rev = numpy.zeros_like(rel) 178 | for b in range(rel.shape[0]): 179 | curRel = rel[b,0] 180 | if curRel == 0: 181 | rev[b,0] = 0 182 | elif curRel % 2 == 0: 183 | rev[b,0] = curRel - 1 184 | else: 185 | rev[b,0] = curRel + 1 186 | return 
187 | 
188 | def getF1(allHypos, allRefs, numClasses, name = ""):
189 |     class2precision = {}
190 |     class2recall = {}
191 |     class2f1 = {}
192 |     class2tp = {}
193 |     class2numHypo = {}
194 |     class2numRef = {}
195 |     for cl in range(numClasses): # initialize
196 |         class2numHypo[cl] = 0
197 |         class2numRef[cl] = 0
198 |         class2tp[cl] = 0
199 |         class2precision[cl] = 0
200 |         class2recall[cl] = 0
201 |         class2f1[cl] = 0
202 |     for h, r in zip(allHypos, allRefs):
203 |         if h >= numClasses:
204 |             print "ERROR: prediction of " + str(h) + " but only " + str(numClasses) + " classes for " + name
205 |             h = 0
206 |         class2numHypo[h] += 1
207 |         class2numRef[r] += 1
208 |         if h == r:
209 |             class2tp[h] += 1
210 |     sumF1 = 0
211 |     for cl in range(1, len(class2numHypo.keys())):
212 |         prec = 1.0
213 |         numH = class2numHypo[cl]
214 |         numR = class2numRef[cl]
215 |         if numH > 0:
216 |             prec = class2tp[cl] * 1.0 / numH
217 |         class2precision[cl] = prec
218 |         rec = 0.0
219 |         if numR > 0:
220 |             rec = class2tp[cl] * 1.0 / numR
221 |         class2recall[cl] = rec
222 |         f1 = 0.0
223 |         if prec + rec > 0:
224 |             f1 = prec * rec * 2.0 / (prec + rec)
225 |         class2f1[cl] = f1
226 |         sumF1 += f1
227 |         print "Class " + str(cl) + ": numRef: " + str(numR) + ", numHypo: " + str(numH) + ", P = " + str(prec) + ", R = " + str(rec) + ", F1 = " + str(f1)
228 |     macroF1 = sumF1 * 1.0 / (numClasses - 1)
229 |     if name == "":
230 |         print "Macro F1: " + str(macroF1)
231 |     else:
232 |         print "Macro F1 " + str(name) + ": " + str(macroF1)
233 |     return macroF1
234 | 
235 | def getMajorityPrediction(types):
236 |     hypos = [t[0] for t in types]
237 |     refs = [t[1] for t in types]
238 |     assert len(set(refs)) == 1
239 |     sortedHypos = sorted([(hypos.count(e), e) for e in set(hypos)], key=lambda x:x[0], reverse=True)
240 |     elems = [h[1] for h in sortedHypos]
241 |     counts = [h[0] for h in sortedHypos]
242 |     if len(counts) == 1 or counts[0] != counts[1]: # easy case
243 |         return elems[0], refs[0]
244 |     # select most common class among hypos with highest votes
245 |     bestCounts = 0
246 |     i = 1
247 |     while i < len(counts) and counts[i] == counts[0]:
248 |         bestCounts = i
249 |         i += 1
250 |     bestElems = elems[:bestCounts + 1]
251 |     # order of ET classes according to frequency:
252 |     # 1. loc: 2
253 |     # 2. per: 3
254 |     # 3. org: 1
255 |     # 4. other: 4
256 |     for mostFreq in [2, 3, 1, 4]:
257 |         if mostFreq in bestElems:
258 |             return mostFreq, refs[0]
259 |     return 0, refs[0]
260 | 
261 | def getRelaxedPredictionEntityType(predictions, refs):
262 |     assert len(set(refs)) == 1
263 |     ref = refs[0]
264 |     if ref in predictions: # the prediction is considered correct
265 |         return ref, ref
266 |     else:
267 |         return predictions[0], ref # otherwise just pick the first prediction
268 | 
269 | def getPredictionRelation(predictions, refs, relationEvaluationMethod):
270 |     assert len(set([r[2] for r in refs])) == 1
271 |     ref = refs[0][2]
272 |     if relationEvaluationMethod == "relaxed": # relaxed: correct if any of the hypotheses matches the reference
273 |         hypos = [h[2] for h in predictions]
274 |         if ref in hypos:
275 |             return ref, ref
276 |         else:
277 |             return hypos[0], ref # fall back to the first hypothesis
278 |     else: # strict: take the prediction from the cell of the last tokens of both entities
279 |         maximumE1 = max([h[0] for h in predictions])
280 |         maximumE2 = max([h[1] for h in predictions])
281 |         for h in predictions:
282 |             if h[0] == maximumE1 and h[1] == maximumE2:
283 |                 return h[2], ref
284 |         # default return, should never happen
285 |         return predictions[0], ref
286 | 
287 | def mergeREPredictionsWithOldIndices(curSentence_entityPair2relations, newIndex2oldIndex):
288 |     curSentence_pair2predictions = {}
289 |     curSentence_pair2refs = {}
290 |     for ent1, ent2 in curSentence_entityPair2relations:
291 |         oldIndex1a, oldIndex1b = newIndex2oldIndex[ent1].split("_")
292 |         oldIndex2a, oldIndex2b = newIndex2oldIndex[ent2].split("_")
293 |         if oldIndex1a == oldIndex2a:
294 |             continue # this is entity typing, not relation classification
295 |         if not (oldIndex1a, oldIndex2a) in curSentence_pair2predictions:
296 |             curSentence_pair2predictions[(oldIndex1a, oldIndex2a)] = []
297 |             curSentence_pair2refs[(oldIndex1a, oldIndex2a)] = []
298 |         for rIndex in range(len(curSentence_entityPair2relations[(ent1, ent2)])):
299 |             curSentence_pair2predictions[(oldIndex1a, oldIndex2a)].append((oldIndex1b, oldIndex2b, curSentence_entityPair2relations[(ent1, ent2)][rIndex][0]))
300 |             curSentence_pair2refs[(oldIndex1a, oldIndex2a)].append((oldIndex1b, oldIndex2b, curSentence_entityPair2relations[(ent1, ent2)][rIndex][1]))
301 |     return curSentence_pair2predictions, curSentence_pair2refs
302 | 
303 | def mergeETPredictionsWithOldIndices(curSentence_entity2types, newIndex2oldIndex):
304 |     curSentence_ent2majorityPredictions = {}
305 |     curSentence_ent2refs = {}
306 |     for ent in curSentence_entity2types:
307 |         majorityPrediction = getMajorityPrediction(curSentence_entity2types[ent])
308 |         oldIndex = int(newIndex2oldIndex[ent].split("_")[0])
309 |         if not oldIndex in curSentence_ent2majorityPredictions:
310 |             curSentence_ent2majorityPredictions[oldIndex] = []
311 |             curSentence_ent2refs[oldIndex] = []
312 |         curSentence_ent2majorityPredictions[oldIndex].append(majorityPrediction[0])
313 |         curSentence_ent2refs[oldIndex].append(majorityPrediction[1])
314 |     return curSentence_ent2majorityPredictions, curSentence_ent2refs
315 | 
316 | 
--------------------------------------------------------------------------------
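The helpers in utils.py are only exercised indirectly via train.py and train_setup3.py. As a quick sanity check outside the full pipeline, the sketch below calls getCoNNL_label2int, getMatrixForContext and getF1 on toy inputs. It is illustrative only: it assumes the repository's Python 2 environment with numpy installed and utils.py on the import path, and the tiny vocabulary, context window and label sequences are invented for this example.

```
# toy_utils_check.py -- illustrative sketch only, not part of the original experiments
import utils

# label mapping: NER (BILOU) tags and relation labels reuse the same integer ids
# because they are scored by two separate output layers
label2int = utils.getCoNNL_label2int()
print "O ->", label2int['O'], "| U-Peop ->", label2int['U-Peop'], "| Live_In ->", label2int['Live_In']

# map a padded token window to a row of word indices
# (the word -> index dictionary is normally built from the embedding file;
#  here it is a made-up toy vocabulary with index 0 reserved for padding)
wordindices = {"": 0, "0": 2, "john": 3, "lives": 4, "in": 5, "berlin": 6}
context = ["john", "lives", "in", "berlin", "", ""]
print utils.getMatrixForContext(context, 6, wordindices)   # e.g. [ 3.  4.  5.  6.  0.  0.]

# macro F1 over classes 1..numClasses-1 (class 0 is the negative class)
hypos = [1, 2, 0, 3, 3, 0]
refs = [1, 2, 2, 3, 0, 0]
utils.getF1(hypos, refs, 4, "toy example")
```

getF1 prints per-class precision, recall and F1 and returns the macro F1 averaged over the non-O classes, which is what the evaluation code reports.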