├── models ├── __init__.py ├── bilstm.py └── utils.py ├── .gitignore ├── data ├── train_dev_test │   ├── test_ids.txt │   ├── dev_ids.txt │   └── train_ids.txt └── example_dataset.txt ├── enviroment2.yml ├── .theanorc ├── pred.py ├── README.md ├── train.py └── load_data.py /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | -------------------------------------------------------------------------------- /data/train_dev_test/test_ids.txt: -------------------------------------------------------------------------------- 1 | 23293962 2 | 7678677 3 | -------------------------------------------------------------------------------- /data/train_dev_test/dev_ids.txt: -------------------------------------------------------------------------------- 1 | 14967461 2 | 16357751 3 | 23293962 4 | -------------------------------------------------------------------------------- /data/train_dev_test/train_ids.txt: -------------------------------------------------------------------------------- 1 | 11716850 2 | 16437532 3 | 16554356 4 | 16789740 5 | -------------------------------------------------------------------------------- /enviroment2.yml: -------------------------------------------------------------------------------- 1 | name: bilstm-relation-classification 2 | dependencies: 3 | - python=2.7 4 | - theano 5 | - numpy 6 | - scikit-learn 7 | - gensim 8 | - docopt 9 | - nltk 10 | -------------------------------------------------------------------------------- /.theanorc: -------------------------------------------------------------------------------- 1 | [dnn] 2 | include_path=/export/home/CUDA/include/ 3 | library_path=/export/home/CUDA/lib64/ 4 | 5 | [global] 6 | device = cuda0 7 | floatX = float32 8 | mode = FAST_RUN 9 | allow_gc=True 10 | 11 | [scan] 12 | allow_gc=True 13 | -------------------------------------------------------------------------------- /pred.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | pred.py [options] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --word2vec= word vectors in gensim format 8 | --dataset= dataset (see data folder for example) 9 | --test_ids= ids of test examples (see data folder for example) 10 | --model= filename of the saved model to load 11 | --mini_batch_size= Minibatch size [default: 32] 12 | --num_classes= Total number of classes for training [default: 5] 13 | --lstm_hidden_state= lstm hidden state size [default: 256] 14 | --random_seed= random seed [default: 42] 15 | 16 | """ 17 | 18 | import logging 19 | import pickle 20 | import random 21 | import sys 22 | from models.bilstm import BiLSTM 23 | import docopt 24 | import numpy as np 25 | from sklearn.metrics import f1_score 26 | 27 | 28 | def main(argv): 29 | argv = docopt.docopt(__doc__, argv=argv) 30 | 31 | random_seed = int(argv['--random_seed']) 32 | np.random.seed(random_seed) 33 | random.seed(random_seed) 34 | 35 | mini_batch_size = int(argv['--mini_batch_size']) 36 | 37 | def read_ids(file): 38 | ids = [] 39 | with open(file, 'r') as fp: 40 | for row in fp: 41 | ids.append(row.strip()) 42 | return ids 43 | 44 | test_ids = read_ids(argv['--test_ids']) 45 | 46 | with open(argv['--model']) as fp: 47 | tmp = pickle.load(fp) 48 | 49 | ld = tmp['token'] 50 | mod = BiLSTM(ld.embs, ld.pos, 
nc=int(argv['--num_classes']), nh=int(argv['--lstm_hidden_state']), de=ld.embs.shape[1]) 51 | mod.__setstate__(tmp['model_params']) 52 | 53 | pairs_idx, pos_e1_idx, pos_e2_idx, y, _, _, _, _, _, _ = ld.transform(argv['--dataset'], test_ids) 54 | 55 | test_idxs = list(range(len(pairs_idx))) 56 | 57 | all_test_preds = [] 58 | scores = [] 59 | for start, end in zip(range(0, len(test_idxs), mini_batch_size), 60 | range(mini_batch_size, len(test_idxs) + mini_batch_size, 61 | mini_batch_size)): 62 | if len(test_idxs[start:end]) == 0: 63 | continue 64 | tpairs = ld.pad_data([pairs_idx[i] for i in test_idxs[start:end]]) 65 | te1 = ld.pad_data([pos_e1_idx[i] for i in test_idxs[start:end]]) 66 | te2 = ld.pad_data([pos_e2_idx[i] for i in test_idxs[start:end]]) 67 | preds = mod.predict_proba(tpairs, te1, te2, np.float32(1.)) 68 | 69 | for x in preds: 70 | all_test_preds.append(x.argmax()) 71 | 72 | test_f1 = f1_score(y, all_test_preds, average='micro') 73 | print("test_f1: %.4f" % (test_f1)) 74 | sys.stdout.flush() 75 | 76 | if __name__ == '__main__': 77 | logging.basicConfig(level=logging.DEBUG) 78 | main(sys.argv[1:]) 79 | -------------------------------------------------------------------------------- /data/example_dataset.txt: -------------------------------------------------------------------------------- 1 | Recent studies have provided consistent evidence that treatment with abatacept results in a rapid onset of efficacy that is maintained over the course of treatment in patients with inadequate response to DRUGB and anti- DRUGA therapies . 2 | 16357751.s1 16357751 T1 T2 OTHER 3 | 4 | DRUGA inhibitors currently under investigation include the small molecules DRUGB ( Iressa , ZDdgdgdgdg ) and erlotinib ( Tarceva , OSI-dgdgdg ) , as well as monoclonal antibodies such as cetuximab ( IMC-dgdgdg , Erbitux ) . 5 | 14967461.s1 14967461 T22 T1 CLASS1 6 | 7 | Taken together , the results of the present study have characterized DRUGA as an inhibitor of matriptase-dg that modulates the synthesis of hepcidin and provides new insights into the regulatory mechanism of DRUGB homoeostasis , with clinical importance for a treatment of iron overload diseases . 8 | 23293962.s1 23293962 T5 T1 OTHER 9 | 10 | Taken together , the results of the present study have characterized HAI-dg as an inhibitor of matriptase-dg that modulates the synthesis of DRUGA and provides new insights into the regulatory mechanism of iron homoeostasis , with clinical importance for a treatment of DRUGB overload diseases . 11 | 23293962.s1 23293962 T7 T2 OTHER 12 | 13 | DRUGB and bromoacetylalprenololmenthane are competitive slowly reversible antagonists at the DRUGA of rat left atria . 14 | 7678677.s1 7678677 T14 T19 CLASS1 15 | 16 | Alprenolol and DRUGB are competitive slowly reversible antagonists at the DRUGA of rat left atria . 17 | 7678677.s1 7678677 T15 T19 CLASS1 18 | 19 | DRUGA was chemically bound via linkers to DRUGB -loaded HSA-NP . 20 | 16554356.s1 16554356 T10 T3 OTHER 21 | 22 | Apolipoprotein E was chemically bound via linkers to DRUGB -loaded DRUGA -NP . 23 | 16554356.s1 16554356 T3 T11 OTHER 24 | 25 | Discovery and optimization of DRUGB as inhibitors of methionine aminopeptidase-dg : a structural basis for the reduction of DRUGA binding . 26 | 16789740.s1 16789740 T4 T13 CLASS1 27 | 28 | Discovery and optimization of DRUGB as inhibitors of DRUGA : a structural basis for the reduction of albumin binding . 
29 | 16789740.s1 16789740 T4 T14 OTHER 30 | 31 | BACKGROUND : Since the introduction of the first DRUGA inhibitor ( ChEI ) in dgdgdgdg , most clinicians and probably most patients would consider the cholinergic drugs , DRUGB , galantamine and rivastigmine , to be the first line pharmacotherapy for mild to moderate Alzheimer 's disease.The drugs have slightly different pharmacological properties , but they all work by inhibiting the breakdown of acetylcholine , an important neurotransmitter associated with memory , by blocking the enzyme acetylcholinesterase . 32 | 16437532.s1 16437532 T39 T11 CLASS1 33 | 34 | Mitiglinide ( DRUGB ) , a new anti-diabetic drug , is thought to stimulate insulin secretion by closing the DRUGA in pancreatic beta-cells . 35 | 11716850.s1 11716850 T15 T42 CLASS1 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bi-LSTM Relation Extraction Model 2 | 3 | Implementation of a word-level Bi-LSTM relation extraction model (Kavuluru et al., 2017). 4 | 5 | ## Required Packages 6 | - Python 2.7 7 | - numpy 1.11.1+ 8 | - scipy 0.18.0+ 9 | - Theano 10 | - gensim 11 | - scikit-learn 12 | - docopt 13 | - nltk 14 | 15 | ## Usage 16 | 17 | ### Data Format 18 | 19 | We use a custom data format as input to our model. Specifically, each example consists of two lines. The first line contains the sentence, in which the two entities **must** be marked as DRUGA and DRUGB, respectively. We use the DRUGA and DRUGB convention because our work focused on extracting drug-drug interactions. These entity markers are required because they are used to compute the position vectors for each word in the sentence relative to each entity. The second line should contain the sentence id, document id, DRUGA id, DRUGB id, and the associated class for that instance, separated by tabs. Finally, each example must be separated by a blank line. 20 | 21 | ``` 22 | Sentence start DRUGA sentence middle DRUGB sentence end . 23 | sentence_id\tdoc_id\tdruga_id\tdrugb_id\tclass 24 | 25 | Sentence start DRUGA sentence middle DRUGB sentence end . 26 | sentence_id\tdoc_id\tdruga_id\tdrugb_id\tclass 27 | ``` 28 | 29 | Example data is available in the data folder. 30 | 31 | **Note**: Depending on the classes in your dataset, lines 249 and 250 in load_data.py must be changed to include them (see the sketch below). 
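For example, if your dataset uses additional relation labels, those two lines might be extended along the following lines. This is only a sketch: `CLASS2` and `CLASS3` are placeholder names standing in for whatever labels actually appear in your data, and `--num_classes` must be set to the matching number of classes when training.

```
# load_data.py, lines 249-250 (placeholder label names shown for illustration)
lab_lookup = {'OTHER': 0, 'CLASS1': 1, 'CLASS2': 2, 'CLASS3': 3}
self.lab_lookup_rev = {0: 'OTHER', 1: 'CLASS1', 2: 'CLASS2', 3: 'CLASS3'}
```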
32 | 33 | ### Training 34 | 35 | ``` 36 | python train.py --word2vec=/path/to/word2vecfile.pkl --dataset=./data/example_dataset.txt --train_ids=./data/train_dev_test/train_ids.txt --dev_ids=./data/train_dev_test/dev_ids.txt --model=/path/to/save/model_name 37 | ``` 38 | 39 | ``` 40 | Usage: 41 | train.py [options] 42 | 43 | Options: 44 | -h --help show this help message and exit 45 | --word2vec= word vectors in gensim format 46 | --dataset= dataset (see data folder for example) 47 | --train_ids= ids of training examples (see data folder for example) 48 | --dev_ids= ids of dev examples (see data folder for example) 49 | --model= filename to use to save model 50 | --num_epochs= Max number of epochs [default: 25] 51 | --mini_batch_size= Minibatch size [default: 32] 52 | --num_classes= Total number of classes for training [default: 5] 53 | --lstm_hidden_state= lstm hidden state size [default: 256] 54 | --random_seed= random seed [default: 42] 55 | ``` 56 | 57 | ### Testing 58 | 59 | **Note**: The current test code is mainly for evaluation purposes; it reports the micro-averaged F1 score over the supplied test ids. An example invocation is given at the end of this README. 60 | 61 | ``` 62 | Usage: 63 | pred.py [options] 64 | 65 | Options: 66 | -h --help show this help message and exit 67 | --word2vec= word vectors in gensim format 68 | --dataset= dataset (see data folder for example) 69 | --test_ids= ids of test examples (see data folder for example) 70 | --model= filename of the saved model to load 71 | --mini_batch_size= Minibatch size [default: 32] 72 | --num_classes= Total number of classes for training [default: 5] 73 | --lstm_hidden_state= lstm hidden state size [default: 256] 74 | --random_seed= random seed [default: 42] 75 | ``` 76 | 77 | ## Acknowledgements 78 | 79 | > Ramakanth Kavuluru, Anthony Rios, and Tung Tran. "[Extracting Drug-Drug Interactions with Word and Character-Level Recurrent Neural Networks.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5639883/)" In Healthcare Informatics (ICHI), 2017 IEEE International Conference on, pp. 5-12. IEEE, 2017. 80 | 81 | ``` 82 | @inproceedings{kavuluru2017extracting, 83 | title={Extracting Drug-Drug Interactions with Word and Character-Level Recurrent Neural Networks}, 84 | author={Kavuluru, Ramakanth and Rios, Anthony and Tran, Tung}, 85 | booktitle={Healthcare Informatics (ICHI), 2017 IEEE International Conference on}, 86 | pages={5--12}, 87 | year={2017}, 88 | organization={IEEE} 89 | } 90 | ``` 91 | 92 | For the character-level counterpart to this model, see this [repo](https://github.com/bionlproc/relation-extraction-char-rnn) by Tung Tran. 
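As referenced in the Testing section above, a prediction run mirrors the training command; a hypothetical invocation is shown below (the paths are placeholders, and `--num_classes` / `--lstm_hidden_state` should match the values used when the model was trained):

```
python pred.py --word2vec=/path/to/word2vecfile.pkl --dataset=./data/example_dataset.txt --test_ids=./data/train_dev_test/test_ids.txt --model=/path/to/save/model_name
```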
93 | 94 | Written by Anthony Rios (anthonymrios at gmail dot com) 95 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | train.py [options] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --word2vec= word vectors in gensim format 8 | --dataset= dataset (see data folder for example) 9 | --train_ids= ids of training examples (see data folder for example) 10 | --dev_ids= ids of dev exapmles (see data folder for example) 11 | --model= filename to use to save model 12 | --num_epochs= Max number of epochs [default: 25] 13 | --mini_batch_size= Minibatch size [default: 32] 14 | --num_classes= Total number of classes for training [default: 5] 15 | --lstm_hidden_state= lstm hidden state size [default: 256] 16 | --random_seed= random seed [default: 42] 17 | 18 | """ 19 | 20 | import random 21 | import sys 22 | import logging 23 | 24 | import docopt 25 | import numpy as np 26 | from sklearn.metrics import f1_score 27 | 28 | from models.bilstm import BiLSTM 29 | from load_data import LoadData 30 | import pickle 31 | 32 | 33 | def main(argv): 34 | argv = docopt.docopt(__doc__) 35 | 36 | num_epochs = int(argv['--num_epochs']) 37 | mini_batch_size = int(argv['--mini_batch_size']) 38 | val_mini_batch_size = 64 39 | num_classes = int(argv['--num_classes']) 40 | lstm_hidden_state_size = int(argv['--lstm_hidden_state']) 41 | random_seed = argv['--random_seed'] 42 | 43 | np.random.seed(int(random_seed)) 44 | random.seed(int(random_seed)) 45 | 46 | def read_ids(filename): 47 | ids = [] 48 | with open(filename, 'r') as fp: 49 | for row in fp: 50 | ids.append(row.strip()) 51 | return ids 52 | 53 | train_ids = read_ids(argv['--train_ids']) 54 | val_ids = read_ids(argv['--dev_ids']) 55 | 56 | ld = LoadData(argv['--word2vec']) 57 | 58 | train_pairs, train_e1, train_e2, train_y, _, _, _, train_ids, _, _ = ld.fit_transform(argv['--dataset'], train_ids) 59 | dev_pairs, dev_e1, dev_e2, dev_y, _, _, _, val_ids, dev_e1_ids,dev_e2_ids = ld.transform(argv['--dataset'], val_ids) 60 | 61 | idxs = list(range(len(train_pairs))) 62 | dev_idxs = list(range(len(dev_pairs))) 63 | 64 | last_loss = None 65 | avg_loss = [] 66 | avg_f1 = [] 67 | check_preds = None 68 | mod = BiLSTM(ld.embs, ld.pos, nc=int(num_classes), nh=int(lstm_hidden_state_size), de=ld.embs.shape[1]) 69 | best_dev_f1 = 0 70 | for epoch in range(1, int(num_epochs)+1): 71 | mean_loss = [] 72 | random.shuffle(idxs) 73 | for start, end in zip(range(0, len(idxs), mini_batch_size), range(mini_batch_size, len(idxs)+mini_batch_size, 74 | mini_batch_size)): 75 | idxs_sample = idxs[start:end] 76 | batch_labels = np.array(train_y[idxs_sample], dtype='int32') 77 | tpairs = ld.pad_data([train_pairs[i] for i in idxs_sample]) 78 | te1 = ld.pad_data([train_e1[i] for i in idxs_sample]) 79 | te2 = ld.pad_data([train_e2[i] for i in idxs_sample]) 80 | cost = mod.train_batch(tpairs, te1, te2, train_y[idxs_sample].astype('int32'), np.float32(0.)) 81 | mean_loss.append(cost) 82 | print("EPOCH: %d loss: %.4f train_loss: %.4f" % (epoch, cost, np.mean(mean_loss))) 83 | sys.stdout.flush() 84 | 85 | all_dev_preds = [] 86 | scores = [] 87 | for start, end in zip(range(0, len(dev_idxs), val_mini_batch_size), range(val_mini_batch_size, len(dev_idxs)+val_mini_batch_size, 88 | val_mini_batch_size)): 89 | if len(dev_idxs[start:end]) == 0: 90 | continue 91 | vpairs = ld.pad_data([dev_pairs[i] for i in dev_idxs[start:end]]) 92 | ve1 = 
ld.pad_data([dev_e1[i] for i in dev_idxs[start:end]]) 93 | ve2 = ld.pad_data([dev_e2[i] for i in dev_idxs[start:end]]) 94 | preds = mod.predict_proba(vpairs, ve1, ve2, np.float32(1.)) 95 | for x in preds: 96 | all_dev_preds.append(x.argmax()) 97 | 98 | dev_f1 = f1_score(dev_y, all_dev_preds, average='micro') 99 | print("EPOCH: %d train_loss: %.4f dev_f1: %.4f" % (epoch, np.mean(mean_loss), dev_f1)) 100 | sys.stdout.flush() 101 | 102 | if dev_f1 > best_dev_f1: 103 | with open(argv['--model'], 'w') as fp: 104 | pickle.dump({'model_params':mod.__getstate__(), 'token':ld}, fp, pickle.HIGHEST_PROTOCOL) 105 | best_dev_f1 = dev_f1 106 | 107 | if __name__ == '__main__': 108 | logging.basicConfig(level=logging.DEBUG) 109 | main(sys.argv[1:]) 110 | -------------------------------------------------------------------------------- /models/bilstm.py: -------------------------------------------------------------------------------- 1 | from theano.tensor.shared_randomstreams import RandomStreams 2 | 3 | srng2 = RandomStreams(seed=234) 4 | 5 | from .utils import * 6 | 7 | 8 | class BiLSTM(object): 9 | def __init__(self, emb, pos, nh=256, nc=2, de=100, p_drop=0.5): 10 | """ 11 | Args: 12 | emb: Embedding Matrix 13 | pos: position matrix 14 | nh: hidden layer size 15 | nc: Number of classes 16 | # de: Dimensionality of word embeddings 17 | p_drop :: Dropout probability 18 | """ 19 | 20 | def recurrence(xi, mask, h_tm1, c_tm1, 21 | W_i, U_i, b_i, W_c, U_c, b_c, W_f, U_f, b_f, W_o2, U_o, b_o2, 22 | mask_in, mask_rec): 23 | x = xi * T.neq(mask, 0).dimshuffle(0, 'x') 24 | x = dropout_scan(x, mask_in, dropout_switch, 0.2) 25 | 26 | x_i = T.dot(x, W_i) + b_i 27 | x_i = x_i * T.neq(mask, 0).dimshuffle(0, 'x') 28 | 29 | x_f = T.dot(x, W_f) + b_f 30 | x_f = x_f * T.neq(mask, 0).dimshuffle(0, 'x') 31 | 32 | x_c = T.dot(x, W_c) + b_c 33 | x_c = x_c * T.neq(mask, 0).dimshuffle(0, 'x') 34 | 35 | x_o = T.dot(x, W_o2) + b_o2 36 | x_o = x_o * T.neq(mask, 0).dimshuffle(0, 'x') 37 | 38 | h_tm1 = h_tm1 * T.neq(mask, 0).dimshuffle(0, 'x') 39 | h_tm1 = dropout_scan(h_tm1, mask_rec, dropout_switch, 0.2) 40 | 41 | i = hard_sigmoid(x_i + T.dot(h_tm1, U_i)) 42 | f = hard_sigmoid(x_f + T.dot(h_tm1, U_f)) 43 | c = f * c_tm1 + i * T.tanh(x_c + T.dot(h_tm1, U_c)) 44 | o = hard_sigmoid(x_o + T.dot(h_tm1, U_o)) 45 | h = o * T.tanh(c) 46 | return [h, c] 47 | 48 | # Source Embeddings 49 | self.emb = theano.shared(name='Words', value=emb.astype('float32')) 50 | 51 | self.pos = theano.shared(name='Pos', value=pos.astype('float32')) 52 | 53 | # Source Output Weights 54 | self.w_o = theano.shared(name='w_o', value=he_normal((nh + nh, nc)).astype('float32')) 55 | self.b_o = theano.shared(name='b_o', value=np.zeros((nc,)).astype('float32')) 56 | 57 | # input 58 | idxs = T.matrix() 59 | e1_pos_idxs = T.matrix() 60 | e2_pos_idxs = T.matrix() 61 | Y = T.ivector() 62 | dropout_switch = T.scalar() 63 | 64 | # get word embeddings based on indicies 65 | x_word = self.emb[T.cast(idxs, 'int32')] 66 | x_e1_pos = self.pos[T.cast(e1_pos_idxs, 'int32')] 67 | x_e2_pos = self.pos[T.cast(e2_pos_idxs, 'int32')] 68 | x_word = T.concatenate([x_word, x_e1_pos, x_e2_pos], axis=2) 69 | mask = T.neq(idxs, 0) * 1 70 | x_word = x_word * mask.dimshuffle(0, 1, 'x') 71 | 72 | de = emb.shape[1] + 2 * pos.shape[1] 73 | 74 | fwd_params, bck_params = bilstm_weights(de, nh) 75 | 76 | # Update these parameters 77 | self.params = [self.w_o, self.b_o, self.emb, self.pos] 78 | self.params += fwd_params + bck_params 79 | 80 | self.h0 = theano.shared(name='h0', value=np.zeros((nh,), 
dtype="float32")) 81 | 82 | maskd1 = srng.binomial((x_word.shape[0], x_word.shape[-1]), p=0.8, dtype='float32') 83 | maskd2 = srng.binomial((x_word.shape[0], nh), p=0.8, dtype='float32') 84 | [h_fwd, _], u = theano.scan(fn=recurrence, 85 | sequences=[x_word.dimshuffle(1, 0, 2), idxs.dimshuffle(1, 0)], 86 | non_sequences=fwd_params + [maskd1, maskd2], 87 | outputs_info=[T.alloc(self.h0, x_word.shape[0], nh), 88 | T.alloc(self.h0, x_word.shape[0], nh)], 89 | n_steps=x_word.shape[1], 90 | strict=True) 91 | 92 | maskd3 = srng.binomial((x_word.shape[0], x_word.shape[-1]), p=0.8, dtype='float32') 93 | maskd4 = srng.binomial((x_word.shape[0], nh), p=0.8, dtype='float32') 94 | [h_bck, _], u = theano.scan(fn=recurrence, 95 | sequences=[x_word.dimshuffle(1, 0, 2)[::-1, :, :], idxs.dimshuffle(1, 0)[::-1, :]], 96 | non_sequences=bck_params + [maskd3, maskd4], 97 | outputs_info=[T.alloc(self.h0, x_word.shape[0], nh), 98 | T.alloc(self.h0, x_word.shape[0], nh)], 99 | n_steps=x_word.shape[1], 100 | strict=True) 101 | 102 | h_bck = h_bck[::-1, :, :].dimshuffle(1, 0, 2) 103 | h_fwd = h_fwd.dimshuffle(1, 0, 2) 104 | h_priv = T.concatenate([h_fwd, h_bck], axis=2) 105 | h = h_priv.max(axis=1) 106 | h = dropout(h, dropout_switch, 0.2) 107 | 108 | Y_neg = T.ivector() 109 | pyx = T.nnet.nnet.softmax(T.dot(h, self.w_o) + self.b_o.dimshuffle('x', 0)) 110 | pyx = T.clip(pyx, 1e-5, 1 - 1e-5) 111 | L = -T.mean(T.log(pyx)[T.arange(Y.shape[0]), Y]) + 1e-6 * sum([(x ** 2).sum() for x in self.params]) 112 | 113 | updates, _ = Adam(L, self.params, lr2=0.001) 114 | 115 | self.train_batch = theano.function([idxs, e1_pos_idxs, e2_pos_idxs, \ 116 | Y, dropout_switch], 117 | L, updates=updates, allow_input_downcast=True, on_unused_input='ignore') 118 | self.predict_proba = theano.function([idxs, e1_pos_idxs, e2_pos_idxs, dropout_switch], \ 119 | pyx, allow_input_downcast=True, on_unused_input='ignore') 120 | 121 | def __getstate__(self): 122 | values = [x.get_value() for x in self.params] 123 | return values 124 | 125 | def __setstate__(self, weights): 126 | for x, w in zip(self.params, weights): 127 | x.set_value(w) 128 | 129 | 130 | def bilstm_weights(de, nh): 131 | """ 132 | 133 | Args: 134 | de: Dimensionality of word embeddings 135 | nh: Hidden layer dimensionality 136 | 137 | Returns: 138 | forward weights, backward weights 139 | """ 140 | # forward Bi-LSTM Weights 141 | Wf_i = theano.shared(name='wf_i', value=he_normal((de, nh)).astype("float32")) 142 | Uf_i = theano.shared(name='uf_i', value=he_normal((nh, nh)).astype("float32")) 143 | bf_i = theano.shared(name='bf_i', value=np.zeros((nh,), dtype="float32")) 144 | 145 | Wf_f = theano.shared(name='wf_f', value=he_normal((de, nh)).astype("float32")) 146 | Uf_f = theano.shared(name='uf_f', value=orthogonal_tmp((nh, nh)).astype("float32")) 147 | bf_f = theano.shared(name='bf_f', value=np.ones((nh,), dtype="float32")) 148 | 149 | Wf_c = theano.shared(name='wf_c', value=he_normal((de, nh)).astype("float32")) 150 | Uf_c = theano.shared(name='uf_c', value=orthogonal_tmp((nh, nh)).astype("float32")) 151 | bf_c = theano.shared(name='bf_c', value=np.zeros((nh), dtype="float32")) 152 | 153 | Wf_o2 = theano.shared(name='wfoo', value=he_normal((de, nh)).astype("float32")) 154 | Uf_o = theano.shared(name='ufoo', value=orthogonal_tmp((nh, nh)).astype("float32")) 155 | bf_o2 = theano.shared(name='bfoo', value=np.zeros((nh,), dtype="float32")) 156 | 157 | # backward Bi-LSTM Weights 158 | Wb_i = theano.shared(name='wb_i', value=he_normal((de, nh)).astype("float32")) 159 | Ub_i = 
theano.shared(name='ub_i', value=orthogonal_tmp((nh, nh)).astype("float32")) 160 | bb_i = theano.shared(name='bb_i', value=np.zeros((nh,), dtype="float32")) 161 | 162 | Wb_f = theano.shared(name='wb_f', value=he_normal((de, nh)).astype("float32")) 163 | Ub_f = theano.shared(name='ub_f', value=orthogonal_tmp((nh, nh)).astype("float32")) 164 | bb_f = theano.shared(name='bb_f', value=np.ones((nh), dtype="float32")) 165 | 166 | Wb_c = theano.shared(name='wb_c', value=he_normal((de, nh)).astype("float32")) 167 | Ub_c = theano.shared(name='ub_c', value=orthogonal_tmp((nh, nh)).astype("float32")) 168 | bb_c = theano.shared(name='bb_c', value=np.zeros((nh), dtype="float32")) 169 | 170 | Wb_o2 = theano.shared(name='wboo', value=he_normal((de, nh)).astype("float32")) 171 | Ub_o = theano.shared(name='uboo', value=orthogonal_tmp((nh, nh)).astype("float32")) 172 | bb_o2 = theano.shared(name='bboo', value=np.zeros((nh), dtype="float32")) 173 | 174 | params_forward = [Wb_i, Ub_i, bb_i, 175 | Wb_c, Ub_c, bb_c, 176 | Wb_f, Ub_f, bb_f, 177 | Wb_o2, Ub_o, bb_o2] 178 | 179 | params_backward = [Wf_i, Uf_i, bf_i, 180 | Wf_c, Uf_c, bf_c, 181 | Wf_f, Uf_f, bf_f, 182 | Wf_o2, Uf_o, bf_o2] 183 | 184 | return params_forward, params_backward 185 | -------------------------------------------------------------------------------- /models/utils.py: -------------------------------------------------------------------------------- 1 | from theano import tensor as T 2 | #import theano.sandbox.cuda 3 | from collections import OrderedDict 4 | from theano.ifelse import ifelse 5 | import theano 6 | from theano import config 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 8 | import numpy as np 9 | from theano.tensor.nnet.conv import conv2d 10 | 11 | np.random.seed(1234) 12 | rng = np.random.RandomState(1234) 13 | srng = RandomStreams(rng.randint(54321)) 14 | 15 | class ReverseGradient(theano.Op): 16 | """ theano operation to reverse the gradients 17 | Introduced in http://arxiv.org/pdf/1409.7495.pdf 18 | """ 19 | 20 | view_map = {0: [0]} 21 | 22 | __props__ = ('hp_lambda', ) 23 | 24 | def __init__(self, hp_lambda): 25 | super(ReverseGradient, self).__init__() 26 | self.hp_lambda = hp_lambda 27 | 28 | def make_node(self, x): 29 | assert hasattr(self, '_props'), "Your version of theano is too old to support __props__." 30 | x = theano.tensor.as_tensor_variable(x) 31 | return theano.Apply(self, [x], [x.type()]) 32 | 33 | def perform(self, node, inputs, output_storage): 34 | xin, = inputs 35 | xout, = output_storage 36 | xout[0] = xin 37 | 38 | def grad(self, input, output_gradients): 39 | return [-self.hp_lambda * output_gradients[0]] 40 | 41 | def infer_shape(self, node, i0_shapes): 42 | return i0_shapes 43 | 44 | def hard_sigmoid(x): 45 | return T.nnet.hard_sigmoid(x) 46 | 47 | def log_softmax(x): 48 | xdev = x - x.max(1, keepdims=True) 49 | return xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True)) 50 | 51 | def categorical_crossentropy_logdomain(log_predictions, targets): 52 | return -T.mean(targets * log_predictions, axis=1) 53 | 54 | def normal(shape, scale=0.05): 55 | return np.random.normal(0, scale, size=shape).astype('float32') 56 | 57 | def get_fans(shape): 58 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:]) 59 | fan_out = shape[1] if len(shape) == 2 else shape[0] 60 | return fan_in, fan_out 61 | 62 | def orthogonal(shape): 63 | ''' Reference: Glorot & Bengio, AISTATS 2010 glorot_normal 64 | ''' 65 | fan_in, fan_out = get_fans(shape) 66 | s = np.sqrt(2. 
/ (fan_in * fan_out)) 67 | return normal(shape, s) 68 | 69 | def he_normal(shape): 70 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852 71 | ''' 72 | fan_in, fan_out = get_fans(shape) 73 | s = np.sqrt(2. / fan_in) 74 | return normal(shape, s) 75 | 76 | def glorot_uniform(shape): 77 | fan_in, fan_out = get_fans(shape) 78 | s = np.sqrt(6. / (fan_in + fan_out)) 79 | return uniform(shape, s) 80 | 81 | def orthogonal_tmp2(shape): 82 | fan_in, fan_out = get_fans(shape) 83 | s = np.sqrt(6. / (fan_in + fan_out)) 84 | return uniform(shape, s) 85 | 86 | def uniform(shape, scale=0.05): 87 | return np.random.uniform(low=-scale, high=scale, size=shape).astype('float32') 88 | 89 | def orthogonal_tmp(shape, scale=1.0): 90 | ''' From Lasagne. Reference: Saxe et al., http://arxiv.org/abs/1312.6120 91 | ''' 92 | flat_shape = (shape[0], np.prod(shape[1:])) 93 | a = np.random.normal(0.0, 1.0, flat_shape) 94 | u, _, v = np.linalg.svd(a, full_matrices=False) 95 | # pick the one with the correct shape 96 | q = u if u.shape == flat_shape else v 97 | q = q.reshape(shape) 98 | return scale * q[:shape[0], :shape[1]] 99 | 100 | def as_floatX(variable): 101 | if isinstance(variable, float): 102 | #return np.cast["float32"](variable) 103 | return np.cast['float32'](variable) 104 | elif isinstance(variable, np.ndarray): 105 | #return np.cast["float32"](variable) 106 | return np.cast['float32'](variable) 107 | 108 | def rectify(X): 109 | return T.maximum(X, 0.) 110 | 111 | def cappedrectify(X): 112 | return T.minimum(5., T.maximum(X, 0.)) 113 | 114 | def elu(X): 115 | return T.switch(T.ge(X, 0), X, T.exp(X)-1.) 116 | 117 | def snelu(X): 118 | scale = 1.0507009873554804934193349852946 119 | alpha = 1.6732632423543772848170429916717 120 | return scale * T.switch(T.ge(X, 0), X, alpha*T.exp(X)-alpha) 121 | 122 | def dropout(X, dropout_switch=1, p=0.): 123 | retain_prob = 1 - p 124 | mask = srng.binomial(X.shape, p=retain_prob, dtype='float32') 125 | X = ifelse(T.lt(dropout_switch, 0.5), X*mask, (X*retain_prob).reshape(mask.shape)) 126 | return X 127 | 128 | def dropout_scan(X, mask, dropout_switch=1, p=0.): 129 | retain_prob = 1 - p 130 | X = ifelse(T.lt(dropout_switch, 0.5), X*mask, (X*retain_prob).reshape(mask.shape)) 131 | return X 132 | 133 | def clip_norm(g, c, n): 134 | if c > 0: 135 | g = T.switch(T.ge(n, c), g * c / n, g) 136 | return g 137 | 138 | def sgdm(cost, parameters, lr2=1., momentum=0.8): 139 | lr = theano.shared(as_floatX(lr2).astype("float32")) 140 | grads = T.grad(cost, parameters) 141 | updates = OrderedDict() 142 | for param,g2 in zip(parameters,grads): 143 | grad = clip_norm(g2, 3, T.sum(g2 ** 2)) 144 | mparam = theano.shared(param.get_value()*0.) 145 | updates[param] = param - lr * mparam 146 | updates[mparam] = mparam*momentum + (1.-momentum)*grad 147 | 148 | return updates, lr 149 | 150 | def sgd(cost, parameters, lr, updates=None): 151 | grads = T.grad(cost,parameters) 152 | updates = OrderedDict({}) 153 | for param,grad in zip(parameters,grads): 154 | updates[param] = param - lr*grad 155 | 156 | return updates 157 | 158 | #def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8): 159 | def Adam(cost, params, lr2=0.001, b1=0.1, b2=0.001, e=1e-8): 160 | updates = [] 161 | lr = theano.shared(as_floatX(lr2).astype("float32")) 162 | grads = T.grad(cost, params) 163 | i = theano.shared(as_floatX(0.)) 164 | i_t = i + as_floatX(1.) 165 | fix1 = as_floatX(1.) - (as_floatX(1.) - as_floatX(b1))**i_t 166 | fix2 = as_floatX(1.) - (as_floatX(1.) 
- as_floatX(b2))**i_t 167 | #lr_t = as_floatX(lr) * (T.sqrt(fix2) / fix1) 168 | lr_t = lr * (T.sqrt(fix2) / fix1) 169 | for p, g2 in zip(params, grads): 170 | ''' 171 | if p.name != 'Words' and p.name != 'Pos' and p.name != 'lang': 172 | else: 173 | g = g2 174 | ''' 175 | #g = clip_norm(g2, 3, T.sum(g2 ** 2)) 176 | #g = clip_norm(g2, 3, T.sum(g2 ** 2)) 177 | g = g2 178 | #g = g2.clip(-.5, .5) 179 | #g = clip_norm(g, 3, T.sqrt(T.sum(g**2))) 180 | m = theano.shared(p.get_value() * as_floatX(0.)) 181 | v = theano.shared(p.get_value() * as_floatX(0.)) 182 | m_t = (as_floatX(b1) * g) + ((as_floatX(1.) - as_floatX(b1)) * m) 183 | v_t = (as_floatX(b2) * T.sqr(g)) + ((as_floatX(1.) - as_floatX(b2)) * v) 184 | g_t = m_t / (T.sqrt(v_t) + as_floatX(e)) 185 | p_t = p - (lr_t * g_t) 186 | updates.append((m, m_t)) 187 | updates.append((v, v_t)) 188 | if p.name == 'w_o': 189 | p_t = clip_norm(p_t, 3, T.sum(p_t ** 2)) 190 | updates.append((p, p_t)) 191 | updates.append((i, i_t)) 192 | return updates, lr 193 | 194 | def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6): 195 | grads = T.grad(cost=cost, wrt=params) 196 | ''' 197 | norm = T.sqrt(sum([T.sum(g ** 2) for g,p in zip(grads, params) if p.name != 'Words' and p.get_value(borrow=True).ndim == 2 and p.name != 'label_embeddings'])) 198 | tmp_grads = [] 199 | for g,p in zip(grads, params): 200 | if p.name != 'Words' and (p.get_value(borrow=True).ndim == 2) and p.name != 'label_embeddings': 201 | tmp_grads.append(clip_norm(g, 5, norm)) 202 | else: 203 | tmp_grads.append(g) 204 | grads = tmp_grads 205 | ''' 206 | norm = T.sqrt(sum([T.sum(g ** as_floatX(2.)) for g in grads])) 207 | grads = [clip_norm(g, as_floatX(5.), norm) for g in grads] 208 | updates = [] 209 | for p, g in zip(params, grads): 210 | acc = theano.shared(p.get_value() * as_floatX(0.)) 211 | acc_new = as_floatX(rho) * acc + (as_floatX(1.) - as_floatX(rho)) * g ** as_floatX(2.) 
212 | gradient_scaling = T.sqrt(acc_new + as_floatX(epsilon)) 213 | g = g / gradient_scaling 214 | updates.append((acc, acc_new)) 215 | updates.append((p, p - as_floatX(lr) * g)) 216 | return updates 217 | 218 | 219 | def adagrad(cost, params, lr=0.001, eps=1e-8, sparse=False): 220 | lr = theano.shared(as_floatX(lr).astype("float32")) 221 | eps = as_floatX(eps).astype("float32") 222 | 223 | gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))+0.1) for param in params] 224 | #gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))) for param in params] 225 | xsums = [None for param in params] 226 | 227 | gparams = T.grad(cost, params) 228 | 229 | updates = OrderedDict() 230 | 231 | for gparam, param, gsum in zip(gparams, params, gsums): 232 | updates[gsum] = T.cast(gsum + (gparam ** as_floatX(2.)), "float32") 233 | updates[param] = T.cast(param - lr * (gparam / (T.sqrt(updates[gsum] + eps))), "float32") 234 | 235 | return updates, lr 236 | 237 | def sgd_updates_adadelta(params,cost,rho=0.95,epsilon=1e-6,norm_lim=9,word_vec_name='Words'): 238 | """ 239 | adadelta update rule, mostly from 240 | https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta) 241 | """ 242 | updates = OrderedDict({}) 243 | exp_sqr_grads = OrderedDict({}) 244 | exp_sqr_ups = OrderedDict({}) 245 | gparams = [] 246 | for param in params: 247 | empty = np.zeros_like(param.get_value()) 248 | exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),name="exp_grad_%s" % param.name) 249 | gp = T.grad(cost, param) 250 | exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name) 251 | gparams.append(gp) 252 | 253 | for param, gp in zip(params, gparams): 254 | exp_sg = exp_sqr_grads[param] 255 | exp_su = exp_sqr_ups[param] 256 | up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp) 257 | updates[exp_sg] = up_exp_sg 258 | step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp 259 | updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step) 260 | stepped_param = param + step 261 | updates[param] = stepped_param 262 | return updates 263 | -------------------------------------------------------------------------------- /load_data.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pickle 3 | from time import time 4 | import sys 5 | from collections import defaultdict 6 | import gensim 7 | import logging 8 | from gensim.models.keyedvectors import KeyedVectors 9 | import nltk 10 | nltk.download('averaged_perceptron_tagger') 11 | nltk.download('words') 12 | from nltk import conlltags2tree, tree2conlltags 13 | 14 | import numpy as np 15 | from sklearn.externals import joblib 16 | from sklearn.metrics import confusion_matrix 17 | from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score 18 | from sklearn.model_selection import train_test_split 19 | 20 | def dataRead(fname): 21 | print ("Input File Reading") 22 | fp = open(fname, 'r') 23 | #samples = fp.read().strip().split('\r\n\r\n') 24 | samples = fp.read().strip().split('\n\n') 25 | sent_lengths = [] #1-d array 26 | sent_contents = [] #2-d array [[w1,w2,....] ...] 27 | sent_lables = [] #1-d array 28 | entity1_list = [] #2-d array [[e1,e1_t] [e1,e1_t]...] 29 | entity2_list = [] #2-d array [[e1,e1_t] [e1,e1_t]...] 
30 | doc_ids = [] 31 | idents = [] 32 | for sample in samples: 33 | #sent, entities = sample.strip().split('\r\n') 34 | sent, entities = sample.strip().split('\n') 35 | doc_id, ident, e1, e2, relation = entities.split('\t') 36 | sent_contents.append(sent.lower()) 37 | entity1_list.append([e1, ident]) 38 | entity2_list.append([e2, ident]) 39 | sent_lables.append(relation) 40 | idents.append(ident) 41 | doc_ids.append(doc_id) 42 | 43 | return idents, sent_contents, entity1_list, entity2_list, sent_lables 44 | 45 | 46 | class LoadDataReturn(object): 47 | def __init__(self): 48 | self.pairs_idx = [] 49 | self.pos_idx = [] 50 | self.pairs_idx_rev = [] 51 | self.domain_labels = [] 52 | self.pos_e2_idx = [] 53 | self.pos_e1_idx = [] 54 | self.subj_labels = [] 55 | self.pred_labels = [] 56 | self.obj_labels = [] 57 | self.e1_ids = [] 58 | self.e2_ids = [] 59 | self.y = [] 60 | self.idents = [] 61 | 62 | 63 | class LoadData(object): 64 | def __init__(self, word2vec_file): 65 | self.word_index = {} 66 | self.pos_index = {} 67 | self.num_words = 1 68 | self.num_pos = 1 69 | self.embs = [np.zeros((300,))] 70 | self.pos = [np.zeros((32,))] 71 | logging.debug('Loading %s', word2vec_file) 72 | #self.wv = gensim.models.Word2Vec.load('/home/amri228/i2b2_2016/ddi/word_vecs2/gensim_model_pubmed') 73 | self.wv = KeyedVectors.load_word2vec_format(word2vec_file, binary=False) 74 | logging.debug('Done') 75 | self.max_u = self.wv.syn0.max() 76 | self.min_u = self.wv.syn0.min() 77 | 78 | def fit(self, filename, ids): 79 | all_data = dataRead(filename) 80 | word_cnts = {} 81 | pos_cnts = {} 82 | missing = set() 83 | for ident, tr, tl, e1, e2 in zip(all_data[0], all_data[1], all_data[-1], all_data[2], all_data[3]): 84 | if ident not in ids: 85 | continue 86 | final_string = tr.split() 87 | final_string_pos = nltk.pos_tag(final_string) 88 | #tree = self.tagger.parse(final_string_pos) 89 | #iob_tags = tree2conlltags(tree) 90 | final_e1_string = ['druga'] 91 | final_e2_string = ['drugb'] 92 | e1_pos = None 93 | e2_pos = None 94 | cnt = 0 95 | for w in final_string: 96 | if w == 'druga': 97 | e1_pos = cnt 98 | elif w == 'drugb': 99 | e2_pos = cnt 100 | cnt += 1 101 | tmp = [] 102 | final_e1_pos = [] 103 | final_e2_pos = [] 104 | cnt = 0 105 | error = False 106 | #print final_string 107 | for w in final_string: 108 | if cnt-e1_pos in pos_cnts: 109 | pos_cnts[cnt-e1_pos] += 1 110 | else: 111 | pos_cnts[cnt-e1_pos] = 1 112 | if cnt-e2_pos in pos_cnts: 113 | pos_cnts[cnt-e2_pos] += 1 114 | else: 115 | pos_cnts[cnt-e2_pos] = 1 116 | cnt += 1 117 | for w in final_string: 118 | if w in word_cnts: 119 | word_cnts[w] += 1 120 | else: 121 | word_cnts[w] = 1 122 | for w in final_e1_pos: 123 | if w in pos_cnts: 124 | pos_cnts[w] += 1 125 | else: 126 | pos_cnts[w] = 1 127 | for w in final_e2_pos: 128 | if w in pos_cnts: 129 | pos_cnts[w] += 1 130 | else: 131 | pos_cnts[w] = 1 132 | 133 | for w, cnt in word_cnts.iteritems(): 134 | if cnt > 5: 135 | if w in self.wv: 136 | self.embs.append(self.wv[w]) 137 | self.word_index[w] = self.num_words 138 | self.num_words += 1 139 | else: 140 | missing.add(w) 141 | self.embs.append(np.random.uniform(-1., 1., (300,))) 142 | self.word_index[w] = self.num_words 143 | self.num_words += 1 144 | for w, cnt in pos_cnts.iteritems(): 145 | if cnt > 5: 146 | self.pos.append(np.random.uniform(-1., 1., (32,))) 147 | self.pos_index[w] = self.num_pos 148 | self.num_pos += 1 149 | 150 | self.pos_index['NegUNK'] = self.num_pos 151 | self.num_pos += 1 152 | self.pos.append(np.random.uniform(-1., 1., (32,))) 153 
| self.pos_index['PosUNK'] = self.num_pos 154 | self.num_pos += 1 155 | self.pos.append(np.random.uniform(-1., 1., (32,))) 156 | 157 | self.word_index['UNK'] = self.num_words 158 | self.embs.append(np.random.uniform(-1., 1., (300,))) 159 | self.num_words += 1 160 | 161 | del self.wv 162 | self.embs = np.array(self.embs, dtype='float32') 163 | self.pos = np.array(self.pos, dtype='float32') 164 | return 165 | 166 | def transform(self, filename, ids): 167 | all_data = dataRead(filename) 168 | pairs_idx = [] 169 | pos_idx = [] 170 | pairs_idx_rev = [] 171 | domain_labels = [] 172 | pos_e2_idx = [] 173 | pos_e1_idx = [] 174 | subj_labels = [] 175 | pred_labels = [] 176 | obj_labels = [] 177 | e1_ids = [] 178 | e2_ids = [] 179 | y = [] 180 | idents = [] 181 | for ident, tr, tl, e1, e2 in zip(all_data[0], all_data[1], 182 | all_data[-1], all_data[2], all_data[3]): 183 | if ident not in ids: 184 | continue 185 | final_string = tr.split() 186 | final_string_pos = nltk.pos_tag(final_string) 187 | #tree = self.tagger.parse(final_string_pos) 188 | #iob_tags = tree2conlltags(tree) 189 | final_e1_string = ['druga'] 190 | final_e2_string = ['drugb'] 191 | e1_pos = None 192 | e2_pos = None 193 | cnt = 0 194 | for w in final_string: 195 | if w == 'druga': 196 | e1_pos = cnt 197 | elif w == 'drugb': 198 | e2_pos = cnt 199 | cnt += 1 200 | if e1_pos is None or e2_pos is None: 201 | continue 202 | tmp = [] 203 | final_e1_pos = [] 204 | final_e2_pos = [] 205 | cnt = 0 206 | tmp_subj = [] 207 | tmp_pred = [] 208 | tmp_obj = [] 209 | for w in final_string: 210 | final_e1_pos.append(cnt - e1_pos) 211 | final_e2_pos.append(cnt - e2_pos) 212 | cnt += 1 213 | idents.append(ident) 214 | e1_ids.append(e1[0]) 215 | e2_ids.append(e2[0]) 216 | y.append(tl) 217 | fstring = [] 218 | for w in final_string: 219 | fstring.append(w) 220 | final_string = fstring 221 | str_idx = [] 222 | for w in final_string: 223 | if w in self.word_index: 224 | str_idx.append(self.word_index[w]) 225 | else: 226 | str_idx.append(self.word_index['UNK']) 227 | pairs_idx.append(str_idx) 228 | e1_idx = [] 229 | for p in final_e1_pos: 230 | if p in self.pos_index: 231 | e1_idx.append(self.pos_index[p]) 232 | else: 233 | if p < 0: 234 | e1_idx.append(self.pos_index['NegUNK']) 235 | else: 236 | e1_idx.append(self.pos_index['PosUNK']) 237 | pos_e1_idx.append(e1_idx) 238 | e2_idx = [] 239 | for p in final_e2_pos: 240 | if p in self.pos_index: 241 | e2_idx.append(self.pos_index[p]) 242 | else: 243 | if p < 0: 244 | e2_idx.append(self.pos_index['NegUNK']) 245 | else: 246 | e2_idx.append(self.pos_index['PosUNK']) 247 | pos_e2_idx.append(e2_idx) 248 | 249 | lab_lookup = {'OTHER':0, 'CLASS1':1} 250 | self.lab_lookup_rev = {0:'OTHER', 1:'CLASS1'} 251 | final_y = np.array([np.int32(lab_lookup[x]) for x in y]) 252 | 253 | return pairs_idx, pos_e1_idx, pos_e2_idx, final_y, subj_labels, pred_labels, obj_labels, idents, e1_ids, e2_ids 254 | 255 | def fit_transform(self, filename, ids): 256 | self.fit(filename, ids) 257 | return self.transform(filename, ids) 258 | 259 | def pad_data(self, data, max_len = None): 260 | max_len = np.max([len(x) for x in data]) 261 | padded_dataset = [] 262 | for example in data: 263 | try: 264 | zeros = [0]*(max_len-len(example)) 265 | padded_dataset.append(example+zeros) 266 | except: 267 | logging.exception('%s %s %s', max_len, len(example), example) 268 | exit(1) 269 | if max_len is None: 270 | return np.array(padded_dataset) 271 | else: 272 | return np.array(padded_dataset)[:,:max_len] 273 | 
--------------------------------------------------------------------------------