├── models ├── __init__.py ├── bilstm.py └── utils.py ├── .gitignore ├── data ├── train_dev_test │   ├── test_ids.txt │   ├── dev_ids.txt │   └── train_ids.txt └── example_dataset.txt ├── enviroment2.yml ├── .theanorc ├── pred.py ├── README.md ├── train.py └── load_data.py /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.swp 3 | -------------------------------------------------------------------------------- /data/train_dev_test/test_ids.txt: -------------------------------------------------------------------------------- 1 | 23293962 2 | 7678677 3 | -------------------------------------------------------------------------------- /data/train_dev_test/dev_ids.txt: -------------------------------------------------------------------------------- 1 | 14967461 2 | 16357751 3 | 23293962 4 | -------------------------------------------------------------------------------- /data/train_dev_test/train_ids.txt: -------------------------------------------------------------------------------- 1 | 11716850 2 | 16437532 3 | 16554356 4 | 16789740 5 | -------------------------------------------------------------------------------- /enviroment2.yml: -------------------------------------------------------------------------------- 1 | name: bilstm-relation-classification 2 | dependencies: 3 | - python=2.7 4 | - theano 5 | - numpy 6 | - scikit-learn 7 | - gensim 8 | - docopt 9 | - nltk 10 | -------------------------------------------------------------------------------- /.theanorc: -------------------------------------------------------------------------------- 1 | [dnn] 2 | include_path=/export/home/CUDA/include/ 3 | library_path=/export/home/CUDA/lib64/ 4 | 5 | [global] 6 | device = cuda0 7 | floatX = float32 8 | mode = FAST_RUN 9 | allow_gc=True 10 | 11 | [scan] 12 | allow_gc=True 13 | -------------------------------------------------------------------------------- /pred.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | pred.py [options] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --word2vec= word vectors in gensim format 8 | --dataset= dataset (see data folder for example) 9 | --test_ids= ids of test examples (see data folder for example) 10 | --model= filename of the saved model to load 11 | --mini_batch_size= Minibatch size [default: 32] 12 | --num_classes= Total number of classes for training [default: 5] 13 | --lstm_hidden_state= lstm hidden state size [default: 256] 14 | --random_seed= random seed [default: 42] 15 | 16 | """ 17 | 18 | import logging 19 | import pickle 20 | import random 21 | import sys 22 | from models.bilstm import BiLSTM 23 | import docopt 24 | import numpy as np 25 | from sklearn.metrics import f1_score 26 | 27 | 28 | def main(argv): 29 | argv = docopt.docopt(__doc__, argv=argv) 30 | 31 | random_seed = int(argv['--random_seed']) 32 | np.random.seed(random_seed) 33 | random.seed(random_seed) 34 | 35 | mini_batch_size = int(argv['--mini_batch_size']) 36 | 37 | def read_ids(file): 38 | ids = [] 39 | with open(file, 'r') as fp: 40 | for row in fp: 41 | ids.append(row.strip()) 42 | return ids 43 | 44 | test_ids = read_ids(argv['--test_ids']) 45 | 46 | with open(argv['--model']) as fp: 47 | tmp = pickle.load(fp) 48 | 49 | ld = tmp['token'] 50 | mod = BiLSTM(ld.embs, ld.pos, 
nc=int(argv['--num_classes']), nh=int(argv['--lstm_hidden_state']), de=ld.embs.shape[1]) 51 | mod.__setstate__(tmp['model_params']) 52 | 53 | pairs_idx, pos_e1_idx, pos_e2_idx, y, _, _, _, _, _, _ = ld.transform(argv['--dataset'], test_ids) 54 | 55 | test_idxs = list(range(len(pairs_idx))) 56 | 57 | all_test_preds = [] 58 | scores = [] 59 | for start, end in zip(range(0, len(test_idxs), mini_batch_size), 60 | range(mini_batch_size, len(test_idxs) + mini_batch_size, 61 | mini_batch_size)): 62 | if len(test_idxs[start:end]) == 0: 63 | continue 64 | tpairs = ld.pad_data([pairs_idx[i] for i in test_idxs[start:end]]) 65 | te1 = ld.pad_data([pos_e1_idx[i] for i in test_idxs[start:end]]) 66 | te2 = ld.pad_data([pos_e2_idx[i] for i in test_idxs[start:end]]) 67 | preds = mod.predict_proba(tpairs, te1, te2, np.float32(1.)) 68 | 69 | for x in preds: 70 | all_test_preds.append(x.argmax()) 71 | 72 | test_f1 = f1_score(y, all_test_preds, average='micro') 73 | print("test_f1: %.4f" % (test_f1)) 74 | sys.stdout.flush() 75 | 76 | if __name__ == '__main__': 77 | logging.basicConfig(level=logging.DEBUG) 78 | main(sys.argv[1:]) 79 | -------------------------------------------------------------------------------- /data/example_dataset.txt: -------------------------------------------------------------------------------- 1 | Recent studies have provided consistent evidence that treatment with abatacept results in a rapid onset of efficacy that is maintained over the course of treatment in patients with inadequate response to DRUGB and anti- DRUGA therapies . 2 | 16357751.s1 16357751 T1 T2 OTHER 3 | 4 | DRUGA inhibitors currently under investigation include the small molecules DRUGB ( Iressa , ZDdgdgdgdg ) and erlotinib ( Tarceva , OSI-dgdgdg ) , as well as monoclonal antibodies such as cetuximab ( IMC-dgdgdg , Erbitux ) . 5 | 14967461.s1 14967461 T22 T1 CLASS1 6 | 7 | Taken together , the results of the present study have characterized DRUGA as an inhibitor of matriptase-dg that modulates the synthesis of hepcidin and provides new insights into the regulatory mechanism of DRUGB homoeostasis , with clinical importance for a treatment of iron overload diseases . 8 | 23293962.s1 23293962 T5 T1 OTHER 9 | 10 | Taken together , the results of the present study have characterized HAI-dg as an inhibitor of matriptase-dg that modulates the synthesis of DRUGA and provides new insights into the regulatory mechanism of iron homoeostasis , with clinical importance for a treatment of DRUGB overload diseases . 11 | 23293962.s1 23293962 T7 T2 OTHER 12 | 13 | DRUGB and bromoacetylalprenololmenthane are competitive slowly reversible antagonists at the DRUGA of rat left atria . 14 | 7678677.s1 7678677 T14 T19 CLASS1 15 | 16 | Alprenolol and DRUGB are competitive slowly reversible antagonists at the DRUGA of rat left atria . 17 | 7678677.s1 7678677 T15 T19 CLASS1 18 | 19 | DRUGA was chemically bound via linkers to DRUGB -loaded HSA-NP . 20 | 16554356.s1 16554356 T10 T3 OTHER 21 | 22 | Apolipoprotein E was chemically bound via linkers to DRUGB -loaded DRUGA -NP . 23 | 16554356.s1 16554356 T3 T11 OTHER 24 | 25 | Discovery and optimization of DRUGB as inhibitors of methionine aminopeptidase-dg : a structural basis for the reduction of DRUGA binding . 26 | 16789740.s1 16789740 T4 T13 CLASS1 27 | 28 | Discovery and optimization of DRUGB as inhibitors of DRUGA : a structural basis for the reduction of albumin binding . 
29 | 16789740.s1 16789740 T4 T14 OTHER 30 | 31 | BACKGROUND : Since the introduction of the first DRUGA inhibitor ( ChEI ) in dgdgdgdg , most clinicians and probably most patients would consider the cholinergic drugs , DRUGB , galantamine and rivastigmine , to be the first line pharmacotherapy for mild to moderate Alzheimer 's disease.The drugs have slightly different pharmacological properties , but they all work by inhibiting the breakdown of acetylcholine , an important neurotransmitter associated with memory , by blocking the enzyme acetylcholinesterase . 32 | 16437532.s1 16437532 T39 T11 CLASS1 33 | 34 | Mitiglinide ( DRUGB ) , a new anti-diabetic drug , is thought to stimulate insulin secretion by closing the DRUGA in pancreatic beta-cells . 35 | 11716850.s1 11716850 T15 T42 CLASS1 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bi-LSTM Relation Extraction Model 2 | 3 | Implementation of a word-level Bi-LSTM relation extraction model (Kavuluru et al., 2017). 4 | 5 | ## Required Packages 6 | - Python 2.7 7 | - numpy 1.11.1+ 8 | - scipy 0.18.0+ 9 | - Theano 10 | - gensim 11 | - scikit-learn 12 | - docopt 13 | - nltk 14 | 15 | ## Usage 16 | 17 | ### Data Format 18 | 19 | We use a custom data format as input to our model. Specifically, each example consists of two lines. The first line contains the sentence, in which the two entities **must** be marked as DRUGA and DRUGB, respectively. We use the DRUGA and DRUGB convention because our work focused on extracting drug-drug interactions. These entity markers are required because they are used to compute the position vectors for each word in the sentence relative to each entity. The second line should contain the sentence id, document id, DRUGA id, DRUGB id, and the associated class for that instance, separated by tabs. Finally, each example must be separated by a blank line. 20 | 21 | ``` 22 | Sentence start DRUGA sentence middle DRUGB sentence end . 23 | sentence_id\tdoc_id\tdruga_id\tdrugb_id\tclass 24 | 25 | Sentence start DRUGA sentence middle DRUGB sentence end . 26 | sentence_id\tdoc_id\tdruga_id\tdrugb_id\tclass 27 | ``` 28 | 29 | Example data is available in the data folder. 30 | 31 | **Note**: Depending on the classes in your dataset, lines 249 and 250 in load_data.py must be changed to include them (see the sketch below). 
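For example, if your dataset uses additional relation labels, those two lines might be extended along the following lines. This is only a sketch: `CLASS2` and `CLASS3` are placeholder names standing in for whatever labels actually appear in your data, and `--num_classes` must be set to the matching number of classes when training.

```
# load_data.py, lines 249-250 (placeholder label names shown for illustration)
lab_lookup = {'OTHER': 0, 'CLASS1': 1, 'CLASS2': 2, 'CLASS3': 3}
self.lab_lookup_rev = {0: 'OTHER', 1: 'CLASS1', 2: 'CLASS2', 3: 'CLASS3'}
```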
32 | 33 | ### Training 34 | 35 | ``` 36 | python train.py --word2vec=/path/to/word2vecfile.pkl --dataset=./data/example_dataset.txt --train_ids=./data/train_dev_test/train_ids.txt --dev_ids=./data/train_dev_test/dev_ids.txt --model=/path/to/save/model_name 37 | ``` 38 | 39 | ``` 40 | Usage: 41 | train.py [options] 42 | 43 | Options: 44 | -h --help show this help message and exit 45 | --word2vec= word vectors in gensim format 46 | --dataset= dataset (see data folder for example) 47 | --train_ids= ids of training examples (see data folder for example) 48 | --dev_ids= ids of dev examples (see data folder for example) 49 | --model= filename to use to save model 50 | --num_epochs= Max number of epochs [default: 25] 51 | --mini_batch_size= Minibatch size [default: 32] 52 | --num_classes= Total number of classes for training [default: 5] 53 | --lstm_hidden_state= lstm hidden state size [default: 256] 54 | --random_seed= random seed [default: 42] 55 | ``` 56 | 57 | ### Testing 58 | 59 | **Note**: The current test code is mainly for evaluation purposes; it reports the micro-averaged F1 score over the supplied test ids. An example invocation is given at the end of this README. 60 | 61 | ``` 62 | Usage: 63 | pred.py [options] 64 | 65 | Options: 66 | -h --help show this help message and exit 67 | --word2vec= word vectors in gensim format 68 | --dataset= dataset (see data folder for example) 69 | --test_ids= ids of test examples (see data folder for example) 70 | --model= filename of the saved model to load 71 | --mini_batch_size= Minibatch size [default: 32] 72 | --num_classes= Total number of classes for training [default: 5] 73 | --lstm_hidden_state= lstm hidden state size [default: 256] 74 | --random_seed= random seed [default: 42] 75 | ``` 76 | 77 | ## Acknowledgements 78 | 79 | > Ramakanth Kavuluru, Anthony Rios, and Tung Tran. "[Extracting Drug-Drug Interactions with Word and Character-Level Recurrent Neural Networks.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5639883/)" In Healthcare Informatics (ICHI), 2017 IEEE International Conference on, pp. 5-12. IEEE, 2017. 80 | 81 | ``` 82 | @inproceedings{kavuluru2017extracting, 83 | title={Extracting Drug-Drug Interactions with Word and Character-Level Recurrent Neural Networks}, 84 | author={Kavuluru, Ramakanth and Rios, Anthony and Tran, Tung}, 85 | booktitle={Healthcare Informatics (ICHI), 2017 IEEE International Conference on}, 86 | pages={5--12}, 87 | year={2017}, 88 | organization={IEEE} 89 | } 90 | ``` 91 | 92 | For the character-level counterpart to this model, see this [repo](https://github.com/bionlproc/relation-extraction-char-rnn) by Tung Tran. 
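As referenced in the Testing section above, a prediction run mirrors the training command; a hypothetical invocation is shown below (the paths are placeholders, and `--num_classes` / `--lstm_hidden_state` should match the values used when the model was trained):

```
python pred.py --word2vec=/path/to/word2vecfile.pkl --dataset=./data/example_dataset.txt --test_ids=./data/train_dev_test/test_ids.txt --model=/path/to/save/model_name
```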
93 | 94 | Written by Anthony Rios (anthonymrios at gmail dot com) 95 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | train.py [options] 4 | 5 | Options: 6 | -h --help show this help message and exit 7 | --word2vec= word vectors in gensim format 8 | --dataset= dataset (see data folder for example) 9 | --train_ids= ids of training examples (see data folder for example) 10 | --dev_ids= ids of dev exapmles (see data folder for example) 11 | --model= filename to use to save model 12 | --num_epochs= Max number of epochs [default: 25] 13 | --mini_batch_size= Minibatch size [default: 32] 14 | --num_classes= Total number of classes for training [default: 5] 15 | --lstm_hidden_state= lstm hidden state size [default: 256] 16 | --random_seed= random seed [default: 42] 17 | 18 | """ 19 | 20 | import random 21 | import sys 22 | import logging 23 | 24 | import docopt 25 | import numpy as np 26 | from sklearn.metrics import f1_score 27 | 28 | from models.bilstm import BiLSTM 29 | from load_data import LoadData 30 | import pickle 31 | 32 | 33 | def main(argv): 34 | argv = docopt.docopt(__doc__) 35 | 36 | num_epochs = int(argv['--num_epochs']) 37 | mini_batch_size = int(argv['--mini_batch_size']) 38 | val_mini_batch_size = 64 39 | num_classes = int(argv['--num_classes']) 40 | lstm_hidden_state_size = int(argv['--lstm_hidden_state']) 41 | random_seed = argv['--random_seed'] 42 | 43 | np.random.seed(int(random_seed)) 44 | random.seed(int(random_seed)) 45 | 46 | def read_ids(filename): 47 | ids = [] 48 | with open(filename, 'r') as fp: 49 | for row in fp: 50 | ids.append(row.strip()) 51 | return ids 52 | 53 | train_ids = read_ids(argv['--train_ids']) 54 | val_ids = read_ids(argv['--dev_ids']) 55 | 56 | ld = LoadData(argv['--word2vec']) 57 | 58 | train_pairs, train_e1, train_e2, train_y, _, _, _, train_ids, _, _ = ld.fit_transform(argv['--dataset'], train_ids) 59 | dev_pairs, dev_e1, dev_e2, dev_y, _, _, _, val_ids, dev_e1_ids,dev_e2_ids = ld.transform(argv['--dataset'], val_ids) 60 | 61 | idxs = list(range(len(train_pairs))) 62 | dev_idxs = list(range(len(dev_pairs))) 63 | 64 | last_loss = None 65 | avg_loss = [] 66 | avg_f1 = [] 67 | check_preds = None 68 | mod = BiLSTM(ld.embs, ld.pos, nc=int(num_classes), nh=int(lstm_hidden_state_size), de=ld.embs.shape[1]) 69 | best_dev_f1 = 0 70 | for epoch in range(1, int(num_epochs)+1): 71 | mean_loss = [] 72 | random.shuffle(idxs) 73 | for start, end in zip(range(0, len(idxs), mini_batch_size), range(mini_batch_size, len(idxs)+mini_batch_size, 74 | mini_batch_size)): 75 | idxs_sample = idxs[start:end] 76 | batch_labels = np.array(train_y[idxs_sample], dtype='int32') 77 | tpairs = ld.pad_data([train_pairs[i] for i in idxs_sample]) 78 | te1 = ld.pad_data([train_e1[i] for i in idxs_sample]) 79 | te2 = ld.pad_data([train_e2[i] for i in idxs_sample]) 80 | cost = mod.train_batch(tpairs, te1, te2, train_y[idxs_sample].astype('int32'), np.float32(0.)) 81 | mean_loss.append(cost) 82 | print("EPOCH: %d loss: %.4f train_loss: %.4f" % (epoch, cost, np.mean(mean_loss))) 83 | sys.stdout.flush() 84 | 85 | all_dev_preds = [] 86 | scores = [] 87 | for start, end in zip(range(0, len(dev_idxs), val_mini_batch_size), range(val_mini_batch_size, len(dev_idxs)+val_mini_batch_size, 88 | val_mini_batch_size)): 89 | if len(dev_idxs[start:end]) == 0: 90 | continue 91 | vpairs = ld.pad_data([dev_pairs[i] for i in dev_idxs[start:end]]) 92 | ve1 = 
ld.pad_data([dev_e1[i] for i in dev_idxs[start:end]]) 93 | ve2 = ld.pad_data([dev_e2[i] for i in dev_idxs[start:end]]) 94 | preds = mod.predict_proba(vpairs, ve1, ve2, np.float32(1.)) 95 | for x in preds: 96 | all_dev_preds.append(x.argmax()) 97 | 98 | dev_f1 = f1_score(dev_y, all_dev_preds, average='micro') 99 | print("EPOCH: %d train_loss: %.4f dev_f1: %.4f" % (epoch, np.mean(mean_loss), dev_f1)) 100 | sys.stdout.flush() 101 | 102 | if dev_f1 > best_dev_f1: 103 | with open(argv['--model'], 'w') as fp: 104 | pickle.dump({'model_params':mod.__getstate__(), 'token':ld}, fp, pickle.HIGHEST_PROTOCOL) 105 | best_dev_f1 = dev_f1 106 | 107 | if __name__ == '__main__': 108 | logging.basicConfig(level=logging.DEBUG) 109 | main(sys.argv[1:]) 110 | -------------------------------------------------------------------------------- /models/bilstm.py: -------------------------------------------------------------------------------- 1 | from theano.tensor.shared_randomstreams import RandomStreams 2 | 3 | srng2 = RandomStreams(seed=234) 4 | 5 | from .utils import * 6 | 7 | 8 | class BiLSTM(object): 9 | def __init__(self, emb, pos, nh=256, nc=2, de=100, p_drop=0.5): 10 | """ 11 | Args: 12 | emb: Embedding Matrix 13 | pos: position matrix 14 | nh: hidden layer size 15 | nc: Number of classes 16 | # de: Dimensionality of word embeddings 17 | p_drop :: Dropout probability 18 | """ 19 | 20 | def recurrence(xi, mask, h_tm1, c_tm1, 21 | W_i, U_i, b_i, W_c, U_c, b_c, W_f, U_f, b_f, W_o2, U_o, b_o2, 22 | mask_in, mask_rec): 23 | x = xi * T.neq(mask, 0).dimshuffle(0, 'x') 24 | x = dropout_scan(x, mask_in, dropout_switch, 0.2) 25 | 26 | x_i = T.dot(x, W_i) + b_i 27 | x_i = x_i * T.neq(mask, 0).dimshuffle(0, 'x') 28 | 29 | x_f = T.dot(x, W_f) + b_f 30 | x_f = x_f * T.neq(mask, 0).dimshuffle(0, 'x') 31 | 32 | x_c = T.dot(x, W_c) + b_c 33 | x_c = x_c * T.neq(mask, 0).dimshuffle(0, 'x') 34 | 35 | x_o = T.dot(x, W_o2) + b_o2 36 | x_o = x_o * T.neq(mask, 0).dimshuffle(0, 'x') 37 | 38 | h_tm1 = h_tm1 * T.neq(mask, 0).dimshuffle(0, 'x') 39 | h_tm1 = dropout_scan(h_tm1, mask_rec, dropout_switch, 0.2) 40 | 41 | i = hard_sigmoid(x_i + T.dot(h_tm1, U_i)) 42 | f = hard_sigmoid(x_f + T.dot(h_tm1, U_f)) 43 | c = f * c_tm1 + i * T.tanh(x_c + T.dot(h_tm1, U_c)) 44 | o = hard_sigmoid(x_o + T.dot(h_tm1, U_o)) 45 | h = o * T.tanh(c) 46 | return [h, c] 47 | 48 | # Source Embeddings 49 | self.emb = theano.shared(name='Words', value=emb.astype('float32')) 50 | 51 | self.pos = theano.shared(name='Pos', value=pos.astype('float32')) 52 | 53 | # Source Output Weights 54 | self.w_o = theano.shared(name='w_o', value=he_normal((nh + nh, nc)).astype('float32')) 55 | self.b_o = theano.shared(name='b_o', value=np.zeros((nc,)).astype('float32')) 56 | 57 | # input 58 | idxs = T.matrix() 59 | e1_pos_idxs = T.matrix() 60 | e2_pos_idxs = T.matrix() 61 | Y = T.ivector() 62 | dropout_switch = T.scalar() 63 | 64 | # get word embeddings based on indicies 65 | x_word = self.emb[T.cast(idxs, 'int32')] 66 | x_e1_pos = self.pos[T.cast(e1_pos_idxs, 'int32')] 67 | x_e2_pos = self.pos[T.cast(e2_pos_idxs, 'int32')] 68 | x_word = T.concatenate([x_word, x_e1_pos, x_e2_pos], axis=2) 69 | mask = T.neq(idxs, 0) * 1 70 | x_word = x_word * mask.dimshuffle(0, 1, 'x') 71 | 72 | de = emb.shape[1] + 2 * pos.shape[1] 73 | 74 | fwd_params, bck_params = bilstm_weights(de, nh) 75 | 76 | # Update these parameters 77 | self.params = [self.w_o, self.b_o, self.emb, self.pos] 78 | self.params += fwd_params + bck_params 79 | 80 | self.h0 = theano.shared(name='h0', value=np.zeros((nh,), 
dtype="float32")) 81 | 82 | maskd1 = srng.binomial((x_word.shape[0], x_word.shape[-1]), p=0.8, dtype='float32') 83 | maskd2 = srng.binomial((x_word.shape[0], nh), p=0.8, dtype='float32') 84 | [h_fwd, _], u = theano.scan(fn=recurrence, 85 | sequences=[x_word.dimshuffle(1, 0, 2), idxs.dimshuffle(1, 0)], 86 | non_sequences=fwd_params + [maskd1, maskd2], 87 | outputs_info=[T.alloc(self.h0, x_word.shape[0], nh), 88 | T.alloc(self.h0, x_word.shape[0], nh)], 89 | n_steps=x_word.shape[1], 90 | strict=True) 91 | 92 | maskd3 = srng.binomial((x_word.shape[0], x_word.shape[-1]), p=0.8, dtype='float32') 93 | maskd4 = srng.binomial((x_word.shape[0], nh), p=0.8, dtype='float32') 94 | [h_bck, _], u = theano.scan(fn=recurrence, 95 | sequences=[x_word.dimshuffle(1, 0, 2)[::-1, :, :], idxs.dimshuffle(1, 0)[::-1, :]], 96 | non_sequences=bck_params + [maskd3, maskd4], 97 | outputs_info=[T.alloc(self.h0, x_word.shape[0], nh), 98 | T.alloc(self.h0, x_word.shape[0], nh)], 99 | n_steps=x_word.shape[1], 100 | strict=True) 101 | 102 | h_bck = h_bck[::-1, :, :].dimshuffle(1, 0, 2) 103 | h_fwd = h_fwd.dimshuffle(1, 0, 2) 104 | h_priv = T.concatenate([h_fwd, h_bck], axis=2) 105 | h = h_priv.max(axis=1) 106 | h = dropout(h, dropout_switch, 0.2) 107 | 108 | Y_neg = T.ivector() 109 | pyx = T.nnet.nnet.softmax(T.dot(h, self.w_o) + self.b_o.dimshuffle('x', 0)) 110 | pyx = T.clip(pyx, 1e-5, 1 - 1e-5) 111 | L = -T.mean(T.log(pyx)[T.arange(Y.shape[0]), Y]) + 1e-6 * sum([(x ** 2).sum() for x in self.params]) 112 | 113 | updates, _ = Adam(L, self.params, lr2=0.001) 114 | 115 | self.train_batch = theano.function([idxs, e1_pos_idxs, e2_pos_idxs, \ 116 | Y, dropout_switch], 117 | L, updates=updates, allow_input_downcast=True, on_unused_input='ignore') 118 | self.predict_proba = theano.function([idxs, e1_pos_idxs, e2_pos_idxs, dropout_switch], \ 119 | pyx, allow_input_downcast=True, on_unused_input='ignore') 120 | 121 | def __getstate__(self): 122 | values = [x.get_value() for x in self.params] 123 | return values 124 | 125 | def __setstate__(self, weights): 126 | for x, w in zip(self.params, weights): 127 | x.set_value(w) 128 | 129 | 130 | def bilstm_weights(de, nh): 131 | """ 132 | 133 | Args: 134 | de: Dimensionality of word embeddings 135 | nh: Hidden layer dimensionality 136 | 137 | Returns: 138 | forward weights, backward weights 139 | """ 140 | # forward Bi-LSTM Weights 141 | Wf_i = theano.shared(name='wf_i', value=he_normal((de, nh)).astype("float32")) 142 | Uf_i = theano.shared(name='uf_i', value=he_normal((nh, nh)).astype("float32")) 143 | bf_i = theano.shared(name='bf_i', value=np.zeros((nh,), dtype="float32")) 144 | 145 | Wf_f = theano.shared(name='wf_f', value=he_normal((de, nh)).astype("float32")) 146 | Uf_f = theano.shared(name='uf_f', value=orthogonal_tmp((nh, nh)).astype("float32")) 147 | bf_f = theano.shared(name='bf_f', value=np.ones((nh,), dtype="float32")) 148 | 149 | Wf_c = theano.shared(name='wf_c', value=he_normal((de, nh)).astype("float32")) 150 | Uf_c = theano.shared(name='uf_c', value=orthogonal_tmp((nh, nh)).astype("float32")) 151 | bf_c = theano.shared(name='bf_c', value=np.zeros((nh), dtype="float32")) 152 | 153 | Wf_o2 = theano.shared(name='wfoo', value=he_normal((de, nh)).astype("float32")) 154 | Uf_o = theano.shared(name='ufoo', value=orthogonal_tmp((nh, nh)).astype("float32")) 155 | bf_o2 = theano.shared(name='bfoo', value=np.zeros((nh,), dtype="float32")) 156 | 157 | # backward Bi-LSTM Weights 158 | Wb_i = theano.shared(name='wb_i', value=he_normal((de, nh)).astype("float32")) 159 | Ub_i = 
theano.shared(name='ub_i', value=orthogonal_tmp((nh, nh)).astype("float32")) 160 | bb_i = theano.shared(name='bb_i', value=np.zeros((nh,), dtype="float32")) 161 | 162 | Wb_f = theano.shared(name='wb_f', value=he_normal((de, nh)).astype("float32")) 163 | Ub_f = theano.shared(name='ub_f', value=orthogonal_tmp((nh, nh)).astype("float32")) 164 | bb_f = theano.shared(name='bb_f', value=np.ones((nh), dtype="float32")) 165 | 166 | Wb_c = theano.shared(name='wb_c', value=he_normal((de, nh)).astype("float32")) 167 | Ub_c = theano.shared(name='ub_c', value=orthogonal_tmp((nh, nh)).astype("float32")) 168 | bb_c = theano.shared(name='bb_c', value=np.zeros((nh), dtype="float32")) 169 | 170 | Wb_o2 = theano.shared(name='wboo', value=he_normal((de, nh)).astype("float32")) 171 | Ub_o = theano.shared(name='uboo', value=orthogonal_tmp((nh, nh)).astype("float32")) 172 | bb_o2 = theano.shared(name='bboo', value=np.zeros((nh), dtype="float32")) 173 | 174 | params_forward = [Wb_i, Ub_i, bb_i, 175 | Wb_c, Ub_c, bb_c, 176 | Wb_f, Ub_f, bb_f, 177 | Wb_o2, Ub_o, bb_o2] 178 | 179 | params_backward = [Wf_i, Uf_i, bf_i, 180 | Wf_c, Uf_c, bf_c, 181 | Wf_f, Uf_f, bf_f, 182 | Wf_o2, Uf_o, bf_o2] 183 | 184 | return params_forward, params_backward 185 | -------------------------------------------------------------------------------- /models/utils.py: -------------------------------------------------------------------------------- 1 | from theano import tensor as T 2 | #import theano.sandbox.cuda 3 | from collections import OrderedDict 4 | from theano.ifelse import ifelse 5 | import theano 6 | from theano import config 7 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 8 | import numpy as np 9 | from theano.tensor.nnet.conv import conv2d 10 | 11 | np.random.seed(1234) 12 | rng = np.random.RandomState(1234) 13 | srng = RandomStreams(rng.randint(54321)) 14 | 15 | class ReverseGradient(theano.Op): 16 | """ theano operation to reverse the gradients 17 | Introduced in http://arxiv.org/pdf/1409.7495.pdf 18 | """ 19 | 20 | view_map = {0: [0]} 21 | 22 | __props__ = ('hp_lambda', ) 23 | 24 | def __init__(self, hp_lambda): 25 | super(ReverseGradient, self).__init__() 26 | self.hp_lambda = hp_lambda 27 | 28 | def make_node(self, x): 29 | assert hasattr(self, '_props'), "Your version of theano is too old to support __props__." 30 | x = theano.tensor.as_tensor_variable(x) 31 | return theano.Apply(self, [x], [x.type()]) 32 | 33 | def perform(self, node, inputs, output_storage): 34 | xin, = inputs 35 | xout, = output_storage 36 | xout[0] = xin 37 | 38 | def grad(self, input, output_gradients): 39 | return [-self.hp_lambda * output_gradients[0]] 40 | 41 | def infer_shape(self, node, i0_shapes): 42 | return i0_shapes 43 | 44 | def hard_sigmoid(x): 45 | return T.nnet.hard_sigmoid(x) 46 | 47 | def log_softmax(x): 48 | xdev = x - x.max(1, keepdims=True) 49 | return xdev - T.log(T.sum(T.exp(xdev), axis=1, keepdims=True)) 50 | 51 | def categorical_crossentropy_logdomain(log_predictions, targets): 52 | return -T.mean(targets * log_predictions, axis=1) 53 | 54 | def normal(shape, scale=0.05): 55 | return np.random.normal(0, scale, size=shape).astype('float32') 56 | 57 | def get_fans(shape): 58 | fan_in = shape[0] if len(shape) == 2 else np.prod(shape[1:]) 59 | fan_out = shape[1] if len(shape) == 2 else shape[0] 60 | return fan_in, fan_out 61 | 62 | def orthogonal(shape): 63 | ''' Reference: Glorot & Bengio, AISTATS 2010 glorot_normal 64 | ''' 65 | fan_in, fan_out = get_fans(shape) 66 | s = np.sqrt(2. 
/ (fan_in * fan_out)) 67 | return normal(shape, s) 68 | 69 | def he_normal(shape): 70 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852 71 | ''' 72 | fan_in, fan_out = get_fans(shape) 73 | s = np.sqrt(2. / fan_in) 74 | return normal(shape, s) 75 | 76 | def glorot_uniform(shape): 77 | fan_in, fan_out = get_fans(shape) 78 | s = np.sqrt(6. / (fan_in + fan_out)) 79 | return uniform(shape, s) 80 | 81 | def orthogonal_tmp2(shape): 82 | fan_in, fan_out = get_fans(shape) 83 | s = np.sqrt(6. / (fan_in + fan_out)) 84 | return uniform(shape, s) 85 | 86 | def uniform(shape, scale=0.05): 87 | return np.random.uniform(low=-scale, high=scale, size=shape).astype('float32') 88 | 89 | def orthogonal_tmp(shape, scale=1.0): 90 | ''' From Lasagne. Reference: Saxe et al., http://arxiv.org/abs/1312.6120 91 | ''' 92 | flat_shape = (shape[0], np.prod(shape[1:])) 93 | a = np.random.normal(0.0, 1.0, flat_shape) 94 | u, _, v = np.linalg.svd(a, full_matrices=False) 95 | # pick the one with the correct shape 96 | q = u if u.shape == flat_shape else v 97 | q = q.reshape(shape) 98 | return scale * q[:shape[0], :shape[1]] 99 | 100 | def as_floatX(variable): 101 | if isinstance(variable, float): 102 | #return np.cast["float32"](variable) 103 | return np.cast['float32'](variable) 104 | elif isinstance(variable, np.ndarray): 105 | #return np.cast["float32"](variable) 106 | return np.cast['float32'](variable) 107 | 108 | def rectify(X): 109 | return T.maximum(X, 0.) 110 | 111 | def cappedrectify(X): 112 | return T.minimum(5., T.maximum(X, 0.)) 113 | 114 | def elu(X): 115 | return T.switch(T.ge(X, 0), X, T.exp(X)-1.) 116 | 117 | def snelu(X): 118 | scale = 1.0507009873554804934193349852946 119 | alpha = 1.6732632423543772848170429916717 120 | return scale * T.switch(T.ge(X, 0), X, alpha*T.exp(X)-alpha) 121 | 122 | def dropout(X, dropout_switch=1, p=0.): 123 | retain_prob = 1 - p 124 | mask = srng.binomial(X.shape, p=retain_prob, dtype='float32') 125 | X = ifelse(T.lt(dropout_switch, 0.5), X*mask, (X*retain_prob).reshape(mask.shape)) 126 | return X 127 | 128 | def dropout_scan(X, mask, dropout_switch=1, p=0.): 129 | retain_prob = 1 - p 130 | X = ifelse(T.lt(dropout_switch, 0.5), X*mask, (X*retain_prob).reshape(mask.shape)) 131 | return X 132 | 133 | def clip_norm(g, c, n): 134 | if c > 0: 135 | g = T.switch(T.ge(n, c), g * c / n, g) 136 | return g 137 | 138 | def sgdm(cost, parameters, lr2=1., momentum=0.8): 139 | lr = theano.shared(as_floatX(lr2).astype("float32")) 140 | grads = T.grad(cost, parameters) 141 | updates = OrderedDict() 142 | for param,g2 in zip(parameters,grads): 143 | grad = clip_norm(g2, 3, T.sum(g2 ** 2)) 144 | mparam = theano.shared(param.get_value()*0.) 145 | updates[param] = param - lr * mparam 146 | updates[mparam] = mparam*momentum + (1.-momentum)*grad 147 | 148 | return updates, lr 149 | 150 | def sgd(cost, parameters, lr, updates=None): 151 | grads = T.grad(cost,parameters) 152 | updates = OrderedDict({}) 153 | for param,grad in zip(parameters,grads): 154 | updates[param] = param - lr*grad 155 | 156 | return updates 157 | 158 | #def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8): 159 | def Adam(cost, params, lr2=0.001, b1=0.1, b2=0.001, e=1e-8): 160 | updates = [] 161 | lr = theano.shared(as_floatX(lr2).astype("float32")) 162 | grads = T.grad(cost, params) 163 | i = theano.shared(as_floatX(0.)) 164 | i_t = i + as_floatX(1.) 165 | fix1 = as_floatX(1.) - (as_floatX(1.) - as_floatX(b1))**i_t 166 | fix2 = as_floatX(1.) - (as_floatX(1.) 
- as_floatX(b2))**i_t 167 | #lr_t = as_floatX(lr) * (T.sqrt(fix2) / fix1) 168 | lr_t = lr * (T.sqrt(fix2) / fix1) 169 | for p, g2 in zip(params, grads): 170 | ''' 171 | if p.name != 'Words' and p.name != 'Pos' and p.name != 'lang': 172 | else: 173 | g = g2 174 | ''' 175 | #g = clip_norm(g2, 3, T.sum(g2 ** 2)) 176 | #g = clip_norm(g2, 3, T.sum(g2 ** 2)) 177 | g = g2 178 | #g = g2.clip(-.5, .5) 179 | #g = clip_norm(g, 3, T.sqrt(T.sum(g**2))) 180 | m = theano.shared(p.get_value() * as_floatX(0.)) 181 | v = theano.shared(p.get_value() * as_floatX(0.)) 182 | m_t = (as_floatX(b1) * g) + ((as_floatX(1.) - as_floatX(b1)) * m) 183 | v_t = (as_floatX(b2) * T.sqr(g)) + ((as_floatX(1.) - as_floatX(b2)) * v) 184 | g_t = m_t / (T.sqrt(v_t) + as_floatX(e)) 185 | p_t = p - (lr_t * g_t) 186 | updates.append((m, m_t)) 187 | updates.append((v, v_t)) 188 | if p.name == 'w_o': 189 | p_t = clip_norm(p_t, 3, T.sum(p_t ** 2)) 190 | updates.append((p, p_t)) 191 | updates.append((i, i_t)) 192 | return updates, lr 193 | 194 | def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6): 195 | grads = T.grad(cost=cost, wrt=params) 196 | ''' 197 | norm = T.sqrt(sum([T.sum(g ** 2) for g,p in zip(grads, params) if p.name != 'Words' and p.get_value(borrow=True).ndim == 2 and p.name != 'label_embeddings'])) 198 | tmp_grads = [] 199 | for g,p in zip(grads, params): 200 | if p.name != 'Words' and (p.get_value(borrow=True).ndim == 2) and p.name != 'label_embeddings': 201 | tmp_grads.append(clip_norm(g, 5, norm)) 202 | else: 203 | tmp_grads.append(g) 204 | grads = tmp_grads 205 | ''' 206 | norm = T.sqrt(sum([T.sum(g ** as_floatX(2.)) for g in grads])) 207 | grads = [clip_norm(g, as_floatX(5.), norm) for g in grads] 208 | updates = [] 209 | for p, g in zip(params, grads): 210 | acc = theano.shared(p.get_value() * as_floatX(0.)) 211 | acc_new = as_floatX(rho) * acc + (as_floatX(1.) - as_floatX(rho)) * g ** as_floatX(2.) 
212 | gradient_scaling = T.sqrt(acc_new + as_floatX(epsilon)) 213 | g = g / gradient_scaling 214 | updates.append((acc, acc_new)) 215 | updates.append((p, p - as_floatX(lr) * g)) 216 | return updates 217 | 218 | 219 | def adagrad(cost, params, lr=0.001, eps=1e-8, sparse=False): 220 | lr = theano.shared(as_floatX(lr).astype("float32")) 221 | eps = as_floatX(eps).astype("float32") 222 | 223 | gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))+0.1) for param in params] 224 | #gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))) for param in params] 225 | xsums = [None for param in params] 226 | 227 | gparams = T.grad(cost, params) 228 | 229 | updates = OrderedDict() 230 | 231 | for gparam, param, gsum in zip(gparams, params, gsums): 232 | updates[gsum] = T.cast(gsum + (gparam ** as_floatX(2.)), "float32") 233 | updates[param] = T.cast(param - lr * (gparam / (T.sqrt(updates[gsum] + eps))), "float32") 234 | 235 | return updates, lr 236 | 237 | def sgd_updates_adadelta(params,cost,rho=0.95,epsilon=1e-6,norm_lim=9,word_vec_name='Words'): 238 | """ 239 | adadelta update rule, mostly from 240 | https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta) 241 | """ 242 | updates = OrderedDict({}) 243 | exp_sqr_grads = OrderedDict({}) 244 | exp_sqr_ups = OrderedDict({}) 245 | gparams = [] 246 | for param in params: 247 | empty = np.zeros_like(param.get_value()) 248 | exp_sqr_grads[param] = theano.shared(value=as_floatX(empty),name="exp_grad_%s" % param.name) 249 | gp = T.grad(cost, param) 250 | exp_sqr_ups[param] = theano.shared(value=as_floatX(empty), name="exp_grad_%s" % param.name) 251 | gparams.append(gp) 252 | 253 | for param, gp in zip(params, gparams): 254 | exp_sg = exp_sqr_grads[param] 255 | exp_su = exp_sqr_ups[param] 256 | up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp) 257 | updates[exp_sg] = up_exp_sg 258 | step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp 259 | updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step) 260 | stepped_param = param + step 261 | updates[param] = stepped_param 262 | return updates 263 | -------------------------------------------------------------------------------- /load_data.py: -------------------------------------------------------------------------------- 1 | import random 2 | import pickle 3 | from time import time 4 | import sys 5 | from collections import defaultdict 6 | import gensim 7 | import logging 8 | from gensim.models.keyedvectors import KeyedVectors 9 | import nltk 10 | nltk.download('averaged_perceptron_tagger') 11 | nltk.download('words') 12 | from nltk import conlltags2tree, tree2conlltags 13 | 14 | import numpy as np 15 | from sklearn.externals import joblib 16 | from sklearn.metrics import confusion_matrix 17 | from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score 18 | from sklearn.model_selection import train_test_split 19 | 20 | def dataRead(fname): 21 | print ("Input File Reading") 22 | fp = open(fname, 'r') 23 | #samples = fp.read().strip().split('\r\n\r\n') 24 | samples = fp.read().strip().split('\n\n') 25 | sent_lengths = [] #1-d array 26 | sent_contents = [] #2-d array [[w1,w2,....] ...] 27 | sent_lables = [] #1-d array 28 | entity1_list = [] #2-d array [[e1,e1_t] [e1,e1_t]...] 29 | entity2_list = [] #2-d array [[e1,e1_t] [e1,e1_t]...] 
30 | doc_ids = [] 31 | idents = [] 32 | for sample in samples: 33 | #sent, entities = sample.strip().split('\r\n') 34 | sent, entities = sample.strip().split('\n') 35 | doc_id, ident, e1, e2, relation = entities.split('\t') 36 | sent_contents.append(sent.lower()) 37 | entity1_list.append([e1, ident]) 38 | entity2_list.append([e2, ident]) 39 | sent_lables.append(relation) 40 | idents.append(ident) 41 | doc_ids.append(doc_id) 42 | 43 | return idents, sent_contents, entity1_list, entity2_list, sent_lables 44 | 45 | 46 | class LoadDataReturn(object): 47 | def __init__(self): 48 | self.pairs_idx = [] 49 | self.pos_idx = [] 50 | self.pairs_idx_rev = [] 51 | self.domain_labels = [] 52 | self.pos_e2_idx = [] 53 | self.pos_e1_idx = [] 54 | self.subj_labels = [] 55 | self.pred_labels = [] 56 | self.obj_labels = [] 57 | self.e1_ids = [] 58 | self.e2_ids = [] 59 | self.y = [] 60 | self.idents = [] 61 | 62 | 63 | class LoadData(object): 64 | def __init__(self, word2vec_file): 65 | self.word_index = {} 66 | self.pos_index = {} 67 | self.num_words = 1 68 | self.num_pos = 1 69 | self.embs = [np.zeros((300,))] 70 | self.pos = [np.zeros((32,))] 71 | logging.debug('Loading %s', word2vec_file) 72 | #self.wv = gensim.models.Word2Vec.load('/home/amri228/i2b2_2016/ddi/word_vecs2/gensim_model_pubmed') 73 | self.wv = KeyedVectors.load_word2vec_format(word2vec_file, binary=False) 74 | logging.debug('Done') 75 | self.max_u = self.wv.syn0.max() 76 | self.min_u = self.wv.syn0.min() 77 | 78 | def fit(self, filename, ids): 79 | all_data = dataRead(filename) 80 | word_cnts = {} 81 | pos_cnts = {} 82 | missing = set() 83 | for ident, tr, tl, e1, e2 in zip(all_data[0], all_data[1], all_data[-1], all_data[2], all_data[3]): 84 | if ident not in ids: 85 | continue 86 | final_string = tr.split() 87 | final_string_pos = nltk.pos_tag(final_string) 88 | #tree = self.tagger.parse(final_string_pos) 89 | #iob_tags = tree2conlltags(tree) 90 | final_e1_string = ['druga'] 91 | final_e2_string = ['drugb'] 92 | e1_pos = None 93 | e2_pos = None 94 | cnt = 0 95 | for w in final_string: 96 | if w == 'druga': 97 | e1_pos = cnt 98 | elif w == 'drugb': 99 | e2_pos = cnt 100 | cnt += 1 101 | tmp = [] 102 | final_e1_pos = [] 103 | final_e2_pos = [] 104 | cnt = 0 105 | error = False 106 | #print final_string 107 | for w in final_string: 108 | if cnt-e1_pos in pos_cnts: 109 | pos_cnts[cnt-e1_pos] += 1 110 | else: 111 | pos_cnts[cnt-e1_pos] = 1 112 | if cnt-e2_pos in pos_cnts: 113 | pos_cnts[cnt-e2_pos] += 1 114 | else: 115 | pos_cnts[cnt-e2_pos] = 1 116 | cnt += 1 117 | for w in final_string: 118 | if w in word_cnts: 119 | word_cnts[w] += 1 120 | else: 121 | word_cnts[w] = 1 122 | for w in final_e1_pos: 123 | if w in pos_cnts: 124 | pos_cnts[w] += 1 125 | else: 126 | pos_cnts[w] = 1 127 | for w in final_e2_pos: 128 | if w in pos_cnts: 129 | pos_cnts[w] += 1 130 | else: 131 | pos_cnts[w] = 1 132 | 133 | for w, cnt in word_cnts.iteritems(): 134 | if cnt > 5: 135 | if w in self.wv: 136 | self.embs.append(self.wv[w]) 137 | self.word_index[w] = self.num_words 138 | self.num_words += 1 139 | else: 140 | missing.add(w) 141 | self.embs.append(np.random.uniform(-1., 1., (300,))) 142 | self.word_index[w] = self.num_words 143 | self.num_words += 1 144 | for w, cnt in pos_cnts.iteritems(): 145 | if cnt > 5: 146 | self.pos.append(np.random.uniform(-1., 1., (32,))) 147 | self.pos_index[w] = self.num_pos 148 | self.num_pos += 1 149 | 150 | self.pos_index['NegUNK'] = self.num_pos 151 | self.num_pos += 1 152 | self.pos.append(np.random.uniform(-1., 1., (32,))) 153 
| self.pos_index['PosUNK'] = self.num_pos 154 | self.num_pos += 1 155 | self.pos.append(np.random.uniform(-1., 1., (32,))) 156 | 157 | self.word_index['UNK'] = self.num_words 158 | self.embs.append(np.random.uniform(-1., 1., (300,))) 159 | self.num_words += 1 160 | 161 | del self.wv 162 | self.embs = np.array(self.embs, dtype='float32') 163 | self.pos = np.array(self.pos, dtype='float32') 164 | return 165 | 166 | def transform(self, filename, ids): 167 | all_data = dataRead(filename) 168 | pairs_idx = [] 169 | pos_idx = [] 170 | pairs_idx_rev = [] 171 | domain_labels = [] 172 | pos_e2_idx = [] 173 | pos_e1_idx = [] 174 | subj_labels = [] 175 | pred_labels = [] 176 | obj_labels = [] 177 | e1_ids = [] 178 | e2_ids = [] 179 | y = [] 180 | idents = [] 181 | for ident, tr, tl, e1, e2 in zip(all_data[0], all_data[1], 182 | all_data[-1], all_data[2], all_data[3]): 183 | if ident not in ids: 184 | continue 185 | final_string = tr.split() 186 | final_string_pos = nltk.pos_tag(final_string) 187 | #tree = self.tagger.parse(final_string_pos) 188 | #iob_tags = tree2conlltags(tree) 189 | final_e1_string = ['druga'] 190 | final_e2_string = ['drugb'] 191 | e1_pos = None 192 | e2_pos = None 193 | cnt = 0 194 | for w in final_string: 195 | if w == 'druga': 196 | e1_pos = cnt 197 | elif w == 'drugb': 198 | e2_pos = cnt 199 | cnt += 1 200 | if e1_pos is None or e2_pos is None: 201 | continue 202 | tmp = [] 203 | final_e1_pos = [] 204 | final_e2_pos = [] 205 | cnt = 0 206 | tmp_subj = [] 207 | tmp_pred = [] 208 | tmp_obj = [] 209 | for w in final_string: 210 | final_e1_pos.append(cnt - e1_pos) 211 | final_e2_pos.append(cnt - e2_pos) 212 | cnt += 1 213 | idents.append(ident) 214 | e1_ids.append(e1[0]) 215 | e2_ids.append(e2[0]) 216 | y.append(tl) 217 | fstring = [] 218 | for w in final_string: 219 | fstring.append(w) 220 | final_string = fstring 221 | str_idx = [] 222 | for w in final_string: 223 | if w in self.word_index: 224 | str_idx.append(self.word_index[w]) 225 | else: 226 | str_idx.append(self.word_index['UNK']) 227 | pairs_idx.append(str_idx) 228 | e1_idx = [] 229 | for p in final_e1_pos: 230 | if p in self.pos_index: 231 | e1_idx.append(self.pos_index[p]) 232 | else: 233 | if p < 0: 234 | e1_idx.append(self.pos_index['NegUNK']) 235 | else: 236 | e1_idx.append(self.pos_index['PosUNK']) 237 | pos_e1_idx.append(e1_idx) 238 | e2_idx = [] 239 | for p in final_e2_pos: 240 | if p in self.pos_index: 241 | e2_idx.append(self.pos_index[p]) 242 | else: 243 | if p < 0: 244 | e2_idx.append(self.pos_index['NegUNK']) 245 | else: 246 | e2_idx.append(self.pos_index['PosUNK']) 247 | pos_e2_idx.append(e2_idx) 248 | 249 | lab_lookup = {'OTHER':0, 'CLASS1':1} 250 | self.lab_lookup_rev = {0:'OTHER', 1:'CLASS1'} 251 | final_y = np.array([np.int32(lab_lookup[x]) for x in y]) 252 | 253 | return pairs_idx, pos_e1_idx, pos_e2_idx, final_y, subj_labels, pred_labels, obj_labels, idents, e1_ids, e2_ids 254 | 255 | def fit_transform(self, filename, ids): 256 | self.fit(filename, ids) 257 | return self.transform(filename, ids) 258 | 259 | def pad_data(self, data, max_len = None): 260 | max_len = np.max([len(x) for x in data]) 261 | padded_dataset = [] 262 | for example in data: 263 | try: 264 | zeros = [0]*(max_len-len(example)) 265 | padded_dataset.append(example+zeros) 266 | except: 267 | logging.exception('%s %s %s', max_len, len(example), example) 268 | exit(1) 269 | if max_len is None: 270 | return np.array(padded_dataset) 271 | else: 272 | return np.array(padded_dataset)[:,:max_len] 273 | 
--------------------------------------------------------------------------------