├── .DS_Store
├── README.md
├── build_data.py
├── model
    ├── __init__.py
    ├── base_model.py
    ├── config.py
    ├── data_utils.py
    ├── general_utils.py
    ├── models.py
    ├── ner_model.py
    └── resnet.py
└── train.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/LSTM-PICO-Detection/0c720b6e1d114c192dc9a78e5d581dc22db65911/.DS_Store


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Code for PubMed PICO Element Detection
 2 | 
 3 | This is the code for PICO element detection introduced by *[Jin, Di, and Peter Szolovits. "PICO Element Detection in Medical Text via Long Short-Term Memory Neural Networks." Proceedings of the BioNLP 2018 workshop. 2018.](http://www.aclweb.org/anthology/W18-2308)*
 4 | 
 5 | Abstract
 6 | 
 7 | >Successful evidence-based medicine (EBM) applications rely on answering clinical questions by analyzing large medical literature databases. In order to formulate a well-defined, focused clinical question, a framework called PICO is widely used, which identifies the sentences in a given medical text that belong to the four components: Participants/Problem (P), Intervention (I), Comparison (C) and Outcome (O). In this work, we present a Long Short-Term Memory (LSTM) neural network based model to automatically detect PICO elements. By jointly classifying subsequent sentences in the given text, we achieve state-of-the-art results on PICO element classification compared to several strong baseline models. We also make our curated data public as a benchmarking dataset so that the community can benefit from it.
 8 | 
 9 | ## How to use
10 | 
11 | 1. First set the paths to the word embeddings file, the data directory and the output directory in `model/config.py`. The [data](https://github.com/jind11/PubMed-PICO-Detection) can be downloaded online.
12 | 2. Then run the command below to compile the raw data
13 | ```
14 | python build_data.py
15 | ```
16 | 3. Finally run the command below to start training
17 | ```
18 | python train.py
19 | ```
20 | Note that after each epoch the model is evaluated on the validation set; if there are 3 consecutive epochs without improvement, training is terminated and the test set is evaluated.
21 | 
22 | Welcome to post any questions you have and use our code for your work by citing us!
23 | 


--------------------------------------------------------------------------------
/build_data.py:
--------------------------------------------------------------------------------
 1 | from model.config import Config
 2 | from model.data_utils import Dataset, get_vocabs, UNK, NUM, \
 3 |     get_wordvec_vocab, write_vocab, load_vocab, get_char_vocab, \
 4 |     export_trimmed_wordvec_vectors, get_processing_word
 5 | import argparse
 6 | 
# Module-level parser handed to Config in main(); Config registers its
# command-line options on it and parses them.
parser = argparse.ArgumentParser()
 8 | 
 9 | 
def main():
    """Build the vocabulary files and the trimmed word-vector file.

    You MUST RUN this procedure before training. It iterates over the whole
    dataset (train, dev and test) and extracts the word and tag
    vocabularies, then writes each of them to a file with one entry per
    line; the line number is the id of the entry. Finally it copies the
    pre-trained vectors of the vocabulary words into a smaller file.

    The configuration is built from the module-level ``parser``, so paths
    can be overridden on the command line.

    """
    # get config and processing of words (no vocab exists yet: load=False)
    config = Config(parser, load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = Dataset(config.filename_dev, processing_word)
    test = Dataset(config.filename_test, processing_word)
    train = Dataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])

    # processing_word already replaces digit-only words with NUM, so NUM is
    # normally part of vocab_words. Write every special token exactly once:
    # load_vocab keeps one entry per distinct line, so a duplicated line
    # would leave the vocabulary with fewer entries than there are ids.
    special_tokens = (UNK, NUM)
    vocab = [word for word in vocab_words if word not in special_tokens]
    vocab.insert(0, UNK)
    vocab.append(NUM)

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim the pre-trained vectors down to the words of the vocabulary
    vocab = load_vocab(config.filename_words)
    export_trimmed_wordvec_vectors(vocab, config.filename_wordvec,
                                   config.filename_wordvec_trimmed)
52 | 
# Run the build only when this file is executed as a script.
if __name__ == "__main__":
    main()
55 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jind11/LSTM-PICO-Detection/0c720b6e1d114c192dc9a78e5d581dc22db65911/model/__init__.py


--------------------------------------------------------------------------------
/model/base_model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tensorflow as tf
  3 | 
  4 | 
  5 | class BaseModel(object):
  6 |     """Generic class for general methods that are not specific to NER"""
  7 | 
  8 |     def __init__(self, config):
  9 |         """Defines self.config and self.logger
 10 | 
 11 |         Args:
 12 |             config: (Config instance) class with hyper parameters,
 13 |                 vocab and embeddings
 14 | 
 15 |         """
 16 |         self.config = config
 17 |         self.logger = config.logger
 18 |         self.sess   = None
 19 |         self.saver  = None
 20 | 
 21 | 
 22 |     def reinitialize_weights(self, scope_name):
 23 |         """Reinitializes the weights of a given layer"""
 24 |         variables = tf.contrib.framework.get_variables(scope_name)
 25 |         init = tf.variables_initializer(variables)
 26 |         self.sess.run(init)
 27 | 
 28 | 
 29 |     def add_train_op(self, lr_method, lr, loss, clip=-1):
 30 |         """Defines self.train_op that performs an update on a batch
 31 | 
 32 |         Args:
 33 |             lr_method: (string) sgd method, for example "adam"
 34 |             lr: (tf.placeholder) tf.float32, learning rate
 35 |             loss: (tensor) tf.float32 loss to minimize
 36 |             clip: (python float) clipping of gradient. If < 0, no clipping
 37 | 
 38 |         """
 39 |         _lr_m = lr_method.lower() # lower to make sure
 40 | 
 41 |         with tf.variable_scope("train_step"):
 42 |             if _lr_m == 'adam': # sgd method
 43 |                 optimizer = tf.train.AdamOptimizer(lr)
 44 |             elif _lr_m == 'adagrad':
 45 |                 optimizer = tf.train.AdagradOptimizer(lr)
 46 |             elif _lr_m == 'sgd':
 47 |                 optimizer = tf.train.GradientDescentOptimizer(lr)
 48 |             elif _lr_m == 'rmsprop':
 49 |                 optimizer = tf.train.RMSPropOptimizer(lr)
 50 |             else:
 51 |                 raise NotImplementedError("Unknown method {}".format(_lr_m))
 52 | 
 53 |             if clip > 0: # gradient clipping if clip is positive
 54 |                 grads, vs     = zip(*optimizer.compute_gradients(loss))
 55 |                 grads, gnorm  = tf.clip_by_global_norm(grads, clip)
 56 |                 self.train_op = optimizer.apply_gradients(zip(grads, vs))
 57 |             else:
 58 |                 self.train_op = optimizer.minimize(loss)
 59 | 
 60 | 
 61 |     def initialize_session(self):
 62 |         """Defines self.sess and initialize the variables"""
 63 |         self.logger.info("Initializing tf session")
 64 |         self.sess = tf.Session()
 65 |         self.sess.run(tf.global_variables_initializer())
 66 |         self.saver = tf.train.Saver()
 67 | 
 68 | 
 69 |     def restore_session(self, dir_model):
 70 |         """Reload weights into session
 71 | 
 72 |         Args:
 73 |             sess: tf.Session()
 74 |             dir_model: dir with weights
 75 | 
 76 |         """
 77 |         self.logger.info("Reloading the latest trained model...")
 78 |         self.saver.restore(self.sess, dir_model)
 79 | 
 80 | 
 81 |     def save_session(self):
 82 |         """Saves session = weights"""
 83 |         if not os.path.exists(self.config.dir_model):
 84 |             os.makedirs(self.config.dir_model)
 85 |         self.saver.save(self.sess, self.config.dir_model)
 86 | 
 87 | 
 88 |     def close_session(self):
 89 |         """Closes the session"""
 90 |         self.sess.close()
 91 | 
 92 | 
 93 |     def add_summary(self):
 94 |         """Defines variables for Tensorboard
 95 | 
 96 |         Args:
 97 |             dir_output: (string) where the results are written
 98 | 
 99 |         """
100 |         self.merged      = tf.summary.merge_all()
101 |         self.file_writer = tf.summary.FileWriter(self.config.dir_output,
102 |                 self.sess.graph)
103 | 
104 | 
105 |     def train(self, train, dev):
106 |         """Performs training with early stopping and lr exponential decay
107 | 
108 |         Args:
109 |             train: dataset that yields tuple of (sentences, tags)
110 |             dev: dataset
111 | 
112 |         """
113 |         best_score = 0
114 |         nepoch_no_imprv = 0 # for early stopping
115 |         self.add_summary() # tensorboard
116 | 
117 |         for epoch in range(self.config.nepochs):
118 |             self.logger.info("Epoch {:} out of {:}".format(epoch + 1,
119 |                         self.config.nepochs))
120 | 
121 |             score = self.run_epoch(train, dev, epoch)
122 |             self.config.lr *= self.config.lr_decay # decay learning rate
123 | 
124 |             # early stopping and saving best parameters
125 |             if score >= best_score:
126 |                 nepoch_no_imprv = 0
127 |                 self.save_session()
128 |                 best_score = score
129 |                 self.logger.info("- new best score!")
130 |             else:
131 |                 nepoch_no_imprv += 1
132 |                 if nepoch_no_imprv >= self.config.nepoch_no_imprv:
133 |                     self.logger.info("- early stopping {} epochs without "\
134 |                             "improvement".format(nepoch_no_imprv))
135 |                     break
136 | 
137 | 
138 |     def evaluate(self, test):
139 |         """Evaluate model on test set
140 | 
141 |         Args:
142 |             test: instance of class Dataset
143 | 
144 |         """
145 |         self.logger.info("Testing model over test set")
146 |         metrics = self.run_evaluate(test, report=True)
147 |         msg = " - ".join(["{} {:04.2f}".format(k, v)
148 |                     if k == 'acc' else '{} {}'.format(k, ', '.join(['{}: {:04.3f}'.format(a, b) \
149 |                     for a, b in v.items()])) for k, v in metrics.items()])
150 |         self.logger.info(msg)
151 | 
152 |         return metrics
153 | 


--------------------------------------------------------------------------------
/model/config.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | from .general_utils import get_logger
  4 | from .data_utils import get_trimmed_wordvec_vectors, load_vocab, \
  5 |         get_processing_word
  6 | 
  7 | 
  8 | class Config():
  9 |     def __init__(self, parser, load=True):
 10 |         """Initialize hyperparameters and load vocabs
 11 | 
 12 |         Args:
 13 |             load_embeddings: (bool) if True, load embeddings into
 14 |                 np array, else None
 15 | 
 16 |         """
 17 |         ## parse args
 18 |         self.parser = parser
 19 |         # training parameters
 20 |         parser.add_argument('--nepochs', default='40', type=int,
 21 |                     help='number of epochs')
 22 |         parser.add_argument('--dropout', default='0.5', type=float,
 23 |                     help='number of epochs')
 24 |         parser.add_argument('--batch_size', default='40', type=int,
 25 |                     help='batch size')
 26 |         parser.add_argument('--lr', default='0.001', type=float,
 27 |                     help='learning rate')
 28 |         parser.add_argument('--lr_method', default='adam', type=str,
 29 |                     help='optimization method')
 30 |         parser.add_argument('--lr_decay', default='0.99', type=float,
 31 |                     help='learning rate decay rate')
 32 |         parser.add_argument('--clip', default='10', type=float,
 33 |                     help='gradient clipping')
 34 |         parser.add_argument('--nepoch_no_imprv', default='3', type=int,
 35 |                     help='number of epoch patience')
 36 |         parser.add_argument('--l2_reg_lambda', default='0.0001', type=float,
 37 |                     help='l2 regularization coefficient')
 38 | 
 39 |         # data and results paths
 40 |         parser.add_argument('--dir_output', default='test', type=str,
 41 |                     help='directory for output')
 42 |         parser.add_argument('--data_root', default='/data/medg/misc/jindi/nlp/PICO', type=str,
 43 |                     help='directory for output')
 44 |         parser.add_argument('--filename_wordvec_trimmed', default='data/word2vec_pubmed.trimmed.txt', 
 45 |                     type=str, help='directory for trimmed word embeddings file')
 46 |         parser.add_argument('--filename_wordvec', default='/data/medg/misc/jindi/nlp/embeddings/word2vec/wikipedia-pubmed-and-PMC-w2v.txt', 
 47 |                     type=str, help='directory for original word embeddings file')
 48 | 
 49 |         # model hyperparameters
 50 |         parser.add_argument('--hidden_size_lstm_sentence', default='150', type=int,
 51 |                     help='hidden size of sentence level lstm')
 52 |         parser.add_argument('--attention_size', default='300', type=int,
 53 |                     help='attention vector size')
 54 | 
 55 |         # misc
 56 |         parser.add_argument('--restore', action='store_true', 
 57 |                     help='whether restore from previous trained model')
 58 |         parser.add_argument('--use_crf', action='store_false', 
 59 |                     help='whether use crf optimization layer')
 60 |         parser.add_argument('--train_embeddings', action='store_true', 
 61 |                     help='whether use cnn or lstm for sentence representation')
 62 |         parser.add_argument('--use_pretrained', action='store_false', 
 63 |                     help='whether use pre-trained word embeddings')
 64 |         parser.add_argument('--train_accuracy', action='store_false', 
 65 |                     help='whether report accuracy while training')
 66 | 
 67 |         self.parser.parse_args(namespace=self)
 68 | 
 69 |         self.dir_output = os.path.join('results', self.dir_output)
 70 |         self.dir_model  = os.path.join(self.dir_output, "model.weights")
 71 |         self.path_log   = os.path.join(self.dir_output, "log.txt")
 72 | 
 73 |         # dataset
 74 |         self.filename_dev = os.path.join(self.data_root, 'PICO_dev.txt')
 75 |         self.filename_test = os.path.join(self.data_root, 'PICO_test.txt')
 76 |         self.filename_train = os.path.join(self.data_root, 'PICO_train.txt')
 77 | 
 78 |         # vocab (created from dataset with build_data.py)
 79 |         self.filename_words = os.path.join('data', 'words.txt')
 80 |         self.filename_tags = os.path.join('data', 'tags.txt')
 81 | 
 82 |         # directory for training outputs
 83 |         if not os.path.exists('data'):
 84 |             os.makedirs('data')
 85 | 
 86 |         # directory for data output
 87 |         if not os.path.exists(self.dir_output):
 88 |             os.makedirs(self.dir_output)
 89 | 
 90 |         # create instance of logger
 91 |         self.logger = get_logger(self.path_log)
 92 | 
 93 |         # log the attributes
 94 |         msg = ', '.join(['{}: {}'.format(attr, getattr(self, attr)) for attr in dir(self) \
 95 |                         if not callable(getattr(self, attr)) and not attr.startswith("__")])
 96 |         self.logger.info(msg)
 97 | 
 98 |         # load if requested (default)
 99 |         if load:
100 |             self.load()
101 | 
102 | 
103 |     def load(self):
104 |         """Loads vocabulary, processing functions and embeddings
105 | 
106 |         Supposes that build_data.py has been run successfully and that
107 |         the corresponding files have been created (vocab and trimmed GloVe
108 |         vectors)
109 | 
110 |         """
111 |         # 1. vocabulary
112 |         self.vocab_words = load_vocab(self.filename_words)
113 |         self.vocab_tags  = load_vocab(self.filename_tags)
114 | 
115 |         self.nwords     = len(self.vocab_words)
116 |         self.ntags      = len(self.vocab_tags)
117 | 
118 |         # 2. get processing functions that map str -> id
119 |         self.processing_word = get_processing_word(self.vocab_words, lowercase=True)
120 |         self.processing_tag  = get_processing_word(self.vocab_tags,
121 |                 lowercase=False, allow_unk=False)
122 | 
123 |         # 3. get pre-trained embeddings
124 |         self.embeddings = (get_trimmed_wordvec_vectors(self.filename_wordvec_trimmed, self.vocab_words)
125 |                 if self.use_pretrained else None)
126 |         self.dim_word = self.embeddings.shape[1]
127 | 
128 | 
129 |     max_iter = None # if not None, max number of examples in Dataset
130 | 
131 | 


--------------------------------------------------------------------------------
/model/data_utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | 
  4 | 
# shared global variables to be imported from model also
# UNK: vocabulary entry used by get_processing_word for words that are not
#      in the vocabulary (when allow_unk is True)
UNK = "$UNK$"
# NUM: token that get_processing_word substitutes for digit-only words
NUM = "$NUM$"
# NONE: tag looked up by get_chunks as the default (outside-of-chunk) label
NONE = "O"
  9 | 
 10 | 
 11 | # special error message
 12 | class MyIOError(Exception):
 13 |     def __init__(self, filename):
 14 |         # custom error message
 15 |         message = """
 16 | ERROR: Unable to locate file {}.
 17 | 
 18 | FIX: Have you tried running python build_data.py first?
 19 | This will build vocab file from your train, test and dev sets and
 20 | trimm your word vectors.
 21 | """.format(filename)
 22 |         super(MyIOError, self).__init__(message)
 23 | 
 24 | 
 25 | class Dataset(object):
 26 |     """Class that iterates over CoNLL Dataset
 27 | 
 28 |     __iter__ method yields a tuple (words, tags)
 29 |         words: list of raw words
 30 |         tags: list of raw tags
 31 | 
 32 |     If processing_word and processing_tag are not None,
 33 |     optional preprocessing is appplied
 34 | 
 35 |     Example:
 36 |         ```python
 37 |         data = CoNLLDataset(filename)
 38 |         for sentence, tags in data:
 39 |             pass
 40 |         ```
 41 | 
 42 |     """
 43 |     def __init__(self, filename, processing_word=None, processing_tag=None,
 44 |                  max_iter=None):
 45 |         """
 46 |         Args:
 47 |             filename: path to the file
 48 |             processing_words: (optional) function that takes a word as input
 49 |             processing_tags: (optional) function that takes a tag as input
 50 |             max_iter: (optional) max number of sentences to yield
 51 | 
 52 |         """
 53 |         self.filename = filename
 54 |         self.processing_word = processing_word
 55 |         self.processing_tag = processing_tag
 56 |         self.max_iter = max_iter
 57 |         self.length = None
 58 | 
 59 | 
 60 |     def __iter__(self):
 61 |         niter = 0
 62 |         with open(self.filename) as f:
 63 |             sentences, tags = [], []
 64 |             for line in f:
 65 |                 line = line.strip()
 66 |                 if not line:
 67 |                     if len(sentences) != 0:
 68 |                         niter += 1
 69 |                         if self.max_iter is not None and niter > self.max_iter:
 70 |                             break
 71 |                         yield sentences, tags
 72 |                         sentences, tags = [], []
 73 |                 elif not line.startswith("###"):
 74 |                     ls = line.split('|')
 75 |                     tag, sentence = ls[1], ls[2].split()
 76 |                     # if tag != 'Others':
 77 |                     if self.processing_word is not None:
 78 |                         sentence = [self.processing_word(word) for word in sentence]
 79 |                     if self.processing_tag is not None:
 80 |                         tag = self.processing_tag(tag)
 81 |                     sentences += [sentence]
 82 |                     tags += [tag]
 83 | 
 84 | 
 85 |     def __len__(self):
 86 |         """Iterates once over the corpus to set and store length"""
 87 |         if self.length is None:
 88 |             self.length = 0
 89 |             for _ in self:
 90 |                 self.length += 1
 91 | 
 92 |         return self.length
 93 | 
 94 | 
 95 | def get_vocabs(datasets):
 96 |     """Build vocabulary from an iterable of datasets objects
 97 | 
 98 |     Args:
 99 |         datasets: a list of dataset objects
100 | 
101 |     Returns:
102 |         a set of all the words in the dataset
103 | 
104 |     """
105 |     print("Building vocab...")
106 |     vocab_words = set()
107 |     vocab_tags = set()
108 |     for dataset in datasets:
109 |         for sentences, tags in dataset:
110 |             for sent in sentences:
111 |                 vocab_words.update(sent)
112 |             vocab_tags.update(tags)
113 |     print("- done. {} tokens".format(len(vocab_words)))
114 |     return vocab_words, vocab_tags
115 | 
116 | 
def get_char_vocab(dataset):
    """Collect the characters used by the words of a dataset

    Args:
        dataset: an iterable yielding tuples (sentences, tags), where
            sentences is a list of lists of words

    Returns:
        a set of all the characters in the dataset

    """
    return {
        char
        for doc_sentences, _ in dataset
        for sentence in doc_sentences
        for word in sentence
        for char in word
    }
134 | 
135 | 
def get_wordvec_vocab(filename):
    """Read the words of a word-vector text file

    The word of a line is whatever precedes its first space.

    Args:
        filename: path to the word vectors file

    Returns:
        vocab: set() of strings
    """
    print("Building vocab...")
    with open(filename) as vectors_file:
        vocab = {row.strip().split(' ')[0] for row in vectors_file}
    print("- done. {} tokens".format(len(vocab)))
    return vocab
153 | 
154 | 
def write_vocab(vocab, filename):
    """Writes a vocab to a file

    Writes one word per line, without a newline after the last word, so
    that the line number of a word can serve as its id.

    Args:
        vocab: iterable that yields words (any iterable, including a
            generator or a set)
        filename: path to vocab file

    Returns:
        None; the words are written to filename

    """
    print("Writing vocab...")
    # materialize first: the iterable may be a generator without len(),
    # and every entry is formatted the same way
    words = ["{}".format(word) for word in vocab]
    with open(filename, "w") as f:
        f.write("\n".join(words))
    print("- done. {} tokens".format(len(words)))
176 | 
177 | 
def load_vocab(filename):
    """Loads vocab from a file

    The id of a word is the (0-based) number of its line; if a word is on
    several lines, the last one wins.

    Args:
        filename: (string) the format of the file must be one word per line.

    Returns:
        d: dict[word] = index

    Raises:
        MyIOError: if the file cannot be opened or read

    """
    try:
        with open(filename) as vocab_file:
            return {entry.strip(): position
                    for position, entry in enumerate(vocab_file)}
    except IOError:
        raise MyIOError(filename)
198 | 
199 | 
def export_trimmed_wordvec_vectors(vocab, wordvec_filename, trimmed_filename):
    """Copies the vectors of the vocabulary words into a smaller file

    Every line of wordvec_filename whose first space-separated field is in
    vocab is written unchanged to trimmed_filename; the number of copied
    lines is printed.

    Args:
        vocab: dictionary vocab[word] = index (only membership is used)
        wordvec_filename: a path to the word vectors text file
        trimmed_filename: a path where to store the kept lines

    """
    kept = 0
    # the output file is opened (and truncated) before the input file
    with open(trimmed_filename, 'w') as trimmed, \
            open(wordvec_filename, 'r') as vectors:
        for row in vectors:
            if row.strip().split(' ')[0] in vocab:
                trimmed.write(row)
                kept += 1

    print('{} out of {} tokens can find pre-trained embeddings!'.format(kept, len(vocab)))
220 | 
221 | 
def get_trimmed_wordvec_vectors(filename, vocab):
    """Builds the embedding matrix of a vocabulary from a vectors text file

    Every line of the file is "<word> <v1> <v2> ...". Rows of words that
    are not found in the file keep a random value drawn uniformly from
    [-0.1, 0.1).

    Args:
        filename: path to the (trimmed) word vectors text file
        vocab: dictionary vocab[word] = index

    Returns:
        matrix of embeddings (np array) of shape (len(vocab) + 1, dim)

    Raises:
        ValueError: if no line of the file holds a vector

    """
    with open(filename, 'r') as inFile:
        # the dimension is read from the second line (the first one is
        # skipped as before); fall back to the first line when the file
        # holds a single vector
        first_line = inFile.readline()
        second_line = inFile.readline()
        dim_line = second_line if second_line.strip() else first_line
        dim = len(dim_line.strip().split()) - 1
        if dim < 1:
            raise ValueError(
                "no word vector found in {}".format(filename))

        # NOTE(review): one row more than len(vocab); presumably this covers
        # an id equal to len(vocab), which load_vocab produces when the
        # vocab file contains a duplicated line -- confirm before removing
        embeddings = np.random.uniform(-0.1, 0.1, size=(len(vocab)+1, dim))

        inFile.seek(0)
        for line in inFile:
            fields = line.strip().split()
            if not fields:
                continue # blank line, nothing to read
            word = fields[0]
            if word in vocab:
                embeddings[vocab[word]] = np.array(
                        [float(item) for item in fields[1:]])

    return embeddings
243 | 
244 | 
245 | def get_processing_word(vocab_words=None, vocab_chars=None,
246 |                     lowercase=False, chars=False, allow_unk=True):
247 |     """Return lambda function that transform a word (string) into list,
248 |     or tuple of (list, id) of int corresponding to the ids of the word and
249 |     its corresponding characters.
250 | 
251 |     Args:
252 |         vocab: dict[word] = idx
253 | 
254 |     Returns:
255 |         f("cat") = ([12, 4, 32], 12345)
256 |                  = (list of char ids, word id)
257 | 
258 |     """
259 |     def f(word):
260 |         # 0. get chars of words
261 |         if vocab_chars is not None and chars == True:
262 |             char_ids = []
263 |             for char in word:
264 |                 # ignore chars out of vocabulary
265 |                 if char in vocab_chars:
266 |                     char_ids += [vocab_chars[char]]
267 | 
268 |         # 1. preprocess word
269 |         if lowercase:
270 |             word = word.lower()
271 |         if word.isdigit():
272 |             word = NUM
273 | 
274 |         # 2. get id of word
275 |         if vocab_words is not None:
276 |             if word in vocab_words:
277 |                 word = vocab_words[word]
278 |             else:
279 |                 if allow_unk:
280 |                     word = vocab_words[UNK]
281 |                 else:
282 |                     raise Exception("Unknow key is not allowed. Check that "\
283 |                                     "your vocab (tags?) is correct")
284 | 
285 |         # 3. return tuple char ids, word id
286 |         if vocab_chars is not None and chars == True:
287 |             return char_ids, word
288 |         else:
289 |             return word
290 | 
291 |     return f
292 | 
293 | 
294 | def _pad_sequences(sequences, pad_tok, max_length):
295 |     """
296 |     Args:
297 |         sequences: a generator of list or tuple
298 |         pad_tok: the char to pad with
299 | 
300 |     Returns:
301 |         a list of list where each sublist has same length
302 |     """
303 |     sequence_padded, sequence_length = [], []
304 | 
305 |     for seq in sequences:
306 |         seq = list(seq)
307 |         seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
308 |         sequence_padded +=  [seq_]
309 |         sequence_length += [min(len(seq), max_length)]
310 | 
311 |     return sequence_padded, sequence_length
312 | 
313 | 
def pad_sequences(sequences, pad_tok, nlevels=2):
    """Pad nested sequences so that every level has a uniform length

    Args:
        sequences: an iterable of lists or tuples, nested nlevels deep
            (it is copied into a list, so a generator is accepted)
        pad_tok: the item to pad the innermost level with
        nlevels: "depth" of padding
            1: every sequence is padded to the longest sequence
            2: sequences of sequences (e.g. documents made of sentences);
               inner sequences are padded to the longest inner sequence,
               then every outer sequence to the longest outer one
            3: same with one more level (e.g. characters of the words)

    Returns:
        (sequence_padded, sequence_length): the padded nested lists and
        the lengths of the innermost sequences before padding (0 for
        padded entries), nested one level less than the input

    Raises:
        ValueError: if nlevels is not 1, 2 or 3

    """
    if nlevels not in (1, 2, 3):
        raise ValueError(
            "nlevels must be 1, 2 or 3, got {!r}".format(nlevels))

    # the input is walked several times below
    sequences = list(sequences)

    if nlevels == 1:
        max_length = max(len(seq) for seq in sequences)
        sequence_padded, sequence_length = _pad_sequences(sequences,
                                            pad_tok, max_length)

    elif nlevels == 2:
        max_length_sentence = max(max(len(sen) for sen in seq)
                                  for seq in sequences)
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            # all sentences are same length now
            sp, sl = _pad_sequences(seq, pad_tok, max_length_sentence)
            sequence_padded.append(sp)
            sequence_length.append(sl)

        # all documents have the same number of sentences now
        max_length_document = max(len(seq) for seq in sequences)
        sequence_padded, _ = _pad_sequences(sequence_padded,
                [pad_tok]*max_length_sentence, max_length_document)
        sequence_length, _ = _pad_sequences(sequence_length, 0,
                max_length_document)

    else:
        max_length_word = max(max(max(len(word) for word in sen)
                                  for sen in seq)
                              for seq in sequences)
        max_length_sentence = max(max(len(sen) for sen in seq)
                                  for seq in sequences)
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            sentence_padded, sentence_length = [], []
            for sen in seq:
                # all words are same length now
                sp, sl = _pad_sequences(sen, pad_tok, max_length_word)
                sentence_padded.append(sp)
                sentence_length.append(sl)
            # all sentences are same length now
            sentence_padded, _ = _pad_sequences(sentence_padded,
                    [pad_tok]*max_length_word, max_length_sentence)
            sentence_length, _ = _pad_sequences(sentence_length, 0,
                    max_length_sentence)
            sequence_padded.append(sentence_padded)
            sequence_length.append(sentence_length)

        # all documents have the same number of sentences now
        max_length_document = max(len(seq) for seq in sequences)
        sequence_padded, _ = _pad_sequences(sequence_padded,
                [[pad_tok]*max_length_word]*max_length_sentence,
                max_length_document)
        sequence_length, _ = _pad_sequences(sequence_length,
                [0]*max_length_sentence, max_length_document)

    return sequence_padded, sequence_length
374 | 
375 | 
def minibatches(data, minibatch_size):
    """Group the items of a dataset into batches

    Args:
        data: iterable of (x, y) tuples, e.g. (sentences, tags)
        minibatch_size: (int) number of items per batch; the last batch
            may be smaller

    Yields:
        tuple (x_batch, y_batch) of two lists of the same length

    """
    x_batch, y_batch = [], []
    for (x, y) in data:
        if len(x_batch) == minibatch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []

        if isinstance(x[0], tuple):
            # items are tuples (e.g. (char ids, word id)): regroup them
            # field by field. A list is built because a bare zip object
            # can be consumed only once and has no len() on Python 3.
            x = list(zip(*x))
        x_batch.append(x)
        y_batch.append(y)

    if x_batch:
        yield x_batch, y_batch
399 | 
400 | 
def get_chunk_type(tok, idx_to_tag):
    """Split the tag name of a token id into its class and its type.

    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}

    Returns:
        tuple: "B", "PER" (first and last '-'-separated pieces of the tag
            name; a tag without '-' yields the same string twice)

    """
    pieces = idx_to_tag[tok].split('-')
    return pieces[0], pieces[-1]
415 | 
416 | 
def get_chunks(seq, tags):
    """Group a sequence of tag ids into typed spans.

    Args:
        seq: [4, 4, 0, 0, ...] sequence of labels
        tags: dict mapping tag name to id; must contain the key NONE,
            whose id marks tokens outside of any chunk

    Returns:
        list of (chunk_type, chunk_start, chunk_end), chunk_end exclusive

    Example (assuming NONE is the key "O"):
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3, "O": 0}
        result = [("PER", 0, 2), ("LOC", 3, 4)]

    """
    outside_id = tags[NONE]
    id_to_name = dict((idx, name) for name, idx in tags.items())

    found = []
    open_type, open_start = None, None
    for pos, tok in enumerate(seq):
        if tok == outside_id:
            # An outside token closes whatever chunk is currently open.
            if open_type is not None:
                found.append((open_type, open_start, pos))
                open_type, open_start = None, None
            continue

        tok_class, tok_type = get_chunk_type(tok, id_to_name)
        if open_type is None:
            # First token of a new chunk.
            open_type, open_start = tok_type, pos
        elif tok_class == "B" or tok_type != open_type:
            # A "B" tag or a change of type ends the open chunk and
            # immediately starts the next one.
            found.append((open_type, open_start, pos))
            open_type, open_start = tok_type, pos

    # A chunk still open at the end runs to the end of the sequence.
    if open_type is not None:
        found.append((open_type, open_start, len(seq)))

    return found
463 | 


--------------------------------------------------------------------------------
/model/general_utils.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import sys
  3 | import logging
  4 | import numpy as np
  5 | 
  6 | 
  7 | def get_logger(filename):
  8 |     """Return a logger instance that writes in filename
  9 | 
 10 |     Args:
 11 |         filename: (string) path to log.txt
 12 | 
 13 |     Returns:
 14 |         logger: (instance of logger)
 15 | 
 16 |     """
 17 |     logger = logging.getLogger('logger')
 18 |     logger.setLevel(logging.DEBUG)
 19 |     logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 20 |     handler = logging.FileHandler(filename)
 21 |     handler.setLevel(logging.DEBUG)
 22 |     handler.setFormatter(logging.Formatter(
 23 |             '%(asctime)s:%(levelname)s: %(message)s'))
 24 |     logging.getLogger().addHandler(handler)
 25 | 
 26 |     return logger
 27 | 
 28 | 
 29 | class Progbar(object):
 30 |     """Progbar class copied from keras (https://github.com/fchollet/keras/)
 31 | 
 32 |     Displays a progress bar.
 33 |     Small edit : added strict arg to update
 34 |     # Arguments
 35 |         target: Total number of steps expected.
 36 |         interval: Minimum visual progress update interval (in seconds).
 37 |     """
 38 | 
 39 |     def __init__(self, target, width=30, verbose=1):
 40 |         self.width = width
 41 |         self.target = target
 42 |         self.sum_values = {}
 43 |         self.unique_values = []
 44 |         self.start = time.time()
 45 |         self.total_width = 0
 46 |         self.seen_so_far = 0
 47 |         self.verbose = verbose
 48 | 
 49 |     def update(self, current, values=[], exact=[], strict=[]):
 50 |         """
 51 |         Updates the progress bar.
 52 |         # Arguments
 53 |             current: Index of current step.
 54 |             values: List of tuples (name, value_for_last_step).
 55 |                 The progress bar will display averages for these values.
 56 |             exact: List of tuples (name, value_for_last_step).
 57 |                 The progress bar will display these values directly.
 58 |         """
 59 | 
 60 |         for k, v in values:
 61 |             if k not in self.sum_values:
 62 |                 self.sum_values[k] = [v * (current - self.seen_so_far),
 63 |                                       current - self.seen_so_far]
 64 |                 self.unique_values.append(k)
 65 |             else:
 66 |                 self.sum_values[k][0] += v * (current - self.seen_so_far)
 67 |                 self.sum_values[k][1] += (current - self.seen_so_far)
 68 |         for k, v in exact:
 69 |             if k not in self.sum_values:
 70 |                 self.unique_values.append(k)
 71 |             self.sum_values[k] = [v, 1]
 72 | 
 73 |         for k, v in strict:
 74 |             if k not in self.sum_values:
 75 |                 self.unique_values.append(k)
 76 |             self.sum_values[k] = v
 77 | 
 78 |         self.seen_so_far = current
 79 | 
 80 |         now = time.time()
 81 |         if self.verbose == 1:
 82 |             prev_total_width = self.total_width
 83 |             sys.stdout.write("\b" * prev_total_width)
 84 |             sys.stdout.write("\r")
 85 | 
 86 |             numdigits = int(np.floor(np.log10(self.target))) + 1
 87 |             barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
 88 |             bar = barstr % (current, self.target)
 89 |             prog = float(current)/self.target
 90 |             prog_width = int(self.width*prog)
 91 |             if prog_width > 0:
 92 |                 bar += ('='*(prog_width-1))
 93 |                 if current < self.target:
 94 |                     bar += '>'
 95 |                 else:
 96 |                     bar += '='
 97 |             bar += ('.'*(self.width-prog_width))
 98 |             bar += ']'
 99 |             sys.stdout.write(bar)
100 |             self.total_width = len(bar)
101 | 
102 |             if current:
103 |                 time_per_unit = (now - self.start) / current
104 |             else:
105 |                 time_per_unit = 0
106 |             eta = time_per_unit*(self.target - current)
107 |             info = ''
108 |             if current < self.target:
109 |                 info += ' - ETA: %ds' % eta
110 |             else:
111 |                 info += ' - %ds' % (now - self.start)
112 |             for k in self.unique_values:
113 |                 if type(self.sum_values[k]) is list:
114 |                     info += ' - %s: %.4f' % (k,
115 |                         self.sum_values[k][0] / max(1, self.sum_values[k][1]))
116 |                 else:
117 |                     info += ' - %s: %s' % (k, self.sum_values[k])
118 | 
119 |             self.total_width += len(info)
120 |             if prev_total_width > self.total_width:
121 |                 info += ((prev_total_width-self.total_width) * " ")
122 | 
123 |             sys.stdout.write(info)
124 |             sys.stdout.flush()
125 | 
126 |             if current >= self.target:
127 |                 sys.stdout.write("\n")
128 | 
129 |         if self.verbose == 2:
130 |             if current >= self.target:
131 |                 info = '%ds' % (now - self.start)
132 |                 for k in self.unique_values:
133 |                     info += ' - %s: %.4f' % (k,
134 |                         self.sum_values[k][0] / max(1, self.sum_values[k][1]))
135 |                 sys.stdout.write(info + "\n")
136 | 
137 |     def add(self, n, values=[]):
138 |         self.update(self.seen_so_far+n, values)
139 | 
140 | 
141 | 


--------------------------------------------------------------------------------
/model/models.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | import tensorflow as tf
  4 | from sklearn.metrics import precision_recall_fscore_support, classification_report, confusion_matrix
  5 | 
  6 | 
  7 | from .data_utils import minibatches, pad_sequences, get_chunks
  8 | from .general_utils import Progbar
  9 | from .base_model import BaseModel
 10 | 
 11 | 
 12 | class HANNModel(BaseModel):
 13 |     """Specialized class of Model for NER"""
 14 | 
 15 |     def __init__(self, config):
 16 |         super(HANNModel, self).__init__(config)
 17 |         self.idx_to_tag = {idx: tag for tag, idx in
 18 |                            self.config.vocab_tags.items()}
 19 |         self.idx_to_words = {idx: word for word, idx in
 20 |                            self.config.vocab_words.items()}
 21 |         # self.class_weights = [self.config.weight_tags[tag] for idx, tag in sorted(self.idx_to_tag.items())]
 22 |         self.initializer = tf.contrib.layers.xavier_initializer()
 23 |         self.regularizer = tf.contrib.layers.l2_regularizer(scale=self.config.l2_reg_lambda)
 24 | 
 25 | 
 26 |     def add_placeholders(self):
 27 |         """Define placeholders = entries to computational graph"""
 28 |         # shape = (batch size)
 29 |         self.document_lengths = tf.placeholder(tf.int32, shape=[None],
 30 |                         name="document_lengths")
 31 | 
 32 |         # shape = (batch size, max length of documents in batch (how many sentences in one abstract), max length of sentence in batch)
 33 |         self.word_ids = tf.placeholder(tf.int32, shape=[None, None, None],
 34 |                         name="word_ids")
 35 | 
 36 |         # shape = (batch_size, max_length of sentence)
 37 |         self.sentence_lengths = tf.placeholder(tf.int32, shape=[None, None],
 38 |                         name="word_lengths")
 39 | 
 40 |         # shape = (batch_size, max_length of sentence)
 41 |         self.word_lengths = tf.placeholder(tf.int32, shape=[None, None, None],
 42 |                         name="word_lengths")
 43 | 
 44 |         # shape = (batch size, max length of sentence in batch)
 45 |         self.labels = tf.placeholder(tf.int32, shape=[None, None],
 46 |                         name="labels")
 47 | 
 48 |         # hyper parameters
 49 |         self.dropout = tf.placeholder(dtype=tf.float32, shape=[],
 50 |                         name="dropout")
 51 |         self.lr = tf.placeholder(dtype=tf.float32, shape=[],
 52 |                         name="lr")
 53 | 
 54 | 
 55 |     def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
 56 |         """Given some data, pad it and build a feed dictionary
 57 | 
 58 |         Args:
 59 |             words: list of sentences. A sentence is a list of ids of a list of
 60 |                 words. A word is a list of ids
 61 |             labels: list of ids
 62 |             lr: (float) learning rate
 63 |             dropout: (float) keep prob
 64 | 
 65 |         Returns:
 66 |             dict {placeholder: value}
 67 | 
 68 |         """
 69 |         # perform padding of the given data
 70 |         _, document_lengths = pad_sequences(words, pad_tok=0, nlevels=1)
 71 |         word_ids, sentence_lengths = pad_sequences(words, pad_tok=0, nlevels=2)
 72 | 
 73 |         # build feed dictionary
 74 |         feed = {
 75 |             self.word_ids: word_ids,
 76 |             self.document_lengths: document_lengths,
 77 |             self.sentence_lengths: sentence_lengths
 78 |         }
 79 | 
 80 |         if labels is not None:
 81 |             labels, _ = pad_sequences(labels, 0, nlevels=1)
 82 |             feed[self.labels] = labels
 83 | 
 84 |         if lr is not None:
 85 |             feed[self.lr] = lr
 86 | 
 87 |         if dropout is not None:
 88 |             feed[self.dropout] = dropout
 89 | 
 90 |         return feed, document_lengths
 91 | 
 92 | 
 93 |     def add_word_embeddings_op(self):
 94 |         """Defines self.word_embeddings
 95 | 
 96 |         If self.config.embeddings is not None and is a np array initialized
 97 |         with pre-trained word vectors, the word embeddings is just a look-up
 98 |         and we don't train the vectors. Otherwise, a random matrix with
 99 |         the correct shape is initialized.
100 |         """
101 |         with tf.variable_scope("words"):
102 |             if self.config.embeddings is None:
103 |                 self.logger.info("WARNING: randomly initializing word vectors")
104 |                 _word_embeddings = tf.get_variable(
105 |                         name="_word_embeddings",
106 |                         dtype=tf.float32,
107 |                         shape=[self.config.nwords, self.config.dim_word])
108 |             else:
109 |                 _word_embeddings = tf.Variable(
110 |                         self.config.embeddings,
111 |                         name="_word_embeddings",
112 |                         dtype=tf.float32,
113 |                         trainable=self.config.train_embeddings)
114 | 
115 |             word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
116 |                     self.word_ids, name="word_embeddings")
117 | 
118 |         self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout)
119 | 
120 | 
121 |     def add_logits_op(self):
122 |         """Defines self.logits
123 | 
124 |         For each word in each sentence of the batch, it corresponds to a vector
125 |         of scores, of dimension equal to the number of tags.
126 |         """
127 |         s = tf.shape(self.word_embeddings)
128 |         word_embeddings_dim = self.config.dim_word
129 | 
130 |         sentence_lengths = tf.reshape(self.sentence_lengths, shape=[s[0]*s[1]])
131 |         
132 |         word_embeddings = tf.reshape(self.word_embeddings, 
133 |                             shape=[s[0]*s[1], s[-2], word_embeddings_dim])
134 | 
135 |         with tf.variable_scope("bi-lstm-sentence"):
136 |             cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm_sentence)
137 |             cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm_sentence)
138 | 
139 |             (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
140 |                     cell_fw, cell_bw, word_embeddings,
141 |                     sequence_length=sentence_lengths, dtype=tf.float32)
142 |             output = tf.concat([output_fw, output_bw], axis=-1)
143 | 
144 |             W_word = tf.get_variable("weight", dtype=tf.float32, 
145 |                     initializer=self.initializer, regularizer=self.regularizer,
146 |                     shape=[2*self.config.hidden_size_lstm_sentence, self.config.attention_size])
147 |             b_word = tf.get_variable("bias", shape=[self.config.attention_size],
148 |                 dtype=tf.float32, initializer=tf.zeros_initializer())
149 |             U_word = tf.get_variable("U-noreg", dtype=tf.float32, 
150 |                     initializer=self.initializer, 
151 |                     shape=[self.config.attention_size, 1])
152 | 
153 |             output = tf.reshape(output, shape=[-1, 2*self.config.hidden_size_lstm_sentence])
154 |             U_sent = tf.tanh(tf.matmul(output, W_word) + b_word)
155 |             A = tf.nn.softmax(tf.reshape(tf.squeeze(tf.matmul(U_sent, U_word)), shape=[-1, s[2]]))
156 |             output = tf.reshape(output, shape=[-1, s[2], 2*self.config.hidden_size_lstm_sentence])
157 |             output = tf.reduce_sum(tf.multiply(output, tf.tile(tf.expand_dims(A, axis=-1), 
158 |                                     [1, 1, 2*self.config.hidden_size_lstm_sentence])), axis=1)
159 | 
160 |         # dropout
161 |         output = tf.nn.dropout(output, self.dropout)
162 | 
163 |         with tf.variable_scope("proj"):
164 |             W_infer = tf.get_variable("weight", dtype=tf.float32, 
165 |                     initializer=self.initializer, regularizer=self.regularizer,
166 |                     shape=[2*self.config.hidden_size_lstm_sentence, self.config.ntags])
167 | 
168 |             b_infer = tf.get_variable("bias", shape=[self.config.ntags],
169 |                     dtype=tf.float32, initializer=tf.zeros_initializer())
170 | 
171 |             pred = tf.matmul(output, W_infer) + b_infer
172 |             self.logits = tf.reshape(pred, [-1, s[1], self.config.ntags])
173 | 
174 | 
175 |     def add_pred_op(self):
176 |         """Defines self.labels_pred
177 | 
178 |         This op is defined only in the case where we don't use a CRF since in
179 |         that case we can make the prediction "in the graph" (thanks to tf
180 |         functions in other words). With theCRF, as the inference is coded
181 |         in python and not in pure tensroflow, we have to make the prediciton
182 |         outside the graph.
183 |         """
184 |         if not self.config.use_crf:
185 |             self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1),
186 |                     tf.int32)
187 | 
188 | 
189 |     def add_loss_op(self):
190 |         """Defines the loss"""
191 |         if self.config.use_crf:
192 |             log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
193 |                     self.logits, self.labels, self.document_lengths)
194 |             self.trans_params = trans_params # need to evaluate it for decoding
195 |             self.loss = tf.reduce_mean(-log_likelihood)
196 |         else:
197 |             losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
198 |                     logits=self.logits, labels=self.labels)
199 |             # class_weights = tf.constant(self.class_weights)
200 |             # weights = tf.gather(class_weights, self.labels, axis=-1)
201 |             # losses = tf.losses.sparse_softmax_cross_entropy(
202 |             #                         labels=self.labels, 
203 |             #                         logits=self.logits, 
204 |             #                         weights=weights,
205 |             #                         reduction=tf.losses.Reduction.NONE)
206 |             mask = tf.sequence_mask(self.document_lengths)
207 |             losses = tf.boolean_mask(losses, mask)
208 |             self.loss = tf.reduce_mean(losses)
209 | 
210 |         # add l2 regularizationtete
211 |         l2 = self.config.l2_reg_lambda * sum([
212 |             tf.nn.l2_loss(tf_var)
213 |             for tf_var in tf.trainable_variables()
214 |             if not ("noreg" in tf_var.name or "bias" in tf_var.name)])
215 |         self.loss += l2
216 | 
217 |         # for tensorboard
218 |         tf.summary.scalar("loss", self.loss)
219 | 
220 | 
221 |     def build(self):
222 |         # NER specific functions
223 |         self.add_placeholders()
224 |         self.add_word_embeddings_op()
225 |         self.add_logits_op()
226 |         self.add_pred_op()
227 |         self.add_loss_op()
228 | 
229 |         # Generic functions that add training op and initialize session
230 |         self.add_train_op(self.config.lr_method, self.lr, self.loss,
231 |                 self.config.clip)
232 |         self.initialize_session() # now self.sess is defined and vars are init
233 | 
234 | 
235 |     def predict_batch(self, words):
236 |         """
237 |         Args:
238 |             words: list of sentences
239 | 
240 |         Returns:
241 |             labels_pred: list of labels for each sentence
242 |             document_length
243 | 
244 |         """
245 |         fd, document_lengths = self.get_feed_dict(words, dropout=1.0)
246 | 
247 |         if self.config.use_crf:
248 |             # get tag scores and transition params of CRF
249 |             viterbi_sequences = []
250 |             logits, trans_params = self.sess.run(
251 |                     [self.logits, self.trans_params], feed_dict=fd)
252 | 
253 |             # iterate over the sentences because no batching in vitervi_decode
254 |             for logit, document_length in zip(logits, document_lengths):
255 |                 logit = logit[:document_length] # keep only the valid steps
256 |                 viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
257 |                         logit, trans_params)
258 |                 viterbi_sequences += [viterbi_seq]
259 | 
260 |             return viterbi_sequences, document_lengths
261 | 
262 |         else:
263 |             labels_pred = self.sess.run(self.labels_pred, feed_dict=fd)
264 | 
265 |             return labels_pred, document_lengths
266 | 
267 | 
268 |     def run_epoch(self, train, dev, epoch):
269 |         """Performs one complete pass over the train set and evaluate on dev
270 | 
271 |         Args:
272 |             train: dataset that yields tuple of sentences, tags
273 |             dev: dataset
274 |             epoch: (int) index of the current epoch
275 | 
276 |         Returns:
277 |             f1: (python float), score to select model on, higher is better
278 | 
279 |         """
280 |         # progbar stuff for logging
281 |         batch_size = self.config.batch_size
282 |         nbatches = (len(train) + batch_size - 1) // batch_size
283 |         prog = Progbar(target=nbatches)
284 | 
285 |         # iterate over dataset
286 |         for i, (words, labels) in enumerate(minibatches(train, batch_size)):
287 |             fd, _ = self.get_feed_dict(words, labels, self.config.lr,
288 |                     self.config.dropout)
289 | 
290 |             _, train_loss, summary = self.sess.run(
291 |                     [self.train_op, self.loss, self.merged], feed_dict=fd)
292 | 
293 |             if not self.config.train_accuracy:
294 |                 prog.update(i + 1, [("train loss", train_loss)])
295 |             else:
296 |                 labels_pred, document_lengths = self.predict_batch(words)
297 |                 accs = []
298 |                 for lab, lab_pred, length in zip(labels, labels_pred,
299 |                                                  document_lengths):
300 |                     lab      = lab[:length]
301 |                     lab_pred = lab_pred[:length]
302 |                     accs    += [a==b for (a, b) in zip(lab, lab_pred)]
303 |                 acc = np.mean(accs)
304 |                 prog.update(i + 1, [("train loss", train_loss), ("accuracy", acc)])
305 | 
306 |             # tensorboard
307 |             if i % 10 == 0:
308 |                 self.file_writer.add_summary(summary, epoch*nbatches + i)
309 | 
310 |         metrics = self.run_evaluate(dev, report=True)
311 |         msg = " - ".join(["{} {:04.3f}".format(k, v)
312 |                     if k == 'acc' else '{} {}'.format(k, ', '.join(['{}: {:04.2f}'.format(a, b) \
313 |                     for a, b in v.items()])) for k, v in metrics.items()])
314 |         self.logger.info(msg)
315 | 
316 |         return np.mean(list(metrics["f1"].values()))
317 | 
318 | 
319 |     def run_evaluate(self, test, report=False):
320 |         """Evaluates performance on test set
321 | 
322 |         Args:
323 |             test: dataset that yields tuple of (sentences, tags)
324 | 
325 |         Returns:
326 |             metrics: (dict) metrics["acc"] = 98.4, ...
327 | 
328 |         """
329 |         accs = []
330 |         labs = []
331 |         labs_pred = []
332 |         for words, labels in minibatches(test, self.config.batch_size):
333 |             labels_pred, document_lengths = self.predict_batch(words)
334 | 
335 |             for lab, lab_pred, length in zip(labels, labels_pred,
336 |                                              document_lengths):
337 |                 lab      = lab[:length]
338 |                 lab_pred = lab_pred[:length]
339 |                 accs    += [a==b for (a, b) in zip(lab, lab_pred)]
340 | 
341 |                 # lab_chunks      = set(get_chunks(lab, self.config.vocab_tags))
342 |                 # lab_pred_chunks = set(get_chunks(lab_pred,
343 |                                                  # self.config.vocab_tags))
344 | 
345 |                 # correct_preds += len(accs)
346 |                 # total_preds   += len(lab_pred)
347 |                 # total_correct += len(lab)
348 | 
349 |                 labs.extend(lab)
350 |                 labs_pred.extend(lab_pred)
351 | 
352 |         # p   = correct_preds / total_preds if correct_preds > 0 else 0
353 |         # r   = correct_preds / total_correct if correct_preds > 0 else 0
354 |         # f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
355 |         precision, recall, f1, _ = precision_recall_fscore_support(labs, labs_pred)
356 |         acc = np.mean(accs)
357 | 
358 |         if report:
359 |             target_names = [self.idx_to_tag[i] for i in range(len(self.idx_to_tag))]
360 |             print(classification_report(labs, labs_pred, target_names=target_names, digits=4))
361 |             print(self.idx_to_tag)
362 |             print(confusion_matrix(labs, labs_pred))
363 | 
364 |         return {"acc": 100*acc, 
365 |                 'precision': {tag: precision[self.config.vocab_tags[tag]] for tag in ['P', 'I', 'O']},
366 |                 'recall': {tag: recall[self.config.vocab_tags[tag]] for tag in ['P', 'I', 'O']},
367 |                 'f1': {tag: f1[self.config.vocab_tags[tag]] for tag in ['P', 'I', 'O']}}
368 | 
369 | 
370 |     def predict(self, words_raw):
371 |         """Returns list of tags
372 | 
373 |         Args:
374 |             words_raw: list of words (string), just one sentence (no batch)
375 | 
376 |         Returns:
377 |             preds: list of tags (string), one for each word in the sentence
378 | 
379 |         """
380 |         words = [self.config.processing_word(w) for w in words_raw]
381 |         if type(words[0]) == tuple:
382 |             words = zip(*words)
383 |         pred_ids, _ = self.predict_batch([words])
384 |         preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]
385 | 
386 |         return preds
387 | 


--------------------------------------------------------------------------------
/model/ner_model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | import tensorflow as tf
  4 | 
  5 | 
  6 | from .data_utils import minibatches, pad_sequences, get_chunks
  7 | from .general_utils import Progbar
  8 | from .base_model import BaseModel
  9 | 
 10 | 
 11 | class NERModel(BaseModel):
 12 |     """Specialized class of Model for NER"""
 13 | 
 14 |     def __init__(self, config):
 15 |         super(NERModel, self).__init__(config)
 16 |         self.idx_to_tag = {idx: tag for tag, idx in
 17 |                            self.config.vocab_tags.items()}
 18 | 
 19 | 
 20 |     def add_placeholders(self):
 21 |         """Define placeholders = entries to computational graph"""
 22 |         # shape = (batch size, max length of sentence in batch)
 23 |         self.word_ids = tf.placeholder(tf.int32, shape=[None, None],
 24 |                         name="word_ids")
 25 | 
 26 |         # shape = (batch size)
 27 |         self.sequence_lengths = tf.placeholder(tf.int32, shape=[None],
 28 |                         name="sequence_lengths")
 29 | 
 30 |         # shape = (batch size, max length of sentence, max length of word)
 31 |         self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None],
 32 |                         name="char_ids")
 33 | 
 34 |         # shape = (batch_size, max_length of sentence)
 35 |         self.word_lengths = tf.placeholder(tf.int32, shape=[None, None],
 36 |                         name="word_lengths")
 37 | 
 38 |         # shape = (batch size, max length of sentence in batch)
 39 |         self.labels = tf.placeholder(tf.int32, shape=[None, None],
 40 |                         name="labels")
 41 | 
 42 |         # hyper parameters
 43 |         self.dropout = tf.placeholder(dtype=tf.float32, shape=[],
 44 |                         name="dropout")
 45 |         self.lr = tf.placeholder(dtype=tf.float32, shape=[],
 46 |                         name="lr")
 47 | 
 48 | 
 49 |     def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
 50 |         """Given some data, pad it and build a feed dictionary
 51 | 
 52 |         Args:
 53 |             words: list of sentences. A sentence is a list of ids of a list of
 54 |                 words. A word is a list of ids
 55 |             labels: list of ids
 56 |             lr: (float) learning rate
 57 |             dropout: (float) keep prob
 58 | 
 59 |         Returns:
 60 |             dict {placeholder: value}
 61 | 
 62 |         """
 63 |         # perform padding of the given data
 64 |         if self.config.use_chars:
 65 |             char_ids, word_ids = zip(*words)
 66 |             word_ids, sequence_lengths = pad_sequences(word_ids, 0)
 67 |             char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
 68 |                 nlevels=2)
 69 |         else:
 70 |             word_ids, sequence_lengths = pad_sequences(words, 0)
 71 | 
 72 |         # build feed dictionary
 73 |         feed = {
 74 |             self.word_ids: word_ids,
 75 |             self.sequence_lengths: sequence_lengths
 76 |         }
 77 | 
 78 |         if self.config.use_chars:
 79 |             feed[self.char_ids] = char_ids
 80 |             feed[self.word_lengths] = word_lengths
 81 | 
 82 |         if labels is not None:
 83 |             labels, _ = pad_sequences(labels, 0)
 84 |             feed[self.labels] = labels
 85 | 
 86 |         if lr is not None:
 87 |             feed[self.lr] = lr
 88 | 
 89 |         if dropout is not None:
 90 |             feed[self.dropout] = dropout
 91 | 
 92 |         return feed, sequence_lengths
 93 | 
 94 | 
 95 |     def add_word_embeddings_op(self):
 96 |         """Defines self.word_embeddings
 97 | 
 98 |         If self.config.embeddings is not None and is a np array initialized
 99 |         with pre-trained word vectors, the word embeddings is just a look-up
100 |         and we don't train the vectors. Otherwise, a random matrix with
101 |         the correct shape is initialized.
102 |         """
103 |         with tf.variable_scope("words"):
104 |             if self.config.embeddings is None:
105 |                 self.logger.info("WARNING: randomly initializing word vectors")
106 |                 _word_embeddings = tf.get_variable(
107 |                         name="_word_embeddings",
108 |                         dtype=tf.float32,
109 |                         shape=[self.config.nwords, self.config.dim_word])
110 |             else:
111 |                 _word_embeddings = tf.Variable(
112 |                         self.config.embeddings,
113 |                         name="_word_embeddings",
114 |                         dtype=tf.float32,
115 |                         trainable=self.config.train_embeddings)
116 | 
117 |             word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
118 |                     self.word_ids, name="word_embeddings")
119 | 
120 |         with tf.variable_scope("chars"):
121 |             if self.config.use_chars:
122 |                 # get char embeddings matrix
123 |                 _char_embeddings = tf.get_variable(
124 |                         name="_char_embeddings",
125 |                         dtype=tf.float32,
126 |                         shape=[self.config.nchars, self.config.dim_char])
127 |                 char_embeddings = tf.nn.embedding_lookup(_char_embeddings,
128 |                         self.char_ids, name="char_embeddings")
129 | 
130 |                 # put the time dimension on axis=1
131 |                 s = tf.shape(char_embeddings)
132 |                 char_embeddings = tf.reshape(char_embeddings,
133 |                         shape=[s[0]*s[1], s[-2], self.config.dim_char])
134 |                 word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]])
135 | 
136 |                 # bi lstm on chars
137 |                 cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
138 |                         state_is_tuple=True)
139 |                 cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
140 |                         state_is_tuple=True)
141 |                 _output = tf.nn.bidirectional_dynamic_rnn(
142 |                         cell_fw, cell_bw, char_embeddings,
143 |                         sequence_length=word_lengths, dtype=tf.float32)
144 | 
145 |                 # read and concat output
146 |                 _, ((_, output_fw), (_, output_bw)) = _output
147 |                 output = tf.concat([output_fw, output_bw], axis=-1)
148 | 
149 |                 # shape = (batch size, max sentence length, char hidden size)
150 |                 output = tf.reshape(output,
151 |                         shape=[s[0], s[1], 2*self.config.hidden_size_char])
152 |                 word_embeddings = tf.concat([word_embeddings, output], axis=-1)
153 | 
154 |         self.word_embeddings =  tf.nn.dropout(word_embeddings, self.dropout)
155 | 
156 | 
157 |     def add_logits_op(self):
158 |         """Defines self.logits
159 | 
160 |         For each word in each sentence of the batch, it corresponds to a vector
161 |         of scores, of dimension equal to the number of tags.
162 |         """
163 |         with tf.variable_scope("bi-lstm"):
164 |             cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
165 |             cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
166 |             (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
167 |                     cell_fw, cell_bw, self.word_embeddings,
168 |                     sequence_length=self.sequence_lengths, dtype=tf.float32)
169 |             output = tf.concat([output_fw, output_bw], axis=-1)
170 |             output = tf.nn.dropout(output, self.dropout)
171 | 
172 |         with tf.variable_scope("proj"):
173 |             W = tf.get_variable("W", dtype=tf.float32,
174 |                     shape=[2*self.config.hidden_size_lstm, self.config.ntags])
175 | 
176 |             b = tf.get_variable("b", shape=[self.config.ntags],
177 |                     dtype=tf.float32, initializer=tf.zeros_initializer())
178 | 
179 |             nsteps = tf.shape(output)[1]
180 |             output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm])
181 |             pred = tf.matmul(output, W) + b
182 |             self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags])
183 | 
184 | 
185 |     def add_pred_op(self):
186 |         """Defines self.labels_pred
187 | 
188 |         This op is defined only in the case where we don't use a CRF since in
189 |         that case we can make the prediction "in the graph" (thanks to tf
190 |         functions in other words). With theCRF, as the inference is coded
191 |         in python and not in pure tensroflow, we have to make the prediciton
192 |         outside the graph.
193 |         """
194 |         if not self.config.use_crf:
195 |             self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1),
196 |                     tf.int32)
197 | 
198 | 
199 |     def add_loss_op(self):
200 |         """Defines the loss"""
201 |         if self.config.use_crf:
202 |             log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
203 |                     self.logits, self.labels, self.sequence_lengths)
204 |             self.trans_params = trans_params # need to evaluate it for decoding
205 |             self.loss = tf.reduce_mean(-log_likelihood)
206 |         else:
207 |             losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
208 |                     logits=self.logits, labels=self.labels)
209 |             mask = tf.sequence_mask(self.sequence_lengths)
210 |             losses = tf.boolean_mask(losses, mask)
211 |             self.loss = tf.reduce_mean(losses)
212 | 
213 |         # for tensorboard
214 |         tf.summary.scalar("loss", self.loss)
215 | 
216 | 
217 |     def build(self):
218 |         # NER specific functions
219 |         self.add_placeholders()
220 |         self.add_word_embeddings_op()
221 |         self.add_logits_op()
222 |         self.add_pred_op()
223 |         self.add_loss_op()
224 | 
225 |         # Generic functions that add training op and initialize session
226 |         self.add_train_op(self.config.lr_method, self.lr, self.loss,
227 |                 self.config.clip)
228 |         self.initialize_session() # now self.sess is defined and vars are init
229 | 
230 | 
231 |     def predict_batch(self, words):
232 |         """
233 |         Args:
234 |             words: list of sentences
235 | 
236 |         Returns:
237 |             labels_pred: list of labels for each sentence
238 |             sequence_length
239 | 
240 |         """
241 |         fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0)
242 | 
243 |         if self.config.use_crf:
244 |             # get tag scores and transition params of CRF
245 |             viterbi_sequences = []
246 |             logits, trans_params = self.sess.run(
247 |                     [self.logits, self.trans_params], feed_dict=fd)
248 | 
249 |             # iterate over the sentences because no batching in vitervi_decode
250 |             for logit, sequence_length in zip(logits, sequence_lengths):
251 |                 logit = logit[:sequence_length] # keep only the valid steps
252 |                 viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
253 |                         logit, trans_params)
254 |                 viterbi_sequences += [viterbi_seq]
255 | 
256 |             return viterbi_sequences, sequence_lengths
257 | 
258 |         else:
259 |             labels_pred = self.sess.run(self.labels_pred, feed_dict=fd)
260 | 
261 |             return labels_pred, sequence_lengths
262 | 
263 | 
264 |     def run_epoch(self, train, dev, epoch):
265 |         """Performs one complete pass over the train set and evaluate on dev
266 | 
267 |         Args:
268 |             train: dataset that yields tuple of sentences, tags
269 |             dev: dataset
270 |             epoch: (int) index of the current epoch
271 | 
272 |         Returns:
273 |             f1: (python float), score to select model on, higher is better
274 | 
275 |         """
276 |         # progbar stuff for logging
277 |         batch_size = self.config.batch_size
278 |         nbatches = (len(train) + batch_size - 1) // batch_size
279 |         prog = Progbar(target=nbatches)
280 | 
281 |         # iterate over dataset
282 |         for i, (words, labels) in enumerate(minibatches(train, batch_size)):
283 |             fd, _ = self.get_feed_dict(words, labels, self.config.lr,
284 |                     self.config.dropout)
285 | 
286 |             _, train_loss, summary = self.sess.run(
287 |                     [self.train_op, self.loss, self.merged], feed_dict=fd)
288 | 
289 |             prog.update(i + 1, [("train loss", train_loss)])
290 | 
291 |             # tensorboard
292 |             if i % 10 == 0:
293 |                 self.file_writer.add_summary(summary, epoch*nbatches + i)
294 | 
295 |         metrics = self.run_evaluate(dev)
296 |         msg = " - ".join(["{} {:04.2f}".format(k, v)
297 |                 for k, v in metrics.items()])
298 |         self.logger.info(msg)
299 | 
300 |         return metrics["f1"]
301 | 
302 | 
303 |     def run_evaluate(self, test):
304 |         """Evaluates performance on test set
305 | 
306 |         Args:
307 |             test: dataset that yields tuple of (sentences, tags)
308 | 
309 |         Returns:
310 |             metrics: (dict) metrics["acc"] = 98.4, ...
311 | 
312 |         """
313 |         accs = []
314 |         correct_preds, total_correct, total_preds = 0., 0., 0.
315 |         for words, labels in minibatches(test, self.config.batch_size):
316 |             labels_pred, sequence_lengths = self.predict_batch(words)
317 | 
318 |             for lab, lab_pred, length in zip(labels, labels_pred,
319 |                                              sequence_lengths):
320 |                 lab      = lab[:length]
321 |                 lab_pred = lab_pred[:length]
322 |                 accs    += [a==b for (a, b) in zip(lab, lab_pred)]
323 | 
324 |                 lab_chunks      = set(get_chunks(lab, self.config.vocab_tags))
325 |                 lab_pred_chunks = set(get_chunks(lab_pred,
326 |                                                  self.config.vocab_tags))
327 | 
328 |                 correct_preds += len(lab_chunks & lab_pred_chunks)
329 |                 total_preds   += len(lab_pred_chunks)
330 |                 total_correct += len(lab_chunks)
331 | 
332 |         p   = correct_preds / total_preds if correct_preds > 0 else 0
333 |         r   = correct_preds / total_correct if correct_preds > 0 else 0
334 |         f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
335 |         acc = np.mean(accs)
336 | 
337 |         return {"acc": 100*acc, "f1": 100*f1}
338 | 
339 | 
340 |     def predict(self, words_raw):
341 |         """Returns list of tags
342 | 
343 |         Args:
344 |             words_raw: list of words (string), just one sentence (no batch)
345 | 
346 |         Returns:
347 |             preds: list of tags (string), one for each word in the sentence
348 | 
349 |         """
350 |         words = [self.config.processing_word(w) for w in words_raw]
351 |         if type(words[0]) == tuple:
352 |             words = zip(*words)
353 |         pred_ids, _ = self.predict_batch([words])
354 |         preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]
355 | 
356 |         return preds
357 | 


--------------------------------------------------------------------------------
/model/resnet.py:
--------------------------------------------------------------------------------
  1 | import torch.nn as nn
  2 | import math
  3 | import torch.utils.model_zoo as model_zoo
  4 | 
  5 | 
# Names exported by a star-import of this module.
__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
           'resnet152']


# Checkpoint URLs, keyed by constructor name; the resnet* functions in this
# module download from here when called with pretrained=True.
model_urls = {
    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
}
 17 | 
 18 | 
 19 | def conv3x3(in_planes, out_planes, stride=1):
 20 |     """3x3 convolution with padding"""
 21 |     return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
 22 |                      padding=1, bias=False)
 23 | 
 24 | 
 25 | class BasicBlock(nn.Module):
 26 |     expansion = 1
 27 | 
 28 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 29 |         super(BasicBlock, self).__init__()
 30 |         self.conv1 = conv3x3(inplanes, planes, stride)
 31 |         self.bn1 = nn.BatchNorm2d(planes)
 32 |         self.relu = nn.ReLU(inplace=True)
 33 |         self.conv2 = conv3x3(planes, planes)
 34 |         self.bn2 = nn.BatchNorm2d(planes)
 35 |         self.downsample = downsample
 36 |         self.stride = stride
 37 | 
 38 |     def forward(self, x):
 39 |         residual = x
 40 | 
 41 |         out = self.conv1(x)
 42 |         out = self.bn1(out)
 43 |         out = self.relu(out)
 44 | 
 45 |         out = self.conv2(out)
 46 |         out = self.bn2(out)
 47 | 
 48 |         if self.downsample is not None:
 49 |             residual = self.downsample(x)
 50 | 
 51 |         out += residual
 52 |         out = self.relu(out)
 53 | 
 54 |         return out
 55 | 
 56 | 
 57 | class Bottleneck(nn.Module):
 58 |     expansion = 4
 59 | 
 60 |     def __init__(self, inplanes, planes, stride=1, downsample=None):
 61 |         super(Bottleneck, self).__init__()
 62 |         self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
 63 |         self.bn1 = nn.BatchNorm2d(planes)
 64 |         self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
 65 |                                padding=1, bias=False)
 66 |         self.bn2 = nn.BatchNorm2d(planes)
 67 |         self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
 68 |         self.bn3 = nn.BatchNorm2d(planes * 4)
 69 |         self.relu = nn.ReLU(inplace=True)
 70 |         self.downsample = downsample
 71 |         self.stride = stride
 72 | 
 73 |     def forward(self, x):
 74 |         residual = x
 75 | 
 76 |         out = self.conv1(x)
 77 |         out = self.bn1(out)
 78 |         out = self.relu(out)
 79 | 
 80 |         out = self.conv2(out)
 81 |         out = self.bn2(out)
 82 |         out = self.relu(out)
 83 | 
 84 |         out = self.conv3(out)
 85 |         out = self.bn3(out)
 86 | 
 87 |         if self.downsample is not None:
 88 |             residual = self.downsample(x)
 89 | 
 90 |         out += residual
 91 |         out = self.relu(out)
 92 | 
 93 |         return out
 94 | 
 95 | 
 96 | class ResNet(nn.Module):
 97 | 
 98 |     def __init__(self, block, layers, num_classes=1000):
 99 |         self.inplanes = 64
100 |         super(ResNet, self).__init__()
101 |         self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
102 |                                bias=False)
103 |         self.bn1 = nn.BatchNorm2d(64)
104 |         self.relu = nn.ReLU(inplace=True)
105 |         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
106 |         self.layer1 = self._make_layer(block, 64, layers[0])
107 |         self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
108 |         self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
109 |         self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
110 |         self.avgpool = nn.AvgPool2d(7, stride=1)
111 |         self.fc = nn.Linear(512 * block.expansion, num_classes)
112 | 
113 |         for m in self.modules():
114 |             if isinstance(m, nn.Conv2d):
115 |                 n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
116 |                 m.weight.data.normal_(0, math.sqrt(2. / n))
117 |             elif isinstance(m, nn.BatchNorm2d):
118 |                 m.weight.data.fill_(1)
119 |                 m.bias.data.zero_()
120 | 
121 |     def _make_layer(self, block, planes, blocks, stride=1):
122 |         downsample = None
123 |         if stride != 1 or self.inplanes != planes * block.expansion:
124 |             downsample = nn.Sequential(
125 |                 nn.Conv2d(self.inplanes, planes * block.expansion,
126 |                           kernel_size=1, stride=stride, bias=False),
127 |                 nn.BatchNorm2d(planes * block.expansion),
128 |             )
129 | 
130 |         layers = []
131 |         layers.append(block(self.inplanes, planes, stride, downsample))
132 |         self.inplanes = planes * block.expansion
133 |         for i in range(1, blocks):
134 |             layers.append(block(self.inplanes, planes))
135 | 
136 |         return nn.Sequential(*layers)
137 | 
138 |     def forward(self, x):
139 |         x = self.conv1(x)
140 |         x = self.bn1(x)
141 |         x = self.relu(x)
142 |         x = self.maxpool(x)
143 | 
144 |         x = self.layer1(x)
145 |         x = self.layer2(x)
146 |         x = self.layer3(x)
147 |         x = self.layer4(x)
148 | 
149 |         x = self.avgpool(x)
150 |         x = x.view(x.size(0), -1)
151 |         x = self.fc(x)
152 | 
153 |         return x
154 | 
155 | 
def resnet18(pretrained=False, **kwargs):
    """Constructs a ResNet-18 model.

    Args:
        pretrained (bool): If True, loads the weights downloaded from
            model_urls['resnet18'] (described upstream as ImageNet
            pre-trained) into the model
        **kwargs: forwarded to the ResNet constructor
    """
    net = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return net

    weights = model_zoo.load_url(model_urls['resnet18'])
    net.load_state_dict(weights)
    return net
165 | 
166 | 
def resnet34(pretrained=False, **kwargs):
    """Constructs a ResNet-34 model.

    Args:
        pretrained (bool): If True, loads the weights downloaded from
            model_urls['resnet34'] (described upstream as ImageNet
            pre-trained) into the model
        **kwargs: forwarded to the ResNet constructor
    """
    net = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net

    weights = model_zoo.load_url(model_urls['resnet34'])
    net.load_state_dict(weights)
    return net
176 | 
177 | 
def resnet50(pretrained=False, **kwargs):
    """Constructs a ResNet-50 model.

    Args:
        pretrained (bool): If True, loads the weights downloaded from
            model_urls['resnet50'] (described upstream as ImageNet
            pre-trained) into the model
        **kwargs: forwarded to the ResNet constructor
    """
    net = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
    if not pretrained:
        return net

    weights = model_zoo.load_url(model_urls['resnet50'])
    net.load_state_dict(weights)
    return net
187 | 
188 | 
def resnet101(pretrained=False, **kwargs):
    """Constructs a ResNet-101 model.

    Args:
        pretrained (bool): If True, loads the weights downloaded from
            model_urls['resnet101'] (described upstream as ImageNet
            pre-trained) into the model
        **kwargs: forwarded to the ResNet constructor
    """
    net = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
    if not pretrained:
        return net

    weights = model_zoo.load_url(model_urls['resnet101'])
    net.load_state_dict(weights)
    return net
198 | 
199 | 
def resnet152(pretrained=False, **kwargs):
    """Constructs a ResNet-152 model.

    Args:
        pretrained (bool): If True, loads the weights downloaded from
            model_urls['resnet152'] (described upstream as ImageNet
            pre-trained) into the model
        **kwargs: forwarded to the ResNet constructor
    """
    net = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
    if not pretrained:
        return net

    weights = model_zoo.load_url(model_urls['resnet152'])
    net.load_state_dict(weights)
    return net


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | from model.data_utils import Dataset
 2 | from model.models import HANNModel
 3 | from model.config import Config
 4 | import argparse
 5 | 
# Module-level parser handed to Config in main(); no arguments are
# registered on it in this file.
parser = argparse.ArgumentParser()
 7 | 
 8 | def main():
 9 |     # create instance of config
10 |     config = Config(parser)
11 | 
12 |     # build model
13 |     model = HANNModel(config)
14 |     model.build()
15 |     if config.restore:
16 |         model.restore_session("results/test/model.weights/") # optional, restore weights
17 |     # model.reinitialize_weights("proj")
18 | 
19 |     # create datasets
20 |     dev   = Dataset(config.filename_dev, config.processing_word,
21 |                          config.processing_tag, config.max_iter)
22 |     train = Dataset(config.filename_train, config.processing_word,
23 |                          config.processing_tag, config.max_iter)
24 |     test  = Dataset(config.filename_test, config.processing_word,
25 |                          config.processing_tag, config.max_iter)
26 | 
27 |     # train model
28 |     model.train(train, dev)
29 | 
30 |     # evaluate model
31 |     model.evaluate(test)
32 | 
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
35 | 


--------------------------------------------------------------------------------