├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── build_data.py
├── model
│   ├── config.py
│   ├── core.py
│   ├── crf.py
│   ├── data_utils.py
│   ├── general_utils.py
│   ├── ner_learner.py
│   └── ner_model.py
├── test.py
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | /data/
2 | 
3 | /saves/
4 | 
5 | /.idea/
6 | 
7 | /.vscode/
8 | 
9 | /results/
10 | /model/others/
11 | /model/backup/
12 | *sh
13 | /model/ent_learner.py
14 | /model/ent_model.py
15 | 
16 | *.pyc
17 | 
18 | example.py
19 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 yongyuwen
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch-Elmo-BiLSTMCRF
2 | 
3 | PyTorch implementation of the BiLSTM-CRF model described in https://guillaumegenthial.github.io/.
4 | 
5 | This implementation builds on that model by adding ELMo embeddings as a feature-representation option.
6 | (For more detail about ELMo, please see the publication ["Deep contextualized word representations"](http://arxiv.org/abs/1802.05365).)
7 | 
8 | For the Keras implementation (without ELMo) please refer to this [link](https://github.com/yongyuwen/sequence-tagging-ner).
9 | 
10 | ## Usage
11 | 1. **Requirements**:
12 |     a. Packages: Anaconda, PyTorch, AllenNLP (if on Linux and using ELMo)
13 |     b. Data: Train, valid and test datasets in CoNLL 2003 NER format (a sample is shown after this README)
14 |     c. GloVe 300-dimensional embeddings (glove.6B; not needed if using ELMo)
15 | 
16 | 2. **Configure Settings**:
17 |     a. Change settings in model/config.py
18 |     b. Main settings to change: file directories, model hyperparameters, etc.
19 | 
20 | 3. **Build Data**:
21 |     a. Run build_data.py
22 |         i. Builds the trimmed embedding matrix and text files of words, chars and tags, as well as the idx-to-word and idx-to-char mappings for the model to read
23 | 
24 | 4. **Train Model**:
25 |     a. Run train.py
26 | 
27 | 5. **Test Model**:
28 |     a. Run test.py
29 |     b. Evaluates on the test set. Also accepts an optional argument to predict on a custom string
30 | 
--------------------------------------------------------------------------------
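Regarding requirement 1b above: the loader in model/data_utils.py splits each line on spaces and keeps only the first column (the token) and the last column (the NER tag); blank lines separate sentences and `-DOCSTART-` lines are skipped. So both full four-column CoNLL 2003 files and minimal two-column files work. A small illustrative sample:

```
-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN B-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER
```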
/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/build_data.py:
--------------------------------------------------------------------------------
1 | from model.config import Config
2 | from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
3 |     get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
4 |     export_trimmed_glove_vectors, get_processing_word
5 | 
6 | 
7 | def main():
8 |     """Procedure to build data
9 | 
10 |     You MUST RUN this procedure. It iterates over the whole dataset (train,
11 |     dev and test) and extracts the vocabularies in terms of words, tags, and
12 |     characters. Having built the vocabularies it writes them to a file. The
13 |     writing of a vocabulary to a file assigns an id (the line #) to each word.
14 |     It then extracts the relevant GloVe vectors and stores them in a np array
15 |     such that the i-th entry corresponds to the i-th word in the vocabulary.
16 | 
17 | 
18 |     Note:
19 |         All settings (file paths, dimensions) are read from Config in model/config.py.
20 | 
21 |     """
22 |     # 1. get config and processing of words
23 |     config = Config(load=False)
24 | 
25 |     # 2. get the word-processing function
26 |     processing_word = get_processing_word(lowercase=True)
27 | 
28 |     # 3. Generators
29 |     dev = CoNLLDataset(config.filename_dev, processing_word)
30 |     test = CoNLLDataset(config.filename_test, processing_word)
31 |     train = CoNLLDataset(config.filename_train, processing_word)
32 | 
33 | 
34 |     # 4. Build Word and Tag vocab
35 |     vocab_words, vocab_tags = get_vocabs([train, dev, test])
36 |     vocab_glove = get_glove_vocab(config.filename_glove)
37 | 
38 |     # 5. Get a vocab set for words in both vocab_words and vocab_glove
39 |     vocab = vocab_words & vocab_glove
40 |     vocab.add(UNK)
41 |     vocab.add(NUM)
42 | 
43 |     # 6. Save vocab
44 |     write_vocab(vocab, config.filename_words)
45 |     write_vocab(vocab_tags, config.filename_tags)
46 | 
47 |     # 7. Trim GloVe Vectors
48 |     vocab = load_vocab(config.filename_words)
49 |     export_trimmed_glove_vectors(vocab, config.filename_glove,
50 |                                  config.filename_trimmed, config.dim_word)
51 | 
52 |     # 8. Build and save char vocab
53 |     train = CoNLLDataset(config.filename_train)
54 |     vocab_chars = get_char_vocab(train)
55 |     write_vocab(vocab_chars, config.filename_chars)
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     main()
60 | 
--------------------------------------------------------------------------------
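After this script runs, the vocab files and the trimmed embedding matrix exist on disk. A minimal sanity check of those artifacts (a sketch; the paths are the defaults from model/config.py):

```python
from model.data_utils import load_vocab, get_trimmed_glove_vectors

vocab = load_vocab("data/words.txt")   # dict: word -> id (the word's line number)
tags = load_vocab("data/tags.txt")

embeddings = get_trimmed_glove_vectors("data/glove.6B.300d.trimmed.npz")
assert embeddings.shape == (len(vocab), 300)  # row i holds the GloVe vector of word id i
print(len(vocab), "words,", len(tags), "tags")
```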
/model/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | from .general_utils import get_logger
5 | from .data_utils import get_trimmed_glove_vectors, load_vocab, \
6 |     get_processing_word
7 | 
8 | 
9 | class Config():
10 |     def __init__(self, load=True):
11 |         """Initialize hyperparameters and load vocabs
12 | 
13 |         Args:
14 |             load: (bool) if True, load the vocabs and pre-trained embeddings
15 |                 built by build_data.py; if False, skip loading
16 | 
17 |         """
18 |         # directory for training outputs
19 |         if not os.path.exists(self.dir_output):
20 |             os.makedirs(self.dir_output)
21 | 
22 |         # create instance of logger
23 |         self.logger = get_logger(self.path_log)
24 | 
25 |         # load if requested (default)
26 |         if load:
27 |             self.load()
28 | 
29 |     def load(self):
30 |         """Loads vocabulary, processing functions and embeddings
31 | 
32 |         Supposes that build_data.py has been run successfully and that
33 |         the corresponding files have been created (vocab and trimmed GloVe
34 |         vectors)
35 | 
36 |         """
37 |         # 1. vocabulary
38 |         self.vocab_words = load_vocab(self.filename_words)
39 |         self.vocab_tags = load_vocab(self.filename_tags)
40 |         self.vocab_chars = load_vocab(self.filename_chars)
41 | 
42 |         self.nwords = len(self.vocab_words)
43 |         self.nchars = len(self.vocab_chars)
44 |         self.ntags = len(self.vocab_tags)
45 | 
46 |         # 2. get processing functions that map str -> id
47 |         self.processing_word = get_processing_word(self.vocab_words,
48 |                 self.vocab_chars, lowercase=True, chars=self.use_chars)
49 |         self.processing_tag = get_processing_word(self.vocab_tags,
50 |                 lowercase=False, allow_unk=False)
51 | 
52 |         # 3. get pre-trained embeddings
53 |         self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
54 |                 if self.use_pretrained else None)
55 | 
56 | 
57 |     # general config
58 |     dir_output = "results/test/"
59 |     dir_model = dir_output
60 |     path_log = dir_output + "log.txt"
61 | 
62 |     # embeddings
63 |     dim_word = 300
64 |     dim_char = 100
65 | 
66 |     # glove files
67 |     filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(dim_word)
68 |     # trimmed embeddings (created from glove_filename with build_data.py)
69 |     filename_trimmed = "data/glove.6B.{}d.trimmed.npz".format(dim_word)
70 |     use_pretrained = True
71 | 
72 |     # dataset
73 |     # filename_dev = "data/coNLL/eng/eng.testa.iob"
74 |     # filename_test = "data/coNLL/eng/eng.testb.iob"
75 |     # filename_train = "data/coNLL/eng/eng.train.iob"
76 | 
77 |     #filename_dev = filename_test = filename_train = "data/test.txt" # test
78 | 
79 |     filename_dev = "data/valid.txt"
80 |     filename_test = "data/test.txt"
81 |     filename_train = "data/train.txt"
82 | 
83 |     max_iter = None  # if not None, max number of examples in Dataset
84 | 
85 |     # vocab (created from dataset with build_data.py)
86 |     filename_words = "data/words.txt"
87 |     filename_tags = "data/tags.txt"
88 |     filename_chars = "data/chars.txt"
89 | 
90 |     # training
91 |     train_embeddings = False
92 |     nepochs = 15
93 |     dropout = 0.5
94 |     batch_size = 5
95 |     lr_method = "adam"
96 |     lr = 0.001
97 |     lr_decay = 0.9
98 |     epoch_drop = 1  # Step Decay: per # epochs to apply lr_decay
99 |     clip = -1  # if negative, no clipping
100 |     nepoch_no_imprv = 3
101 | 
102 |     # model hyperparameters
103 |     hidden_size_char = 100  # lstm on chars
104 |     hidden_size_lstm = 300  # lstm on word embeddings
105 | 
106 |     ner_model_path = "saves/ner_{}e_glove".format(nepochs)
107 | 
108 |     # elmo config
109 |     use_elmo = False
110 |     dim_elmo = 1024
111 | 
112 |     # NOTE: if both chars and crf, only 1.6x slower on GPU
113 |     use_crf = True  # if crf, training is 1.7x slower on CPU
114 |     use_chars = False if use_elmo else True  # if char embedding, training is 3.5x slower on CPU
115 | 
--------------------------------------------------------------------------------
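A minimal sketch of how Config is meant to be consumed (it assumes build_data.py has already been run, since load=True reads the vocab and embedding files listed above):

```python
from model.config import Config

config = Config()  # load=True by default: reads vocabs and the trimmed GloVe matrix

# processing_word maps str -> ids; with use_chars=True it returns
# (list of char ids, word id), otherwise just the word id
print(config.nwords, config.nchars, config.ntags)
print(config.processing_word("Paris"))
```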
/model/core.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn, optim
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import numpy as np
6 | import os
7 | import spacy
8 | 
9 | USE_GPU = torch.cuda.is_available()
10 | 
11 | def to_gpu(x, *args, **kwargs):
12 |     '''Puts a pytorch variable on the GPU, if CUDA is available and USE_GPU is set to true.'''
13 |     return x.cuda(*args, **kwargs) if USE_GPU else x
14 | 
15 | def children(m): return m if isinstance(m, (list, tuple)) else list(m.children())
16 | 
17 | def set_trainable_attr(m, b):
18 |     m.trainable = b
19 |     for p in m.parameters(): p.requires_grad = b
20 | 
21 | def apply_leaf(m, f):
22 |     c = children(m)
23 |     if isinstance(m, nn.Module): f(m)
24 |     if len(c) > 0:
25 |         for l in c: apply_leaf(l, f)
26 | 
27 | def set_trainable(l, b):
28 |     apply_leaf(l, lambda m: set_trainable_attr(m, b))
29 | 
30 | def save_model(m, p): torch.save(m.state_dict(), p)
31 | 
32 | def T(a, half=False, cuda=True):
33 |     """
34 |     Convert a numpy array into a pytorch tensor.
35 |     If CUDA is available and USE_GPU=True, store the resulting tensor on the GPU.
36 |     """
37 |     if not torch.is_tensor(a):
38 |         a = np.array(np.ascontiguousarray(a))
39 |         if a.dtype in (np.int8, np.int16, np.int32, np.int64):
40 |             a = torch.LongTensor(a.astype(np.int64))
41 |         elif a.dtype in (np.float32, np.float64):
42 |             a = torch.cuda.HalfTensor(a) if half else torch.FloatTensor(a)
43 |         else: raise NotImplementedError(a.dtype)
44 |     if cuda: a = to_gpu(a, non_blocking=True)  # was `async=True`; `async` is a reserved word from Python 3.7
45 |     return a
46 | 
47 | def load_ner_model(m, p, strict=True):
48 |     sd = torch.load(p, map_location=lambda storage, loc: storage)
49 |     names = set(m.state_dict().keys())
50 |     for n in list(sd.keys()):  # list "detaches" the iterator
51 |         if n not in names and n+'_raw' in names:
52 |             if n+'_raw' not in sd: sd[n+'_raw'] = sd[n]
53 |             del sd[n]
54 |     m.load_state_dict(sd, strict=strict)
55 | 
--------------------------------------------------------------------------------
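A small sketch of the two helpers used throughout the training code: T converts numpy arrays to LongTensor or FloatTensor by dtype, moving the result to the GPU only when one is available:

```python
import numpy as np
from model.core import T, USE_GPU

word_ids = T(np.array([[1, 2, 3], [4, 5, 0]]))       # int64 -> LongTensor
scores = T(np.random.rand(2, 3).astype(np.float32))  # float32 -> FloatTensor
print(USE_GPU, word_ids.type(), scores.type())
```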
/model/crf.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union
2 | 
3 | from torch.autograd import Variable
4 | import torch
5 | import torch.nn as nn
6 | 
7 | 
8 | class CRF(nn.Module):
9 |     """Conditional random field.
10 |     This module implements a conditional random field [LMP]. The forward computation
11 |     of this class computes the log likelihood of the given sequence of tags and
12 |     emission score tensor. This class also has a ``decode`` method which finds the
13 |     best tag sequence given an emission score tensor using the `Viterbi algorithm`_.
14 |     Arguments
15 |     ---------
16 |     num_tags : int
17 |         Number of tags.
18 |     Attributes
19 |     ----------
20 |     num_tags : int
21 |         Number of tags passed to ``__init__``.
22 |     start_transitions : :class:`~torch.nn.Parameter`
23 |         Start transition score tensor of size ``(num_tags,)``.
24 |     end_transitions : :class:`~torch.nn.Parameter`
25 |         End transition score tensor of size ``(num_tags,)``.
26 |     transitions : :class:`~torch.nn.Parameter`
27 |         Transition score tensor of size ``(num_tags, num_tags)``.
28 |     References
29 |     ----------
30 |     .. [LMP] Lafferty, J., McCallum, A., Pereira, F. (2001).
31 |        "Conditional random fields: Probabilistic models for segmenting and
32 |        labeling sequence data". *Proc. 18th International Conf. on Machine
33 |        Learning*. Morgan Kaufmann. pp. 282–289.
34 |     .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
35 |     """
36 |     def __init__(self, num_tags: int) -> None:
37 |         if num_tags <= 0:
38 |             raise ValueError(f'invalid number of tags: {num_tags}')
39 |         super().__init__()
40 |         self.num_tags = num_tags
41 |         self.start_transitions = nn.Parameter(torch.Tensor(num_tags))
42 |         self.end_transitions = nn.Parameter(torch.Tensor(num_tags))
43 |         self.transitions = nn.Parameter(torch.Tensor(num_tags, num_tags))
44 | 
45 |         self.reset_parameters()
46 | 
47 |     def reset_parameters(self) -> None:
48 |         """Initialize the transition parameters.
49 |         The parameters will be initialized randomly from a uniform distribution
50 |         between -0.1 and 0.1.
51 |         """
52 |         nn.init.uniform_(self.start_transitions, -0.1, 0.1)
53 |         nn.init.uniform_(self.end_transitions, -0.1, 0.1)
54 |         nn.init.uniform_(self.transitions, -0.1, 0.1)
55 | 
56 |     def __repr__(self) -> str:
57 |         return f'{self.__class__.__name__}(num_tags={self.num_tags})'
58 | 
59 |     def forward(self,
60 |                 emissions: Variable,
61 |                 tags: Variable,
62 |                 mask: Optional[Variable] = None,
63 |                 reduce: bool = True,
64 |                 ) -> Variable:
65 |         """Compute the log likelihood of the given sequence of tags and emission score.
66 | Arguments 67 | --------- 68 | emissions : :class:`~torch.autograd.Variable` 69 | Emission score tensor of size ``(seq_length, batch_size, num_tags)``. 70 | tags : :class:`~torch.autograd.Variable` 71 | Sequence of tags as ``LongTensor`` of size ``(seq_length, batch_size)``. 72 | mask : :class:`~torch.autograd.Variable`, optional 73 | Mask tensor as ``ByteTensor`` of size ``(seq_length, batch_size)``. 74 | reduce : bool 75 | Whether to sum the log likelihood over the batch. 76 | Returns 77 | ------- 78 | :class:`~torch.autograd.Variable` 79 | The log likelihood. This will have size (1,) if ``reduce=True``, ``(batch_size,)`` 80 | otherwise. 81 | """ 82 | if emissions.dim() != 3: 83 | raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') 84 | if tags.dim() != 2: 85 | raise ValueError(f'tags must have dimension of 2, got {tags.dim()}') 86 | if emissions.size()[:2] != tags.size(): 87 | raise ValueError( 88 | 'the first two dimensions of emissions and tags must match, ' 89 | f'got {tuple(emissions.size()[:2])} and {tuple(tags.size())}' 90 | ) 91 | if emissions.size(2) != self.num_tags: 92 | raise ValueError( 93 | f'expected last dimension of emissions is {self.num_tags}, ' 94 | f'got {emissions.size(2)}' 95 | ) 96 | if mask is not None: 97 | if tags.size() != mask.size(): 98 | raise ValueError( 99 | f'size of tags and mask must match, got {tuple(tags.size())} ' 100 | f'and {tuple(mask.size())}' 101 | ) 102 | if not all(mask[0].data): 103 | raise ValueError('mask of the first timestep must all be on') 104 | 105 | if mask is None: 106 | mask = Variable(self._new(tags.size()).fill_(1)).byte() 107 | 108 | numerator = self._compute_joint_llh(emissions, tags, mask) 109 | denominator = self._compute_log_partition_function(emissions, mask) 110 | llh = numerator - denominator 111 | return llh if not reduce else torch.sum(llh) 112 | 113 | def decode(self, 114 | emissions: Union[Variable, torch.FloatTensor], 115 | mask: Optional[Union[Variable, torch.ByteTensor]] = None) -> List[List[int]]: 116 | """Find the most likely tag sequence using Viterbi algorithm. 117 | Arguments 118 | --------- 119 | emissions : :class:`~torch.autograd.Variable` or :class:`~torch.FloatTensor` 120 | Emission score tensor of size ``(seq_length, batch_size, num_tags)``. 121 | mask : :class:`~torch.autograd.Variable` or :class:`torch.ByteTensor` 122 | Mask tensor of size ``(seq_length, batch_size)``. 123 | Returns 124 | ------- 125 | list 126 | List of list containing the best tag sequence for each batch. 
127 | """ 128 | if emissions.dim() != 3: 129 | raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') 130 | if emissions.size(2) != self.num_tags: 131 | raise ValueError( 132 | f'expected last dimension of emissions is {self.num_tags}, ' 133 | f'got {emissions.size(2)}' 134 | ) 135 | if mask is not None and emissions.size()[:2] != mask.size(): 136 | raise ValueError( 137 | 'the first two dimensions of emissions and mask must match, ' 138 | f'got {tuple(emissions.size()[:2])} and {tuple(mask.size())}' 139 | ) 140 | 141 | if isinstance(emissions, Variable): 142 | emissions = emissions.data 143 | if mask is None: 144 | mask = self._new(emissions.size()[:2]).fill_(1).byte() 145 | elif isinstance(mask, Variable): 146 | mask = mask.data 147 | 148 | return self._viterbi_decode(emissions, mask) 149 | 150 | def _compute_joint_llh(self, 151 | emissions: Variable, 152 | tags: Variable, 153 | mask: Variable) -> Variable: 154 | # emissions: (seq_length, batch_size, num_tags) 155 | # tags: (seq_length, batch_size) 156 | # mask: (seq_length, batch_size) 157 | assert emissions.dim() == 3 and tags.dim() == 2 158 | assert emissions.size()[:2] == tags.size() 159 | assert emissions.size(2) == self.num_tags 160 | assert mask.size() == tags.size() 161 | assert all(mask[0].data) 162 | 163 | seq_length = emissions.size(0) 164 | mask = mask.float() 165 | 166 | # Start transition score 167 | llh = self.start_transitions[tags[0]] # (batch_size,) 168 | 169 | for i in range(seq_length - 1): 170 | cur_tag, next_tag = tags[i], tags[i+1] 171 | # Emission score for current tag 172 | llh += emissions[i].gather(1, cur_tag.view(-1, 1)).squeeze(1) * mask[i] 173 | # Transition score to next tag 174 | transition_score = self.transitions[cur_tag, next_tag] 175 | # Only add transition score if the next tag is not masked (mask == 1) 176 | llh += transition_score * mask[i+1] 177 | 178 | # Find last tag index 179 | last_tag_indices = mask.long().sum(0) - 1 # (batch_size,) 180 | last_tags = tags.gather(0, last_tag_indices.view(1, -1)).squeeze(0) 181 | 182 | # End transition score 183 | llh += self.end_transitions[last_tags] 184 | # Emission score for the last tag, if mask is valid (mask == 1) 185 | llh += emissions[-1].gather(1, last_tags.view(-1, 1)).squeeze(1) * mask[-1] 186 | 187 | return llh 188 | 189 | def _compute_log_partition_function(self, 190 | emissions: Variable, 191 | mask: Variable) -> Variable: 192 | # emissions: (seq_length, batch_size, num_tags) 193 | # mask: (seq_length, batch_size) 194 | assert emissions.dim() == 3 and mask.dim() == 2 195 | assert emissions.size()[:2] == mask.size() 196 | assert emissions.size(2) == self.num_tags 197 | assert all(mask[0].data) 198 | 199 | seq_length = emissions.size(0) 200 | mask = mask.float() 201 | 202 | # Start transition score and first emission 203 | log_prob = self.start_transitions.view(1, -1) + emissions[0] 204 | # Here, log_prob has size (batch_size, num_tags) where for each batch, 205 | # the j-th column stores the log probability that the current timestep has tag j 206 | 207 | for i in range(1, seq_length): 208 | # Broadcast log_prob over all possible next tags 209 | broadcast_log_prob = log_prob.unsqueeze(2) # (batch_size, num_tags, 1) 210 | # Broadcast transition score over all instances in the batch 211 | broadcast_transitions = self.transitions.unsqueeze(0) # (1, num_tags, num_tags) 212 | # Broadcast emission score over all possible current tags 213 | broadcast_emissions = emissions[i].unsqueeze(1) # (batch_size, 1, num_tags) 214 | # Sum 
current log probability, transition, and emission scores 215 | score = broadcast_log_prob + broadcast_transitions \ 216 | + broadcast_emissions # (batch_size, num_tags, num_tags) 217 | # Sum over all possible current tags, but we're in log prob space, so a sum 218 | # becomes a log-sum-exp 219 | score = self._log_sum_exp(score, 1) # (batch_size, num_tags) 220 | # Set log_prob to the score if this timestep is valid (mask == 1), otherwise 221 | # leave it alone 222 | log_prob = score * mask[i].unsqueeze(1) + log_prob * (1.-mask[i]).unsqueeze(1) 223 | 224 | # End transition score 225 | log_prob += self.end_transitions.view(1, -1) 226 | # Sum (log-sum-exp) over all possible tags 227 | return self._log_sum_exp(log_prob, 1) # (batch_size,) 228 | 229 | def _viterbi_decode(self, emissions: torch.FloatTensor, mask: torch.ByteTensor) \ 230 | -> List[List[int]]: 231 | # Get input sizes 232 | seq_length = emissions.size(0) 233 | batch_size = emissions.size(1) 234 | sequence_lengths = mask.long().sum(dim=0) 235 | 236 | # emissions: (seq_length, batch_size, num_tags) 237 | assert emissions.size(2) == self.num_tags 238 | 239 | # list to store the decoded paths 240 | best_tags_list = [] 241 | 242 | # Start transition 243 | viterbi_score = [] 244 | viterbi_score.append(self.start_transitions.data + emissions[0]) 245 | viterbi_path = [] 246 | 247 | # Here, viterbi_score is a list of tensors of shapes of (num_tags,) where value at 248 | # index i stores the score of the best tag sequence so far that ends with tag i 249 | # viterbi_path saves where the best tags candidate transitioned from; this is used 250 | # when we trace back the best tag sequence 251 | 252 | # Viterbi algorithm recursive case: we compute the score of the best tag sequence 253 | # for every possible next tag 254 | for i in range(1, seq_length): 255 | # Broadcast viterbi score for every possible next tag 256 | broadcast_score = viterbi_score[i - 1].view(batch_size, -1, 1) 257 | # Broadcast emission score for every possible current tag 258 | broadcast_emission = emissions[i].view(batch_size, 1, -1) 259 | # Compute the score matrix of shape (batch_size, num_tags, num_tags) where 260 | # for each sample, each entry at row i and column j stores the score of 261 | # transitioning from tag i to tag j and emitting 262 | score = broadcast_score + self.transitions.data + broadcast_emission 263 | # Find the maximum score over all possible current tag 264 | best_score, best_path = score.max(1) # (batch_size,num_tags,) 265 | # Save the score and the path 266 | viterbi_score.append(best_score) 267 | viterbi_path.append(best_path) 268 | 269 | # Now, compute the best path for each sample 270 | for idx in range(batch_size): 271 | # Find the tag which maximizes the score at the last timestep; this is our best tag 272 | # for the last timestep 273 | seq_end = sequence_lengths[idx]-1 274 | _, best_last_tag = (viterbi_score[seq_end][idx] + self.end_transitions.data).max(0) 275 | best_tags = [best_last_tag.item()] #[best_last_tag[0]] #[best_last_tag.item()] 276 | 277 | # We trace back where the best last tag comes from, append that to our best tag 278 | # sequence, and trace it back again, and so on 279 | for path in reversed(viterbi_path[:sequence_lengths[idx] - 1]): 280 | best_last_tag = path[idx][best_tags[-1]] 281 | best_tags.append(best_last_tag) 282 | 283 | # Reverse the order because we start from the last timestep 284 | best_tags.reverse() 285 | best_tags_list.append(best_tags) 286 | return best_tags_list 287 | 288 | @staticmethod 289 | def 
_log_sum_exp(tensor: Variable, dim: int) -> Variable:
290 |         # Find the max value along `dim`
291 |         offset, _ = tensor.max(dim)
292 |         # Make offset broadcastable
293 |         broadcast_offset = offset.unsqueeze(dim)
294 |         # Perform log-sum-exp safely
295 |         safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor - broadcast_offset), dim))
296 |         # Add offset back
297 |         return offset + safe_log_sum_exp
298 | 
299 |     def _new(self, *args, **kwargs) -> torch.FloatTensor:
300 |         param = next(self.parameters())
301 |         return param.data.new(*args, **kwargs)
302 | 
--------------------------------------------------------------------------------
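A minimal sketch of driving this CRF module on its own, with the shapes its docstrings prescribe (PyTorch 0.4-style Variables, matching the rest of the repo; the sizes are made up):

```python
import torch
from torch.autograd import Variable
from model.crf import CRF

seq_length, batch_size, num_tags = 7, 2, 5
crf = CRF(num_tags)

emissions = Variable(torch.randn(seq_length, batch_size, num_tags))
tags = Variable(torch.zeros(seq_length, batch_size).long())
mask = Variable(torch.ones(seq_length, batch_size).byte())

loss = -crf(emissions, tags, mask=mask)        # negative log likelihood, summed over the batch
best_paths = crf.decode(emissions, mask=mask)  # list of best tag-id sequences, one per batch element
```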
/model/data_utils.py:
--------------------------------------------------------------------------------
1 | " Data utils from https://github.com/guillaumegenthial/sequence_tagging "
2 | 
3 | import numpy as np
4 | import torch
5 | import os
6 | 
7 | 
8 | # shared global variables to be imported from model also
9 | UNK = "$UNK$"
10 | NUM = "$NUM$"
11 | NONE = "O"
12 | 
13 | 
14 | # special error message
15 | class MyIOError(Exception):
16 |     def __init__(self, filename):
17 |         # custom error message
18 |         message = """
19 | ERROR: Unable to locate file {}.
20 | 
21 | FIX: Have you tried running python build_data.py first?
22 | This will build the vocab files from your train, test and dev sets and
23 | trim your word vectors.
24 | """.format(filename)
25 |         super(MyIOError, self).__init__(message)
26 | 
27 | 
28 | class CoNLLDataset(object):
29 |     """Class that iterates over CoNLL Dataset
30 | 
31 |     __iter__ method yields a tuple (words, tags)
32 |         words: list of raw words
33 |         tags: list of raw tags
34 | 
35 |     If processing_word and processing_tag are not None,
36 |     optional preprocessing is applied
37 | 
38 |     Example:
39 |         ```python
40 |         data = CoNLLDataset(filename)
41 |         for sentence, tags in data:
42 |             pass
43 |         ```
44 | 
45 |     """
46 |     def __init__(self, filename, processing_word=None, processing_tag=None,
47 |                  max_iter=None, use_crf=True):
48 |         """
49 |         Args:
50 |             filename: path to the file
51 |             processing_word: (optional) function that takes a word as input
52 |             processing_tag: (optional) function that takes a tag as input (applied only when use_crf=True)
53 |             max_iter: (optional) max number of sentences to yield
54 | 
55 |         """
56 |         self.filename = filename
57 |         self.processing_word = processing_word
58 |         self.processing_tag = processing_tag
59 |         self.max_iter = max_iter
60 |         self.use_crf = use_crf
61 |         self.length = None
62 | 
63 | 
64 |     def __iter__(self):
65 |         niter = 0
66 |         with open(self.filename) as f:
67 |             words, tags = [], []
68 |             for line in f:
69 |                 line = line.strip()
70 |                 if (len(line) == 0 or line.startswith("-DOCSTART-")):
71 |                     if len(words) != 0:
72 |                         niter += 1
73 |                         if self.max_iter is not None and niter > self.max_iter:
74 |                             break
75 |                         yield words, tags
76 |                         words, tags = [], []
77 |                 else:
78 |                     ls = line.split(' ')
79 |                     word, tag = ls[0],ls[-1]
80 |                     if self.processing_word is not None:
81 |                         word = self.processing_word(word)
82 |                     if self.processing_tag is not None:
83 |                         if self.use_crf:
84 |                             tag = self.processing_tag(tag)
85 |                     words += [word]
86 |                     tags += [tag]
87 | 
88 | 
89 |     def __len__(self):
90 |         """Iterates once over the corpus to set and store length"""
91 |         if self.length is None:
92 |             self.length = 0
93 |             for _ in self:
94 |                 self.length += 1
95 | 
96 |         return self.length
97 | 
98 | 
99 | def get_vocabs(datasets):
100 |     """Build vocabulary from an iterable of dataset objects
101 | 
102 |     Args:
103 |         datasets: a list of dataset objects
104 | 
105 |     Returns:
106 |         a tuple (set of all words, set of all tags) found in the datasets
107 | 
108 |     """
109 |     print("Building vocab...")
110 |     vocab_words = set()
111 |     vocab_tags = set()
112 |     for dataset in datasets:
113 |         for words, tags in dataset:
114 |             vocab_words.update(words)
115 |             vocab_tags.update(tags)
116 |     print("- done. {} tokens".format(len(vocab_words)))
117 |     return vocab_words, vocab_tags
118 | 
119 | 
120 | def get_char_vocab(dataset):
121 |     """Build char vocabulary from a dataset object
122 | 
123 |     Args:
124 |         dataset: an iterator yielding tuples (sentence, tags)
125 | 
126 |     Returns:
127 |         a set of all the characters in the dataset
128 | 
129 |     """
130 |     print("Building char vocab...")
131 |     vocab_char = set()
132 |     for words, _ in dataset:
133 |         for word in words:
134 |             vocab_char.update(word)
135 |     print("- done. {} tokens".format(len(vocab_char)))
136 |     return vocab_char
137 | 
138 | 
139 | def get_glove_vocab(filename):
140 |     """Load vocab from file
141 | 
142 |     Args:
143 |         filename: path to the glove vectors
144 | 
145 |     Returns:
146 |         vocab: set() of strings
147 |     """
148 |     print("Building vocab...")
149 |     vocab = set()
150 |     with open(filename, encoding="utf8") as f:
151 |         for line in f:
152 |             word = line.strip().split(' ')[0]
153 |             vocab.add(word)
154 |     print("- done. {} tokens".format(len(vocab)))
155 |     return vocab
156 | 
157 | 
158 | def write_vocab(vocab, filename):
159 |     """Writes a vocab to a file
160 | 
161 |     Writes one word per line.
162 | 
163 |     Args:
164 |         vocab: iterable that yields words
165 |         filename: path to vocab file
166 | 
167 |     Returns:
168 |         None; the word's line number becomes its id
169 | 
170 |     """
171 |     print("Writing vocab...")
172 |     with open(filename, "w") as f:
173 |         for i, word in enumerate(vocab):
174 |             if i != len(vocab) - 1:
175 |                 f.write("{}\n".format(word))
176 |             else:
177 |                 f.write(word)
178 |     print("- done. {} tokens".format(len(vocab)))
179 | 
180 | 
181 | def load_vocab(filename):
182 |     """Loads vocab from a file
183 | 
184 |     Args:
185 |         filename: (string) the format of the file must be one word per line.
186 | 
187 |     Returns:
188 |         d: dict[word] = index
189 | 
190 |     """
191 |     try:
192 |         d = dict()
193 |         with open(filename) as f:
194 |             for idx, word in enumerate(f):
195 |                 word = word.strip()
196 |                 d[word] = idx
197 | 
198 |     except IOError:
199 |         raise MyIOError(filename)
200 |     return d
201 | 
202 | 
203 | def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
204 |     """Saves glove vectors in numpy array
205 | 
206 |     Args:
207 |         vocab: dictionary vocab[word] = index
208 |         glove_filename: a path to a glove file
209 |         trimmed_filename: a path where to store the matrix in npz format
210 |         dim: (int) dimension of embeddings
211 | 
212 |     """
213 |     embeddings = np.zeros([len(vocab), dim])
214 |     with open(glove_filename, encoding="utf8") as f:
215 |         for line in f:
216 |             line = line.strip().split(' ')
217 |             word = line[0]
218 |             embedding = [float(x) for x in line[1:]]
219 |             if word in vocab:
220 |                 word_idx = vocab[word]
221 |                 embeddings[word_idx] = np.asarray(embedding)
222 | 
223 |     np.savez_compressed(trimmed_filename, embeddings=embeddings)
224 | 
225 | 
226 | def get_trimmed_glove_vectors(filename):
227 |     """
228 |     Args:
229 |         filename: path to the npz file
230 | 
231 |     Returns:
232 |         matrix of embeddings (np array)
233 | 
234 |     """
235 |     try:
236 |         with np.load(filename) as data:
237 |             return data["embeddings"]
238 | 
239 |     except IOError:
240 |         raise MyIOError(filename)
241 | 
242 | 
243 | def get_processing_word(vocab_words=None, vocab_chars=None,
244 |                         lowercase=False, chars=False, allow_unk=True):
245 |     """Return a function that transforms a word (string) into an id,
246 |     or into a tuple (list of char ids, word id) when character ids are
247 |     requested.
248 | 
249 |     Args:
250 |         vocab_words: dict[word] = idx (and optionally vocab_chars: dict[char] = idx)
251 | 
252 |     Returns:
253 |         f("cat") = ([12, 4, 32], 12345)
254 |                  = (list of char ids, word id)
255 | 
256 |     """
257 |     def f(word):
258 |         # 0. get chars of words
259 |         if vocab_chars is not None and chars == True:
260 |             char_ids = []
261 |             for char in word:
262 |                 # ignore chars out of vocabulary
263 |                 if char in vocab_chars:
264 |                     char_ids += [vocab_chars[char]]
265 | 
266 |         # 1. preprocess word
267 |         if lowercase:
268 |             word = word.lower()
269 |         if word.isdigit():
270 |             word = NUM
271 | 
272 |         # 2. get id of word
273 |         if vocab_words is not None:
274 |             if word in vocab_words:
275 |                 word = vocab_words[word]
276 |             else:
277 |                 if allow_unk:
278 |                     word = vocab_words[UNK]
279 |                 else:
280 |                     raise Exception("Unknown key is not allowed. Check that "\
281 |                                     "your vocab (tags?) is correct")
282 | 
283 |         # 3.
return tuple char ids, word id 284 | if vocab_chars is not None and chars == True: 285 | return char_ids, word 286 | else: 287 | return word 288 | 289 | return f 290 | 291 | 292 | def _pad_sequences(sequences, pad_tok, max_length): 293 | """ 294 | Args: 295 | sequences: a generator of list or tuple 296 | pad_tok: the char to pad with 297 | 298 | Returns: 299 | a list of list where each sublist has same length 300 | """ 301 | sequence_padded, sequence_length = [], [] 302 | 303 | for seq in sequences: 304 | seq = list(seq) 305 | seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0) 306 | sequence_padded += [seq_] 307 | sequence_length += [min(len(seq), max_length)] 308 | 309 | return sequence_padded, sequence_length 310 | 311 | 312 | def pad_sequences(sequences, pad_tok, nlevels=1): 313 | """ 314 | Args: 315 | sequences: a generator of list or tuple 316 | pad_tok: the char to pad with 317 | nlevels: "depth" of padding, for the case where we have characters ids 318 | 319 | Returns: 320 | a list of list where each sublist has same length 321 | 322 | """ 323 | if nlevels == 1: 324 | max_length = max(map(lambda x : len(x), sequences)) 325 | sequence_padded, sequence_length = _pad_sequences(sequences, 326 | pad_tok, max_length) 327 | 328 | elif nlevels == 2: 329 | max_length_word = max([max(map(lambda x: len(x), seq)) 330 | for seq in sequences]) 331 | sequence_padded, sequence_length = [], [] 332 | for seq in sequences: 333 | # all words are same length now 334 | sp, sl = _pad_sequences(seq, pad_tok, max_length_word) 335 | sequence_padded += [sp] 336 | sequence_length += [sl] 337 | 338 | max_length_sentence = max(map(lambda x : len(x), sequences)) 339 | sequence_padded, _ = _pad_sequences(sequence_padded, 340 | [pad_tok]*max_length_word, max_length_sentence) 341 | sequence_length, _ = _pad_sequences(sequence_length, 0, 342 | max_length_sentence) 343 | 344 | return sequence_padded, sequence_length 345 | 346 | 347 | def minibatches(data, minibatch_size, use_crf=True): 348 | """ 349 | Args: 350 | data: generator of (sentence, tags) tuples 351 | minibatch_size: (int) 352 | 353 | Yields: 354 | list of tuples 355 | 356 | """ 357 | x_batch, y_batch = [], [] 358 | for (x, y) in data: 359 | if len(x_batch) == minibatch_size: 360 | yield x_batch, y_batch 361 | x_batch, y_batch = [], [] 362 | 363 | if type(x[0]) == tuple: 364 | x = zip(*x) 365 | x_batch += [x] 366 | if use_crf: 367 | y_batch += [y] 368 | else: 369 | if any([x.isdigit() for x in y]): 370 | y_batch.append([int(x) for x in y if x.isdigit()]) 371 | else: 372 | y_batch.append([0,0,0,0,0]) 373 | 374 | if len(x_batch) != 0: 375 | yield x_batch, y_batch 376 | 377 | 378 | def get_chunk_type(tok, idx_to_tag): 379 | """ 380 | Args: 381 | tok: id of token, ex 4 382 | idx_to_tag: dictionary {4: "B-PER", ...} 383 | 384 | Returns: 385 | tuple: "B", "PER" 386 | 387 | """ 388 | if isinstance(tok, torch.Tensor): tok = tok.item() 389 | tag_name = idx_to_tag[tok] 390 | 391 | tag_class = tag_name.split('-')[0] 392 | tag_type = tag_name.split('-')[-1] 393 | return tag_class, tag_type 394 | 395 | 396 | def get_chunks(seq, tags): 397 | """Given a sequence of tags, group entities and their position 398 | 399 | Args: 400 | seq: [4, 4, 0, 0, ...] 
sequence of labels
401 |         tags: dict[tag] = idx, e.g. tags["O"] = 0
402 | 
403 |     Returns:
404 |         list of (chunk_type, chunk_start, chunk_end)
405 | 
406 |     Example:
407 |         seq = [4, 5, 0, 3]
408 |         tags = {"O": 0, "B-PER": 4, "I-PER": 5, "B-LOC": 3}
409 |         result = [("PER", 0, 2), ("LOC", 3, 4)]
410 | 
411 |     """
412 |     default = tags[NONE]
413 |     idx_to_tag = {idx: tag for tag, idx in tags.items()}
414 |     chunks = []
415 |     chunk_type, chunk_start = None, None
416 |     for i, tok in enumerate(seq):
417 |         # End of a chunk
418 |         if tok == default and chunk_type is not None:
419 |             # Add a chunk.
420 |             chunk = (chunk_type, chunk_start, i)
421 |             chunks.append(chunk)
422 |             chunk_type, chunk_start = None, None
423 | 
424 |         # End of a chunk + start of a chunk!
425 |         elif tok != default:
426 |             tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
427 |             if chunk_type is None:
428 |                 chunk_type, chunk_start = tok_chunk_type, i
429 |             elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
430 |                 chunk = (chunk_type, chunk_start, i)
431 |                 chunks.append(chunk)
432 |                 chunk_type, chunk_start = tok_chunk_type, i
433 |         else:
434 |             pass
435 | 
436 |     # end condition
437 |     if chunk_type is not None:
438 |         chunk = (chunk_type, chunk_start, len(seq))
439 |         chunks.append(chunk)
440 | 
441 |     return chunks
442 | 
--------------------------------------------------------------------------------
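get_chunks is the core of the entity-level F1 computation in ner_learner.py: it converts a sequence of label ids into (type, start, end) spans. A runnable sketch using the docstring's example (the ids are illustrative):

```python
from model.data_utils import get_chunks

tags = {"O": 0, "B-PER": 4, "I-PER": 5, "B-LOC": 3}  # tag -> id, as loaded from data/tags.txt
seq = [4, 5, 0, 3]                                   # predicted ids for one sentence

print(get_chunks(seq, tags))  # -> [('PER', 0, 2), ('LOC', 3, 4)]
```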
/model/general_utils.py:
--------------------------------------------------------------------------------
1 | " Logger and Progress Bar from https://github.com/guillaumegenthial/sequence_tagging "
2 | 
3 | import time
4 | import sys
5 | import logging
6 | import numpy as np
7 | 
8 | 
9 | def get_logger(filename):
10 |     """Return a logger instance that writes in filename
11 | 
12 |     Args:
13 |         filename: (string) path to log.txt
14 | 
15 |     Returns:
16 |         logger: (instance of logger)
17 | 
18 |     """
19 |     logger = logging.getLogger('logger')
20 |     logger.setLevel(logging.DEBUG)
21 |     logging.basicConfig(format='%(message)s', level=logging.DEBUG)
22 |     handler = logging.FileHandler(filename)
23 |     handler.setLevel(logging.DEBUG)
24 |     handler.setFormatter(logging.Formatter(
25 |             '%(asctime)s:%(levelname)s: %(message)s'))
26 |     logging.getLogger().addHandler(handler)
27 | 
28 |     return logger
29 | 
30 | 
31 | class Progbar(object):
32 |     """Progbar class copied from keras (https://github.com/fchollet/keras/)
33 | 
34 |     Displays a progress bar.
35 |     Small edit: added a strict arg to update
36 |     # Arguments
37 |         target: Total number of steps expected.
38 |         width: Progress bar width on screen.
39 |     """
40 | 
41 |     def __init__(self, target, width=30, verbose=1):
42 |         self.width = width
43 |         self.target = target
44 |         self.sum_values = {}
45 |         self.unique_values = []
46 |         self.start = time.time()
47 |         self.total_width = 0
48 |         self.seen_so_far = 0
49 |         self.verbose = verbose
50 | 
51 |     def update(self, current, values=[], exact=[], strict=[]):
52 |         """
53 |         Updates the progress bar.
54 |         # Arguments
55 |             current: Index of current step.
56 |             values: List of tuples (name, value_for_last_step).
57 |                 The progress bar will display averages for these values.
58 |             exact: List of tuples (name, value_for_last_step).
59 |                 The progress bar will display these values directly.
60 |         """
61 | 
62 |         for k, v in values:
63 |             if k not in self.sum_values:
64 |                 self.sum_values[k] = [v * (current - self.seen_so_far),
65 |                                       current - self.seen_so_far]
66 |                 self.unique_values.append(k)
67 |             else:
68 |                 self.sum_values[k][0] += v * (current - self.seen_so_far)
69 |                 self.sum_values[k][1] += (current - self.seen_so_far)
70 |         for k, v in exact:
71 |             if k not in self.sum_values:
72 |                 self.unique_values.append(k)
73 |             self.sum_values[k] = [v, 1]
74 | 
75 |         for k, v in strict:
76 |             if k not in self.sum_values:
77 |                 self.unique_values.append(k)
78 |             self.sum_values[k] = v
79 | 
80 |         self.seen_so_far = current
81 | 
82 |         now = time.time()
83 |         if self.verbose == 1:
84 |             prev_total_width = self.total_width
85 |             sys.stdout.write("\b" * prev_total_width)
86 |             sys.stdout.write("\r")
87 | 
88 |             numdigits = int(np.floor(np.log10(self.target))) + 1
89 |             barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
90 |             bar = barstr % (current, self.target)
91 |             prog = float(current)/self.target
92 |             prog_width = int(self.width*prog)
93 |             if prog_width > 0:
94 |                 bar += ('='*(prog_width-1))
95 |                 if current < self.target:
96 |                     bar += '>'
97 |                 else:
98 |                     bar += '='
99 |             bar += ('.'*(self.width-prog_width))
100 |             bar += ']'
101 |             sys.stdout.write(bar)
102 |             self.total_width = len(bar)
103 | 
104 |             if current:
105 |                 time_per_unit = (now - self.start) / current
106 |             else:
107 |                 time_per_unit = 0
108 |             eta = time_per_unit*(self.target - current)
109 |             info = ''
110 |             if current < self.target:
111 |                 info += ' - ETA: %ds' % eta
112 |             else:
113 |                 info += ' - %ds' % (now - self.start)
114 |             for k in self.unique_values:
115 |                 if type(self.sum_values[k]) is list:
116 |                     info += ' - %s: %.4f' % (k,
117 |                         self.sum_values[k][0] / max(1, self.sum_values[k][1]))
118 |                 else:
119 |                     info += ' - %s: %s' % (k, self.sum_values[k])
120 | 
121 |             self.total_width += len(info)
122 |             if prev_total_width > self.total_width:
123 |                 info += ((prev_total_width-self.total_width) * " ")
124 | 
125 |             sys.stdout.write(info)
126 |             sys.stdout.flush()
127 | 
128 |             if current >= self.target:
129 |                 sys.stdout.write("\n")
130 | 
131 |         if self.verbose == 2:
132 |             if current >= self.target:
133 |                 info = '%ds' % (now - self.start)
134 |                 for k in self.unique_values:
135 |                     info += ' - %s: %.4f' % (k,
136 |                         self.sum_values[k][0] / max(1, self.sum_values[k][1]))
137 |                 sys.stdout.write(info + "\n")
138 | 
139 |     def add(self, n, values=[]):
140 |         self.update(self.seen_so_far+n, values)
141 | 
142 | 
--------------------------------------------------------------------------------
/model/ner_learner.py:
--------------------------------------------------------------------------------
1 | """ Works with pytorch 0.4.0 """
2 | 
3 | from .core import *
4 | from .data_utils import pad_sequences, minibatches, get_chunks
5 | from .crf import CRF
6 | from .general_utils import Progbar
7 | from torch.optim.lr_scheduler import StepLR
8 | 
9 | if os.name == "posix": from allennlp.modules.elmo import Elmo, batch_to_ids  # AllenNLP is currently only supported on linux
10 | 
11 | 
12 | class NERLearner(object):
13 |     """
14 |     NERLearner class that encapsulates a pytorch nn.Module model and ModelData class
15 |     Contains methods for training and testing the model
16 |     """
17 |     def __init__(self, config, model):
18 |         super().__init__()
19 |         self.config = config
20 |         self.logger = self.config.logger
21 |         self.model = model
22 |         self.model_path = config.dir_model
23 |         self.use_elmo = config.use_elmo
24 | 
25 | 
26 |         self.idx_to_tag = {idx: tag for tag, idx in
self.config.vocab_tags.items()} 28 | 29 | self.criterion = CRF(self.config.ntags) 30 | self.optimizer = optim.Adam(self.model.parameters()) 31 | 32 | if self.use_elmo: 33 | options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json" 34 | weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" 35 | self.elmo = Elmo(options_file, weight_file, 2, dropout=0) 36 | else: 37 | self.load_emb() 38 | 39 | if USE_GPU: 40 | self.use_cuda = True 41 | self.logger.info("GPU found.") 42 | self.model = model.cuda() 43 | self.criterion = self.criterion.cuda() 44 | if self.use_elmo: 45 | self.elmo = self.elmo.cuda() 46 | print("Moved elmo to cuda") 47 | else: 48 | self.model = model.cpu() 49 | self.use_cuda = False 50 | self.logger.info("No GPU found.") 51 | 52 | def get_model_path(self, name): 53 | return os.path.join(self.model_path,name)+'.h5' 54 | 55 | def get_layer_groups(self, do_fc=False): 56 | return children(self.model) 57 | 58 | def freeze_to(self, n): 59 | c=self.get_layer_groups() 60 | for l in c: 61 | set_trainable(l, False) 62 | for l in c[n:]: 63 | set_trainable(l, True) 64 | 65 | def unfreeze(self): 66 | self.freeze_to(0) 67 | 68 | def save(self, name=None): 69 | if not name: 70 | name = self.config.ner_model_path 71 | save_model(self.model, self.get_model_path(name)) 72 | self.logger.info(f"Saved model at {self.get_model_path(name)}") 73 | 74 | def load_emb(self): 75 | self.model.emb.weight = nn.Parameter(T(self.config.embeddings)) 76 | self.model.emb.weight.requires_grad = False 77 | self.logger.info('Loading pretrained word embeddings') 78 | 79 | def load(self, fn=None): 80 | if not fn: fn = self.config.ner_model_path 81 | fn = self.get_model_path(fn) 82 | load_ner_model(self.model, fn, strict=True) 83 | self.logger.info(f"Loaded model from {fn}") 84 | 85 | def batch_iter(self, train, batch_size, return_lengths=False, shuffle=False, sorter=False): 86 | """ 87 | Builds a generator from the given dataloader to be fed into the model 88 | 89 | Args: 90 | train: DataLoader 91 | batch_size: size of each batch 92 | return_lengths: if True, generator returns a list of sequence lengths for each 93 | sample in the batch 94 | ie. 
sequence_lengths = [8,7,4,3] 95 | shuffle: if True, shuffles the data for each epoch 96 | sorter: if True, uses a sorter to shuffle the data 97 | 98 | Returns: 99 | nbatches: (int) number of batches 100 | data_generator: batch generator yielding 101 | dict inputs:{'word_ids' : np.array([[padded word_ids in sent1], ...]) 102 | 'char_ids': np.array([[[padded char_ids in word1_sent1], ...], 103 | [padded char_ids in word1_sent2], ...], 104 | ...])} 105 | labels: np.array([[padded label_ids in sent1], ...]) 106 | sequence_lengths: list([len(sent1), len(sent2), ...]) 107 | 108 | """ 109 | nbatches = (len(train) + batch_size - 1) // batch_size 110 | 111 | def data_generator(): 112 | while True: 113 | if shuffle: train.shuffle() 114 | elif sorter==True and train.sorter: train.sort() 115 | 116 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 117 | 118 | # perform padding of the given data 119 | if self.config.use_chars: 120 | char_ids, word_ids = zip(*words) 121 | word_ids, sequence_lengths = pad_sequences(word_ids, 1) 122 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 123 | nlevels=2) 124 | 125 | else: 126 | word_ids, sequence_lengths = pad_sequences(words, 0) 127 | 128 | if self.use_elmo: 129 | word_ids = words 130 | 131 | if labels: 132 | labels, _ = pad_sequences(labels, 0) 133 | # if categorical 134 | ## labels = [to_categorical(label, num_classes=len(train.tag_itos)) for label in labels] 135 | 136 | # build dictionary 137 | inputs = { 138 | "word_ids": np.asarray(word_ids) 139 | } 140 | 141 | if self.config.use_chars: 142 | inputs["char_ids"] = np.asarray(char_ids) 143 | 144 | if return_lengths: 145 | yield(inputs, np.asarray(labels), sequence_lengths) 146 | 147 | else: 148 | yield (inputs, np.asarray(labels)) 149 | 150 | return (nbatches, data_generator()) 151 | 152 | 153 | def fine_tune(self, train, dev=None): 154 | """ 155 | Fine tune the NER model by freezing the pre-trained encoder and training the newly 156 | instantiated layers for 1 epochs 157 | """ 158 | self.logger.info("Fine Tuning Model") 159 | self.fit(train, dev, epochs=1, fine_tune=True) 160 | 161 | 162 | def fit(self, train, dev=None, epochs=None, fine_tune=False): 163 | """ 164 | Fits the model to the training dataset and evaluates on the validation set. 165 | Saves the model to disk 166 | """ 167 | if not epochs: 168 | epochs = self.config.nepochs 169 | batch_size = self.config.batch_size 170 | 171 | nbatches_train, train_generator = self.batch_iter(train, batch_size, 172 | return_lengths=True) 173 | if dev: 174 | nbatches_dev, dev_generator = self.batch_iter(dev, batch_size, 175 | return_lengths=True) 176 | 177 | scheduler = StepLR(self.optimizer, step_size=1, gamma=self.config.lr_decay) 178 | 179 | if not fine_tune: self.logger.info("Training Model") 180 | 181 | f1s = [] 182 | 183 | for epoch in range(epochs): 184 | scheduler.step() 185 | self.train(epoch, nbatches_train, train_generator, fine_tune=fine_tune) 186 | 187 | if dev: 188 | f1 = self.test(nbatches_dev, dev_generator, fine_tune=fine_tune) 189 | 190 | # Early stopping 191 | if len(f1s) > 0: 192 | if f1 < max(f1s[max(-self.config.nepoch_no_imprv, -len(f1s)):]): #if sum([f1 > f1s[max(-i, -len(f1s))] for i in range(1,self.config.nepoch_no_imprv+1)]) == 0: 193 | print("No improvement in the last 3 epochs. 
Stopping training") 194 | break 195 | else: 196 | f1s.append(f1) 197 | 198 | if fine_tune: 199 | self.save(self.config.ner_ft_path) 200 | else : 201 | self.save(self.config.ner_model_path) 202 | 203 | 204 | def train(self, epoch, nbatches_train, train_generator, fine_tune=False): 205 | self.logger.info('\nEpoch: %d' % epoch) 206 | self.model.train() 207 | if not self.use_elmo: self.model.emb.weight.requires_grad = False 208 | 209 | train_loss = 0 210 | correct = 0 211 | total = 0 212 | total_step = None 213 | 214 | prog = Progbar(target=nbatches_train) 215 | 216 | for batch_idx, (inputs, targets, sequence_lengths) in enumerate(train_generator): 217 | 218 | if batch_idx == nbatches_train: break 219 | if inputs['word_ids'].shape[0] == 1: 220 | self.logger.info('Skipping batch of size=1') 221 | continue 222 | 223 | total_step = batch_idx 224 | targets = T(targets, cuda=self.use_cuda).transpose(0,1).contiguous() 225 | self.optimizer.zero_grad() 226 | 227 | if self.use_elmo: 228 | sentences = inputs['word_ids'] 229 | character_ids = batch_to_ids(sentences) 230 | if self.use_cuda: 231 | character_ids = character_ids.cuda() 232 | embeddings = self.elmo(character_ids) 233 | word_input = embeddings['elmo_representations'][0] 234 | word_input, targets = Variable(word_input, requires_grad=False), \ 235 | Variable(targets) 236 | inputs = (word_input) 237 | 238 | else: 239 | word_input = T(inputs['word_ids'], cuda=self.use_cuda) 240 | char_input = T(inputs['char_ids'], cuda=self.use_cuda) 241 | word_input, char_input, targets = Variable(word_input, requires_grad=False), \ 242 | Variable(char_input, requires_grad=False),\ 243 | Variable(targets) 244 | inputs = (word_input, char_input) 245 | 246 | 247 | outputs = self.model(inputs) 248 | 249 | # Create mask 250 | if self.use_elmo: 251 | mask = Variable(embeddings['mask'].transpose(0,1)) 252 | if self.use_cuda: 253 | mask = mask.cuda() 254 | else: 255 | mask = create_mask(sequence_lengths, targets, cuda=self.use_cuda) 256 | 257 | # Get CRF Loss 258 | loss = -1*self.criterion(outputs, targets, mask=mask) 259 | loss.backward() 260 | self.optimizer.step() 261 | 262 | # Callbacks 263 | train_loss += loss.item() 264 | predictions = self.criterion.decode(outputs, mask=mask) 265 | masked_targets = mask_targets(targets, sequence_lengths) 266 | 267 | t_ = mask.type(torch.LongTensor).sum().item() 268 | total += t_ 269 | c_ = sum([1 if p[i] == mt[i] else 0 for p, mt in zip(predictions, masked_targets) for i in range(len(p))]) 270 | correct += c_ 271 | 272 | prog.update(batch_idx + 1, values=[("train loss", loss.item())], exact=[("Accuracy", 100*c_/t_)]) 273 | 274 | self.logger.info("Train Loss: %.3f, Train Accuracy: %.3f%% (%d/%d)" %(train_loss/(total_step+1), 100.*correct/total, correct, total) ) 275 | 276 | 277 | def test(self, nbatches_val, val_generator, fine_tune=False): 278 | self.model.eval() 279 | accs = [] 280 | test_loss = 0 281 | correct_preds = 0 282 | total_correct = 0 283 | total_preds = 0 284 | total_step = None 285 | 286 | for batch_idx, (inputs, targets, sequence_lengths) in enumerate(val_generator): 287 | if batch_idx == nbatches_val: break 288 | if inputs['word_ids'].shape[0] == 1: 289 | self.logger.info('Skipping batch of size=1') 290 | continue 291 | 292 | total_step = batch_idx 293 | targets = T(targets, cuda=self.use_cuda).transpose(0,1).contiguous() 294 | 295 | if self.use_elmo: 296 | sentences = inputs['word_ids'] 297 | character_ids = batch_to_ids(sentences) 298 | if self.use_cuda: 299 | character_ids = character_ids.cuda() 300 | embeddings 
= self.elmo(character_ids) 301 | word_input = embeddings['elmo_representations'][1] 302 | word_input, targets = Variable(word_input, requires_grad=False), \ 303 | Variable(targets) 304 | inputs = (word_input) 305 | 306 | else: 307 | word_input = T(inputs['word_ids'], cuda=self.use_cuda) 308 | char_input = T(inputs['char_ids'], cuda=self.use_cuda) 309 | word_input, char_input, targets = Variable(word_input, requires_grad=False), \ 310 | Variable(char_input, requires_grad=False),\ 311 | Variable(targets) 312 | inputs = (word_input, char_input) 313 | 314 | outputs = self.model(inputs) 315 | 316 | # Create mask 317 | if self.use_elmo: 318 | mask = Variable(embeddings['mask'].transpose(0,1)) 319 | if self.use_cuda: 320 | mask = mask.cuda() 321 | else: 322 | mask = create_mask(sequence_lengths, targets, cuda=self.use_cuda) 323 | 324 | # Get CRF Loss 325 | loss = -1*self.criterion(outputs, targets, mask=mask) 326 | 327 | # Callbacks 328 | test_loss += loss.item() 329 | predictions = self.criterion.decode(outputs, mask=mask) 330 | masked_targets = mask_targets(targets, sequence_lengths) 331 | 332 | for lab, lab_pred in zip(masked_targets, predictions): 333 | 334 | accs += [1 if a==b else 0 for (a, b) in zip(lab, lab_pred)] 335 | 336 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 337 | lab_pred_chunks = set(get_chunks(lab_pred, 338 | self.config.vocab_tags)) 339 | 340 | correct_preds += len(lab_chunks & lab_pred_chunks) 341 | total_preds += len(lab_pred_chunks) 342 | total_correct += len(lab_chunks) 343 | 344 | p = correct_preds / total_preds if correct_preds > 0 else 0 345 | r = correct_preds / total_correct if correct_preds > 0 else 0 346 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 347 | acc = np.mean(accs) 348 | 349 | self.logger.info("Val Loss : %.3f, Val Accuracy: %.3f%%, Val F1: %.3f%%" %(test_loss/(total_step+1), 100*acc, 100*f1)) 350 | return 100*f1 351 | 352 | def evaluate(self,test): 353 | batch_size = self.config.batch_size 354 | nbatches_test, test_generator = self.batch_iter(test, batch_size, 355 | return_lengths=True) 356 | self.logger.info('Evaluating on test set') 357 | self.test(nbatches_test, test_generator) 358 | 359 | def predict_batch(self, words): 360 | self.model.eval() 361 | if len(words) == 1: 362 | mult = np.ones(2).reshape(2, 1).astype(int) 363 | 364 | if self.use_elmo: 365 | sentences = words 366 | character_ids = batch_to_ids(sentences) 367 | if self.use_cuda: 368 | character_ids = character_ids.cuda() 369 | embeddings = self.elmo(character_ids) 370 | word_input = embeddings['elmo_representations'][1] 371 | word_input = Variable(word_input, requires_grad=False) 372 | 373 | if len(words) == 1: 374 | word_input = ((mult*word_input.transpose(0,1)).transpose(0,1).contiguous()).type(torch.FloatTensor) 375 | 376 | word_input = T(word_input, cuda=self.use_cuda) 377 | inputs = (word_input) 378 | 379 | else: 380 | #char_ids, word_ids = zip(*words) 381 | char_ids = [[c[0] for c in s] for s in words] 382 | word_ids = [[x[1] for x in s] for s in words] 383 | word_ids, sequence_lengths = pad_sequences(word_ids, 1) 384 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 385 | nlevels=2) 386 | word_ids = np.asarray(word_ids) 387 | char_ids = np.asarray(char_ids) 388 | 389 | if len(words) == 1: 390 | word_ids = mult*word_ids 391 | char_ids = (mult*char_ids.transpose(1,0,2)).transpose(1,0,2) 392 | word_input = T(word_ids, cuda=self.use_cuda) 393 | char_input = T(char_ids, cuda=self.use_cuda) 394 | 395 | word_input, char_input = Variable(word_input, 
requires_grad=False), \
396 |                                  Variable(char_input, requires_grad=False)
397 | 
398 |             inputs = (word_input, char_input)
399 | 
400 | 
401 |         outputs = self.model(inputs)
402 | 
403 |         predictions = self.criterion.decode(outputs)
404 |         sequence_lengths = [len(s) for s in words]  # fix: previously only defined in the non-ELMo branch
405 |         predictions = [p[:i] for p, i in zip(predictions, sequence_lengths)]
406 | 
407 |         return predictions
408 | 
409 |     def predict(self, sentences):
410 |         """Returns list of tags
411 | 
412 |         Args:
413 |             sentences: a raw string; it is split into sentences and tokenized with spacy
414 | 
415 |         Returns:
416 |             preds: one list of tags (strings) per sentence, one tag per word
417 | 
418 |         """
419 |         nlp = spacy.load('en')
420 |         doc = nlp(sentences)
421 |         words_raw = [[token.text for token in sent] for sent in doc.sents]
422 |         if self.use_elmo:
423 |             words = words_raw
424 |         else:
425 |             words = [[self.config.processing_word(w) for w in s] for s in words_raw]
426 | 
427 |         pred_ids = self.predict_batch(words)
428 |         preds = [[self.idx_to_tag[idx.item() if isinstance(idx, torch.Tensor) else idx] for idx in s] for s in pred_ids]
429 | 
430 |         return preds
431 | 
432 | 
433 | def create_mask(sequence_lengths, targets, cuda, batch_first=False):
434 |     """ Creates binary mask """
435 |     mask = Variable(torch.ones(targets.size()).type(torch.ByteTensor))
436 |     if cuda: mask = mask.cuda()
437 | 
438 |     for i,l in enumerate(sequence_lengths):
439 |         if batch_first:
440 |             if l < targets.size(1):
441 |                 mask.data[i, l:] = 0
442 |         else:
443 |             if l < targets.size(0):
444 |                 mask.data[l:, i] = 0
445 | 
446 |     return mask
447 | 
448 | 
449 | def mask_targets(targets, sequence_lengths, batch_first=False):
450 |     """ Masks the targets """
451 |     if not batch_first:
452 |         targets = targets.transpose(0,1)
453 |     t = []
454 |     for l, p in zip(targets,sequence_lengths):
455 |         t.append(l[:p].data.tolist())
456 |     return t
457 | 
--------------------------------------------------------------------------------
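Tagging raw text end to end, mirroring test.py — a sketch that assumes a trained checkpoint already exists at config.ner_model_path and that spacy's English model is installed (predict uses it for sentence splitting):

```python
from model.config import Config
from model.ner_model import NERModel
from model.ner_learner import NERLearner

config = Config()
if config.use_elmo: config.processing_word = None

model = NERModel(config)
learn = NERLearner(config, model)
learn.load()  # loads the checkpoint saved by train.py (results/test/saves/ner_15e_glove.h5 by default)

print(learn.predict("Peter Johnson lives in Los Angeles"))
# e.g. [['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC']]  (actual tags depend on training)
```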
/model/ner_model.py:
--------------------------------------------------------------------------------
1 | #from fastai.text import *
2 | from .core import *
3 | 
4 | class NERModel(nn.Module):
5 | 
6 |     def __init__(self, config):
7 |         super().__init__()
8 |         self.config = config
9 |         self.use_elmo = config.use_elmo
10 | 
11 |         if not self.use_elmo:
12 |             self.emb = nn.Embedding(self.config.nwords, self.config.dim_word, padding_idx=0)
13 |             self.char_embeddings = nn.Embedding(self.config.nchars, self.config.dim_char, padding_idx=0)
14 |             self.char_lstm = nn.LSTM(self.config.dim_char, self.config.hidden_size_char, bidirectional=True)
15 | 
16 |         self.dropout = nn.Dropout(p=self.config.dropout)
17 |         self.word_lstm = nn.LSTM(self.config.dim_elmo if self.use_elmo else self.config.dim_word+2*self.config.hidden_size_char,
18 |                                  self.config.hidden_size_lstm, bidirectional=True)
19 | 
20 |         self.linear = LinearClassifier(self.config, layers=[self.config.hidden_size_lstm*2, self.config.ntags], drops=[0.5])
21 | 
22 | 
23 |     def forward(self, input):
24 |         # word_ids dim = (batch_size x sent_length)
25 |         # char_ids dim = (batch_size x sent_length x word_length)
26 | 
27 |         if self.use_elmo:
28 |             word_emb = self.dropout(input.transpose(0,1))
29 | 
30 |         else:
31 |             word_input, char_input = input[0], input[1]
32 |             word_input.transpose_(0,1)
33 | 
34 |             # Word Embedding
35 |             word_emb = self.emb(word_input)  # shape = S*B*dim_word
36 | 
37 |             # Char LSTM
38 |             char_emb = self.char_embeddings(char_input.view(-1, char_input.size(2)))  # https://stackoverflow.com/questions/47205762/embedding-3d-data-in-pytorch
39 |             char_emb = char_emb.view(*char_input.size(), -1)  # dim = B*S*W*dim_char
40 | 
41 |             _, (h, c) = self.char_lstm(char_emb.view(-1, char_emb.size(2), char_emb.size(3)).transpose(0,1))  # h: (num_layers * num_directions, B*S, hidden_size_char)
42 |             char_output = torch.cat((h[0], h[1]), 1)  # shape = (B*S)*(2*hidden_size_char)
43 |             char_output = char_output.view(char_emb.size(0), char_emb.size(1), -1).transpose(0,1)  # shape = S*B*(2*hidden_size_char)
44 | 
45 |             # Concat char output and word output
46 |             word_emb = torch.cat((word_emb, char_output), 2)  # shape = S*B*(dim_word+2*hidden_size_char)
47 |             word_emb = self.dropout(word_emb)
48 | 
49 |         output, (h, c) = self.word_lstm(word_emb)  # shape = S*B*(2*hidden_size_lstm)
50 |         output = self.dropout(output)
51 | 
52 |         output = self.linear(output)
53 |         return output  # shape = S*B*ntags
54 | 
55 | class LinearBlock(nn.Module):
56 |     def __init__(self, ni, nf, drop):
57 |         super().__init__()
58 |         self.lin = nn.Linear(ni, nf)
59 |         self.drop = nn.Dropout(drop)
60 |         self.bn = nn.BatchNorm1d(ni)
61 | 
62 |     def forward(self, x):
63 |         return self.lin(self.drop(self.bn(x)))
64 | 
65 | 
66 | class LinearClassifier(nn.Module):
67 |     def __init__(self, config, layers, drops):
68 |         self.config = config
69 |         super().__init__()
70 |         self.layers = nn.ModuleList([
71 |             LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)])
72 | 
73 |     def forward(self, input):
74 |         output = input
75 |         sl,bs,_ = output.size()
76 |         x = output.view(-1, 2*self.config.hidden_size_lstm)
77 | 
78 |         for l in self.layers:
79 |             l_x = l(x)
80 |             x = F.relu(l_x)
81 |         return l_x.view(sl, bs, self.config.ntags)
82 | 
--------------------------------------------------------------------------------
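The tensor shapes implied by the comments in forward, as a sketch for the GloVe-plus-chars configuration (use_elmo=False); Config() needs the built data files, and the ids here are dummies:

```python
import torch
from torch.autograd import Variable
from model.config import Config
from model.ner_model import NERModel

config = Config()
model = NERModel(config)

batch_size, sent_len, word_len = 4, 9, 12
word_ids = Variable(torch.ones(batch_size, sent_len).long())
char_ids = Variable(torch.ones(batch_size, sent_len, word_len).long())

out = model((word_ids, char_ids))
print(out.size())  # -> (sent_len, batch_size, config.ntags)
```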
/test.py:
--------------------------------------------------------------------------------
1 | """ Command Line Usage
2 | Args:
3 |     eval: Evaluate F1 Score and Accuracy on test set
4 |     pred: Predict sentence.
5 |         (optional): Sentence to predict on. If none given, predicts on "Peter Johnson lives in Los Angeles"
6 | 
7 | Example:
8 |     > python test.py eval pred "Obama is from Hawaii"
9 | """
10 | 
11 | from model.data_utils import CoNLLDataset
12 | from model.config import Config
13 | from model.ner_model import NERModel
14 | from model.ner_learner import NERLearner
15 | import sys
16 | 
17 | 
18 | def main():
19 |     # create instance of config
20 |     config = Config()
21 |     if config.use_elmo: config.processing_word = None
22 | 
23 |     # build model
24 |     model = NERModel(config)
25 | 
26 |     learn = NERLearner(config, model)
27 |     learn.load()
28 | 
29 |     if len(sys.argv) == 1:
30 |         print("No arguments given. Running full test")
31 |         sys.argv.append("eval")
32 |         sys.argv.append("pred")
33 | 
34 |     if sys.argv[1] == "eval":
35 |         # create datasets
36 |         test = CoNLLDataset(config.filename_test, config.processing_word,
37 |                             config.processing_tag, config.max_iter)
38 |         learn.evaluate(test)
39 | 
40 |     if "pred" in sys.argv:  # avoids an IndexError when only "eval" is passed
41 |         try:
42 |             sent = sys.argv[sys.argv.index("pred") + 1]
43 |         except IndexError:
44 |             sent = "Peter Johnson lives in Los Angeles"  # predict() expects a raw string
45 | 
46 |         print("Predicting sentence: ", sent)
47 |         pred = learn.predict(sent)
48 |         print(pred)
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     main()
53 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | from model.data_utils import CoNLLDataset
2 | from model.config import Config
3 | from model.ner_model import NERModel
4 | from model.ner_learner import NERLearner
5 | 
6 | 
7 | def main():
8 |     # create instance of config
9 |     config = Config()
10 |     if config.use_elmo: config.processing_word = None
11 | 
12 |     # build model
13 |     model = NERModel(config)
14 | 
15 |     # create datasets
16 |     dev = CoNLLDataset(config.filename_dev, config.processing_word,
17 |                        config.processing_tag, config.max_iter, config.use_crf)
18 |     train = CoNLLDataset(config.filename_train, config.processing_word,
19 |                          config.processing_tag, config.max_iter, config.use_crf)
20 | 
21 |     learn = NERLearner(config, model)
22 |     learn.fit(train, dev)
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     main()
27 | 
--------------------------------------------------------------------------------
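Putting the three entry points together — the whole pipeline from the README, run in order (a sketch; it assumes data/ and the GloVe file are already in place):

```python
import subprocess
import sys

# build vocabs/embeddings, then train, then evaluate + predict
for script in ("build_data.py", "train.py", "test.py"):
    subprocess.run([sys.executable, script], check=True)
```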