├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── build_data.py
├── model
│   ├── config.py
│   ├── core.py
│   ├── crf.py
│   ├── data_utils.py
│   ├── general_utils.py
│   ├── ner_learner.py
│   └── ner_model.py
├── test.py
└── train.py

/.gitignore:
--------------------------------------------------------------------------------
1 | /data/
2 | 
3 | /saves/
4 | 
5 | /.idea/
6 | 
7 | /.vscode/
8 | 
9 | /results/
10 | /model/others/
11 | /model/backup/
12 | *sh
13 | /model/ent_learner.py
14 | /model/ent_model.py
15 | 
16 | *.pyc
17 | 
18 | example.py
19 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 yongyuwen
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch-Elmo-BiLSTMCRF
2 | 
3 | PyTorch implementation of the BiLSTM-CRF model described in https://guillaumegenthial.github.io/.
4 | 
5 | This implementation builds on that model by adding ELMo embeddings as a feature-representation option.
6 | (For more detail about ELMo, please see the publication ["Deep contextualized word representations"](http://arxiv.org/abs/1802.05365).)
7 | 
8 | For the Keras implementation (without ELMo) please refer to this [link](https://github.com/yongyuwen/sequence-tagging-ner).
9 | 
10 | ## Usage
11 | 1. **Requirements**:
12 |     a. Packages: Anaconda, PyTorch, AllenNLP (if on Linux and using ELMo)
13 |     b. Data: Train, valid and test datasets in CoNLL 2003 NER format (a sample is shown after this README)
14 |     c. GloVe 300-dimensional embeddings (glove.6B; not needed if using ELMo)
15 | 
16 | 2. **Configure Settings**:
17 |     a. Change settings in model/config.py
18 |     b. Main settings to change: file directories, model hyperparameters, etc.
19 | 
20 | 3. **Build Data**:
21 |     a. Run build_data.py
22 |         i. Builds the trimmed embedding matrix and text files of words, chars and tags, as well as the idx-to-word and idx-to-char mappings for the model to read
23 | 
24 | 4. **Train Model**:
25 |     a. Run train.py
26 | 
27 | 5. **Test Model**:
28 |     a. Run test.py
29 |     b. Evaluates on the test set. Also accepts an optional argument to predict on a custom string
30 | 
--------------------------------------------------------------------------------
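Regarding requirement 1b above: the loader in model/data_utils.py splits each line on spaces and keeps only the first column (the token) and the last column (the NER tag); blank lines separate sentences and `-DOCSTART-` lines are skipped. So both full four-column CoNLL 2003 files and minimal two-column files work. A small illustrative sample:

```
-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN B-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER
```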
/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/build_data.py:
--------------------------------------------------------------------------------
1 | from model.config import Config
2 | from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
3 |     get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
4 |     export_trimmed_glove_vectors, get_processing_word
5 | 
6 | 
7 | def main():
8 |     """Procedure to build data
9 | 
10 |     You MUST RUN this procedure. It iterates over the whole dataset (train,
11 |     dev and test) and extracts the vocabularies in terms of words, tags, and
12 |     characters. Having built the vocabularies it writes them to a file. The
13 |     writing of a vocabulary to a file assigns an id (the line #) to each word.
14 |     It then extracts the relevant GloVe vectors and stores them in a np array
15 |     such that the i-th entry corresponds to the i-th word in the vocabulary.
16 | 
17 | 
18 |     Note:
19 |         All settings (file paths, dimensions) are read from Config in model/config.py.
20 | 
21 |     """
22 |     # 1. get config and processing of words
23 |     config = Config(load=False)
24 | 
25 |     # 2. get the word-processing function
26 |     processing_word = get_processing_word(lowercase=True)
27 | 
28 |     # 3. Generators
29 |     dev = CoNLLDataset(config.filename_dev, processing_word)
30 |     test = CoNLLDataset(config.filename_test, processing_word)
31 |     train = CoNLLDataset(config.filename_train, processing_word)
32 | 
33 | 
34 |     # 4. Build Word and Tag vocab
35 |     vocab_words, vocab_tags = get_vocabs([train, dev, test])
36 |     vocab_glove = get_glove_vocab(config.filename_glove)
37 | 
38 |     # 5. Get a vocab set for words in both vocab_words and vocab_glove
39 |     vocab = vocab_words & vocab_glove
40 |     vocab.add(UNK)
41 |     vocab.add(NUM)
42 | 
43 |     # 6. Save vocab
44 |     write_vocab(vocab, config.filename_words)
45 |     write_vocab(vocab_tags, config.filename_tags)
46 | 
47 |     # 7. Trim GloVe Vectors
48 |     vocab = load_vocab(config.filename_words)
49 |     export_trimmed_glove_vectors(vocab, config.filename_glove,
50 |                                  config.filename_trimmed, config.dim_word)
51 | 
52 |     # 8. Build and save char vocab
53 |     train = CoNLLDataset(config.filename_train)
54 |     vocab_chars = get_char_vocab(train)
55 |     write_vocab(vocab_chars, config.filename_chars)
56 | 
57 | 
58 | if __name__ == "__main__":
59 |     main()
60 | 
--------------------------------------------------------------------------------
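After this script runs, the vocab files and the trimmed embedding matrix exist on disk. A minimal sanity check of those artifacts (a sketch; the paths are the defaults from model/config.py):

```python
from model.data_utils import load_vocab, get_trimmed_glove_vectors

vocab = load_vocab("data/words.txt")   # dict: word -> id (the word's line number)
tags = load_vocab("data/tags.txt")

embeddings = get_trimmed_glove_vectors("data/glove.6B.300d.trimmed.npz")
assert embeddings.shape == (len(vocab), 300)  # row i holds the GloVe vector of word id i
print(len(vocab), "words,", len(tags), "tags")
```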
/model/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | 
4 | from .general_utils import get_logger
5 | from .data_utils import get_trimmed_glove_vectors, load_vocab, \
6 |     get_processing_word
7 | 
8 | 
9 | class Config():
10 |     def __init__(self, load=True):
11 |         """Initialize hyperparameters and load vocabs
12 | 
13 |         Args:
14 |             load: (bool) if True, load the vocabs and pre-trained embeddings
15 |                 built by build_data.py; if False, skip loading
16 | 
17 |         """
18 |         # directory for training outputs
19 |         if not os.path.exists(self.dir_output):
20 |             os.makedirs(self.dir_output)
21 | 
22 |         # create instance of logger
23 |         self.logger = get_logger(self.path_log)
24 | 
25 |         # load if requested (default)
26 |         if load:
27 |             self.load()
28 | 
29 |     def load(self):
30 |         """Loads vocabulary, processing functions and embeddings
31 | 
32 |         Supposes that build_data.py has been run successfully and that
33 |         the corresponding files have been created (vocab and trimmed GloVe
34 |         vectors)
35 | 
36 |         """
37 |         # 1. vocabulary
38 |         self.vocab_words = load_vocab(self.filename_words)
39 |         self.vocab_tags = load_vocab(self.filename_tags)
40 |         self.vocab_chars = load_vocab(self.filename_chars)
41 | 
42 |         self.nwords = len(self.vocab_words)
43 |         self.nchars = len(self.vocab_chars)
44 |         self.ntags = len(self.vocab_tags)
45 | 
46 |         # 2. get processing functions that map str -> id
47 |         self.processing_word = get_processing_word(self.vocab_words,
48 |                 self.vocab_chars, lowercase=True, chars=self.use_chars)
49 |         self.processing_tag = get_processing_word(self.vocab_tags,
50 |                 lowercase=False, allow_unk=False)
51 | 
52 |         # 3. get pre-trained embeddings
53 |         self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
54 |                 if self.use_pretrained else None)
55 | 
56 | 
57 |     # general config
58 |     dir_output = "results/test/"
59 |     dir_model = dir_output
60 |     path_log = dir_output + "log.txt"
61 | 
62 |     # embeddings
63 |     dim_word = 300
64 |     dim_char = 100
65 | 
66 |     # glove files
67 |     filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(dim_word)
68 |     # trimmed embeddings (created from glove_filename with build_data.py)
69 |     filename_trimmed = "data/glove.6B.{}d.trimmed.npz".format(dim_word)
70 |     use_pretrained = True
71 | 
72 |     # dataset
73 |     # filename_dev = "data/coNLL/eng/eng.testa.iob"
74 |     # filename_test = "data/coNLL/eng/eng.testb.iob"
75 |     # filename_train = "data/coNLL/eng/eng.train.iob"
76 | 
77 |     #filename_dev = filename_test = filename_train = "data/test.txt" # test
78 | 
79 |     filename_dev = "data/valid.txt"
80 |     filename_test = "data/test.txt"
81 |     filename_train = "data/train.txt"
82 | 
83 |     max_iter = None  # if not None, max number of examples in Dataset
84 | 
85 |     # vocab (created from dataset with build_data.py)
86 |     filename_words = "data/words.txt"
87 |     filename_tags = "data/tags.txt"
88 |     filename_chars = "data/chars.txt"
89 | 
90 |     # training
91 |     train_embeddings = False
92 |     nepochs = 15
93 |     dropout = 0.5
94 |     batch_size = 5
95 |     lr_method = "adam"
96 |     lr = 0.001
97 |     lr_decay = 0.9
98 |     epoch_drop = 1  # Step Decay: per # epochs to apply lr_decay
99 |     clip = -1  # if negative, no clipping
100 |     nepoch_no_imprv = 3
101 | 
102 |     # model hyperparameters
103 |     hidden_size_char = 100  # lstm on chars
104 |     hidden_size_lstm = 300  # lstm on word embeddings
105 | 
106 |     ner_model_path = "saves/ner_{}e_glove".format(nepochs)
107 | 
108 |     # elmo config
109 |     use_elmo = False
110 |     dim_elmo = 1024
111 | 
112 |     # NOTE: if both chars and crf, only 1.6x slower on GPU
113 |     use_crf = True  # if crf, training is 1.7x slower on CPU
114 |     use_chars = False if use_elmo else True  # if char embedding, training is 3.5x slower on CPU
115 | 
--------------------------------------------------------------------------------
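A minimal sketch of how Config is meant to be consumed (it assumes build_data.py has already been run, since load=True reads the vocab and embedding files listed above):

```python
from model.config import Config

config = Config()  # load=True by default: reads vocabs and the trimmed GloVe matrix

# processing_word maps str -> ids; with use_chars=True it returns
# (list of char ids, word id), otherwise just the word id
print(config.nwords, config.nchars, config.ntags)
print(config.processing_word("Paris"))
```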
/model/core.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn, optim
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import numpy as np
6 | import os
7 | import spacy
8 | 
9 | USE_GPU = torch.cuda.is_available()
10 | 
11 | def to_gpu(x, *args, **kwargs):
12 |     '''Puts a pytorch variable on the GPU, if CUDA is available and USE_GPU is set to true.'''
13 |     return x.cuda(*args, **kwargs) if USE_GPU else x
14 | 
15 | def children(m): return m if isinstance(m, (list, tuple)) else list(m.children())
16 | 
17 | def set_trainable_attr(m, b):
18 |     m.trainable = b
19 |     for p in m.parameters(): p.requires_grad = b
20 | 
21 | def apply_leaf(m, f):
22 |     c = children(m)
23 |     if isinstance(m, nn.Module): f(m)
24 |     if len(c) > 0:
25 |         for l in c: apply_leaf(l, f)
26 | 
27 | def set_trainable(l, b):
28 |     apply_leaf(l, lambda m: set_trainable_attr(m, b))
29 | 
30 | def save_model(m, p): torch.save(m.state_dict(), p)
31 | 
32 | def T(a, half=False, cuda=True):
33 |     """
34 |     Convert a numpy array into a pytorch tensor.
35 |     If CUDA is available and USE_GPU=True, store the resulting tensor on the GPU.
36 |     """
37 |     if not torch.is_tensor(a):
38 |         a = np.array(np.ascontiguousarray(a))
39 |         if a.dtype in (np.int8, np.int16, np.int32, np.int64):
40 |             a = torch.LongTensor(a.astype(np.int64))
41 |         elif a.dtype in (np.float32, np.float64):
42 |             a = torch.cuda.HalfTensor(a) if half else torch.FloatTensor(a)
43 |         else: raise NotImplementedError(a.dtype)
44 |     if cuda: a = to_gpu(a, non_blocking=True)  # was `async=True`; `async` is a reserved word from Python 3.7
45 |     return a
46 | 
47 | def load_ner_model(m, p, strict=True):
48 |     sd = torch.load(p, map_location=lambda storage, loc: storage)
49 |     names = set(m.state_dict().keys())
50 |     for n in list(sd.keys()):  # list "detaches" the iterator
51 |         if n not in names and n+'_raw' in names:
52 |             if n+'_raw' not in sd: sd[n+'_raw'] = sd[n]
53 |             del sd[n]
54 |     m.load_state_dict(sd, strict=strict)
55 | 
--------------------------------------------------------------------------------
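A small sketch of the two helpers used throughout the training code: T converts numpy arrays to LongTensor or FloatTensor by dtype, moving the result to the GPU only when one is available:

```python
import numpy as np
from model.core import T, USE_GPU

word_ids = T(np.array([[1, 2, 3], [4, 5, 0]]))       # int64 -> LongTensor
scores = T(np.random.rand(2, 3).astype(np.float32))  # float32 -> FloatTensor
print(USE_GPU, word_ids.type(), scores.type())
```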
/model/crf.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union
2 | 
3 | from torch.autograd import Variable
4 | import torch
5 | import torch.nn as nn
6 | 
7 | 
8 | class CRF(nn.Module):
9 |     """Conditional random field.
10 |     This module implements a conditional random field [LMP]. The forward computation
11 |     of this class computes the log likelihood of the given sequence of tags and
12 |     emission score tensor. This class also has a ``decode`` method which finds the
13 |     best tag sequence given an emission score tensor using the `Viterbi algorithm`_.
14 |     Arguments
15 |     ---------
16 |     num_tags : int
17 |         Number of tags.
18 |     Attributes
19 |     ----------
20 |     num_tags : int
21 |         Number of tags passed to ``__init__``.
22 |     start_transitions : :class:`~torch.nn.Parameter`
23 |         Start transition score tensor of size ``(num_tags,)``.
24 |     end_transitions : :class:`~torch.nn.Parameter`
25 |         End transition score tensor of size ``(num_tags,)``.
26 |     transitions : :class:`~torch.nn.Parameter`
27 |         Transition score tensor of size ``(num_tags, num_tags)``.
28 |     References
29 |     ----------
30 |     .. [LMP] Lafferty, J., McCallum, A., Pereira, F. (2001).
31 |        "Conditional random fields: Probabilistic models for segmenting and
32 |        labeling sequence data". *Proc. 18th International Conf. on Machine
33 |        Learning*. Morgan Kaufmann. pp. 282–289.
34 |     .. _Viterbi algorithm: https://en.wikipedia.org/wiki/Viterbi_algorithm
35 |     """
36 |     def __init__(self, num_tags: int) -> None:
37 |         if num_tags <= 0:
38 |             raise ValueError(f'invalid number of tags: {num_tags}')
39 |         super().__init__()
40 |         self.num_tags = num_tags
41 |         self.start_transitions = nn.Parameter(torch.Tensor(num_tags))
42 |         self.end_transitions = nn.Parameter(torch.Tensor(num_tags))
43 |         self.transitions = nn.Parameter(torch.Tensor(num_tags, num_tags))
44 | 
45 |         self.reset_parameters()
46 | 
47 |     def reset_parameters(self) -> None:
48 |         """Initialize the transition parameters.
49 |         The parameters will be initialized randomly from a uniform distribution
50 |         between -0.1 and 0.1.
51 |         """
52 |         nn.init.uniform_(self.start_transitions, -0.1, 0.1)
53 |         nn.init.uniform_(self.end_transitions, -0.1, 0.1)
54 |         nn.init.uniform_(self.transitions, -0.1, 0.1)
55 | 
56 |     def __repr__(self) -> str:
57 |         return f'{self.__class__.__name__}(num_tags={self.num_tags})'
58 | 
59 |     def forward(self,
60 |                 emissions: Variable,
61 |                 tags: Variable,
62 |                 mask: Optional[Variable] = None,
63 |                 reduce: bool = True,
64 |                 ) -> Variable:
65 |         """Compute the log likelihood of the given sequence of tags and emission score.
66 | Arguments 67 | --------- 68 | emissions : :class:`~torch.autograd.Variable` 69 | Emission score tensor of size ``(seq_length, batch_size, num_tags)``. 70 | tags : :class:`~torch.autograd.Variable` 71 | Sequence of tags as ``LongTensor`` of size ``(seq_length, batch_size)``. 72 | mask : :class:`~torch.autograd.Variable`, optional 73 | Mask tensor as ``ByteTensor`` of size ``(seq_length, batch_size)``. 74 | reduce : bool 75 | Whether to sum the log likelihood over the batch. 76 | Returns 77 | ------- 78 | :class:`~torch.autograd.Variable` 79 | The log likelihood. This will have size (1,) if ``reduce=True``, ``(batch_size,)`` 80 | otherwise. 81 | """ 82 | if emissions.dim() != 3: 83 | raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') 84 | if tags.dim() != 2: 85 | raise ValueError(f'tags must have dimension of 2, got {tags.dim()}') 86 | if emissions.size()[:2] != tags.size(): 87 | raise ValueError( 88 | 'the first two dimensions of emissions and tags must match, ' 89 | f'got {tuple(emissions.size()[:2])} and {tuple(tags.size())}' 90 | ) 91 | if emissions.size(2) != self.num_tags: 92 | raise ValueError( 93 | f'expected last dimension of emissions is {self.num_tags}, ' 94 | f'got {emissions.size(2)}' 95 | ) 96 | if mask is not None: 97 | if tags.size() != mask.size(): 98 | raise ValueError( 99 | f'size of tags and mask must match, got {tuple(tags.size())} ' 100 | f'and {tuple(mask.size())}' 101 | ) 102 | if not all(mask[0].data): 103 | raise ValueError('mask of the first timestep must all be on') 104 | 105 | if mask is None: 106 | mask = Variable(self._new(tags.size()).fill_(1)).byte() 107 | 108 | numerator = self._compute_joint_llh(emissions, tags, mask) 109 | denominator = self._compute_log_partition_function(emissions, mask) 110 | llh = numerator - denominator 111 | return llh if not reduce else torch.sum(llh) 112 | 113 | def decode(self, 114 | emissions: Union[Variable, torch.FloatTensor], 115 | mask: Optional[Union[Variable, torch.ByteTensor]] = None) -> List[List[int]]: 116 | """Find the most likely tag sequence using Viterbi algorithm. 117 | Arguments 118 | --------- 119 | emissions : :class:`~torch.autograd.Variable` or :class:`~torch.FloatTensor` 120 | Emission score tensor of size ``(seq_length, batch_size, num_tags)``. 121 | mask : :class:`~torch.autograd.Variable` or :class:`torch.ByteTensor` 122 | Mask tensor of size ``(seq_length, batch_size)``. 123 | Returns 124 | ------- 125 | list 126 | List of list containing the best tag sequence for each batch. 
127 | """ 128 | if emissions.dim() != 3: 129 | raise ValueError(f'emissions must have dimension of 3, got {emissions.dim()}') 130 | if emissions.size(2) != self.num_tags: 131 | raise ValueError( 132 | f'expected last dimension of emissions is {self.num_tags}, ' 133 | f'got {emissions.size(2)}' 134 | ) 135 | if mask is not None and emissions.size()[:2] != mask.size(): 136 | raise ValueError( 137 | 'the first two dimensions of emissions and mask must match, ' 138 | f'got {tuple(emissions.size()[:2])} and {tuple(mask.size())}' 139 | ) 140 | 141 | if isinstance(emissions, Variable): 142 | emissions = emissions.data 143 | if mask is None: 144 | mask = self._new(emissions.size()[:2]).fill_(1).byte() 145 | elif isinstance(mask, Variable): 146 | mask = mask.data 147 | 148 | return self._viterbi_decode(emissions, mask) 149 | 150 | def _compute_joint_llh(self, 151 | emissions: Variable, 152 | tags: Variable, 153 | mask: Variable) -> Variable: 154 | # emissions: (seq_length, batch_size, num_tags) 155 | # tags: (seq_length, batch_size) 156 | # mask: (seq_length, batch_size) 157 | assert emissions.dim() == 3 and tags.dim() == 2 158 | assert emissions.size()[:2] == tags.size() 159 | assert emissions.size(2) == self.num_tags 160 | assert mask.size() == tags.size() 161 | assert all(mask[0].data) 162 | 163 | seq_length = emissions.size(0) 164 | mask = mask.float() 165 | 166 | # Start transition score 167 | llh = self.start_transitions[tags[0]] # (batch_size,) 168 | 169 | for i in range(seq_length - 1): 170 | cur_tag, next_tag = tags[i], tags[i+1] 171 | # Emission score for current tag 172 | llh += emissions[i].gather(1, cur_tag.view(-1, 1)).squeeze(1) * mask[i] 173 | # Transition score to next tag 174 | transition_score = self.transitions[cur_tag, next_tag] 175 | # Only add transition score if the next tag is not masked (mask == 1) 176 | llh += transition_score * mask[i+1] 177 | 178 | # Find last tag index 179 | last_tag_indices = mask.long().sum(0) - 1 # (batch_size,) 180 | last_tags = tags.gather(0, last_tag_indices.view(1, -1)).squeeze(0) 181 | 182 | # End transition score 183 | llh += self.end_transitions[last_tags] 184 | # Emission score for the last tag, if mask is valid (mask == 1) 185 | llh += emissions[-1].gather(1, last_tags.view(-1, 1)).squeeze(1) * mask[-1] 186 | 187 | return llh 188 | 189 | def _compute_log_partition_function(self, 190 | emissions: Variable, 191 | mask: Variable) -> Variable: 192 | # emissions: (seq_length, batch_size, num_tags) 193 | # mask: (seq_length, batch_size) 194 | assert emissions.dim() == 3 and mask.dim() == 2 195 | assert emissions.size()[:2] == mask.size() 196 | assert emissions.size(2) == self.num_tags 197 | assert all(mask[0].data) 198 | 199 | seq_length = emissions.size(0) 200 | mask = mask.float() 201 | 202 | # Start transition score and first emission 203 | log_prob = self.start_transitions.view(1, -1) + emissions[0] 204 | # Here, log_prob has size (batch_size, num_tags) where for each batch, 205 | # the j-th column stores the log probability that the current timestep has tag j 206 | 207 | for i in range(1, seq_length): 208 | # Broadcast log_prob over all possible next tags 209 | broadcast_log_prob = log_prob.unsqueeze(2) # (batch_size, num_tags, 1) 210 | # Broadcast transition score over all instances in the batch 211 | broadcast_transitions = self.transitions.unsqueeze(0) # (1, num_tags, num_tags) 212 | # Broadcast emission score over all possible current tags 213 | broadcast_emissions = emissions[i].unsqueeze(1) # (batch_size, 1, num_tags) 214 | # Sum 
current log probability, transition, and emission scores 215 | score = broadcast_log_prob + broadcast_transitions \ 216 | + broadcast_emissions # (batch_size, num_tags, num_tags) 217 | # Sum over all possible current tags, but we're in log prob space, so a sum 218 | # becomes a log-sum-exp 219 | score = self._log_sum_exp(score, 1) # (batch_size, num_tags) 220 | # Set log_prob to the score if this timestep is valid (mask == 1), otherwise 221 | # leave it alone 222 | log_prob = score * mask[i].unsqueeze(1) + log_prob * (1.-mask[i]).unsqueeze(1) 223 | 224 | # End transition score 225 | log_prob += self.end_transitions.view(1, -1) 226 | # Sum (log-sum-exp) over all possible tags 227 | return self._log_sum_exp(log_prob, 1) # (batch_size,) 228 | 229 | def _viterbi_decode(self, emissions: torch.FloatTensor, mask: torch.ByteTensor) \ 230 | -> List[List[int]]: 231 | # Get input sizes 232 | seq_length = emissions.size(0) 233 | batch_size = emissions.size(1) 234 | sequence_lengths = mask.long().sum(dim=0) 235 | 236 | # emissions: (seq_length, batch_size, num_tags) 237 | assert emissions.size(2) == self.num_tags 238 | 239 | # list to store the decoded paths 240 | best_tags_list = [] 241 | 242 | # Start transition 243 | viterbi_score = [] 244 | viterbi_score.append(self.start_transitions.data + emissions[0]) 245 | viterbi_path = [] 246 | 247 | # Here, viterbi_score is a list of tensors of shapes of (num_tags,) where value at 248 | # index i stores the score of the best tag sequence so far that ends with tag i 249 | # viterbi_path saves where the best tags candidate transitioned from; this is used 250 | # when we trace back the best tag sequence 251 | 252 | # Viterbi algorithm recursive case: we compute the score of the best tag sequence 253 | # for every possible next tag 254 | for i in range(1, seq_length): 255 | # Broadcast viterbi score for every possible next tag 256 | broadcast_score = viterbi_score[i - 1].view(batch_size, -1, 1) 257 | # Broadcast emission score for every possible current tag 258 | broadcast_emission = emissions[i].view(batch_size, 1, -1) 259 | # Compute the score matrix of shape (batch_size, num_tags, num_tags) where 260 | # for each sample, each entry at row i and column j stores the score of 261 | # transitioning from tag i to tag j and emitting 262 | score = broadcast_score + self.transitions.data + broadcast_emission 263 | # Find the maximum score over all possible current tag 264 | best_score, best_path = score.max(1) # (batch_size,num_tags,) 265 | # Save the score and the path 266 | viterbi_score.append(best_score) 267 | viterbi_path.append(best_path) 268 | 269 | # Now, compute the best path for each sample 270 | for idx in range(batch_size): 271 | # Find the tag which maximizes the score at the last timestep; this is our best tag 272 | # for the last timestep 273 | seq_end = sequence_lengths[idx]-1 274 | _, best_last_tag = (viterbi_score[seq_end][idx] + self.end_transitions.data).max(0) 275 | best_tags = [best_last_tag.item()] #[best_last_tag[0]] #[best_last_tag.item()] 276 | 277 | # We trace back where the best last tag comes from, append that to our best tag 278 | # sequence, and trace it back again, and so on 279 | for path in reversed(viterbi_path[:sequence_lengths[idx] - 1]): 280 | best_last_tag = path[idx][best_tags[-1]] 281 | best_tags.append(best_last_tag) 282 | 283 | # Reverse the order because we start from the last timestep 284 | best_tags.reverse() 285 | best_tags_list.append(best_tags) 286 | return best_tags_list 287 | 288 | @staticmethod 289 | def 
_log_sum_exp(tensor: Variable, dim: int) -> Variable:
290 |         # Find the max value along `dim`
291 |         offset, _ = tensor.max(dim)
292 |         # Make offset broadcastable
293 |         broadcast_offset = offset.unsqueeze(dim)
294 |         # Perform log-sum-exp safely
295 |         safe_log_sum_exp = torch.log(torch.sum(torch.exp(tensor - broadcast_offset), dim))
296 |         # Add offset back
297 |         return offset + safe_log_sum_exp
298 | 
299 |     def _new(self, *args, **kwargs) -> torch.FloatTensor:
300 |         param = next(self.parameters())
301 |         return param.data.new(*args, **kwargs)
302 | 
--------------------------------------------------------------------------------
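A minimal sketch of driving this CRF module on its own, with the shapes its docstrings prescribe (PyTorch 0.4-style Variables, matching the rest of the repo; the sizes are made up):

```python
import torch
from torch.autograd import Variable
from model.crf import CRF

seq_length, batch_size, num_tags = 7, 2, 5
crf = CRF(num_tags)

emissions = Variable(torch.randn(seq_length, batch_size, num_tags))
tags = Variable(torch.zeros(seq_length, batch_size).long())
mask = Variable(torch.ones(seq_length, batch_size).byte())

loss = -crf(emissions, tags, mask=mask)        # negative log likelihood, summed over the batch
best_paths = crf.decode(emissions, mask=mask)  # list of best tag-id sequences, one per batch element
```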
/model/data_utils.py:
--------------------------------------------------------------------------------
1 | " Data utils from https://github.com/guillaumegenthial/sequence_tagging "
2 | 
3 | import numpy as np
4 | import torch
5 | import os
6 | 
7 | 
8 | # shared global variables to be imported from model also
9 | UNK = "$UNK$"
10 | NUM = "$NUM$"
11 | NONE = "O"
12 | 
13 | 
14 | # special error message
15 | class MyIOError(Exception):
16 |     def __init__(self, filename):
17 |         # custom error message
18 |         message = """
19 | ERROR: Unable to locate file {}.
20 | 
21 | FIX: Have you tried running python build_data.py first?
22 | This will build the vocab files from your train, test and dev sets and
23 | trim your word vectors.
24 | """.format(filename)
25 |         super(MyIOError, self).__init__(message)
26 | 
27 | 
28 | class CoNLLDataset(object):
29 |     """Class that iterates over CoNLL Dataset
30 | 
31 |     __iter__ method yields a tuple (words, tags)
32 |         words: list of raw words
33 |         tags: list of raw tags
34 | 
35 |     If processing_word and processing_tag are not None,
36 |     optional preprocessing is applied
37 | 
38 |     Example:
39 |         ```python
40 |         data = CoNLLDataset(filename)
41 |         for sentence, tags in data:
42 |             pass
43 |         ```
44 | 
45 |     """
46 |     def __init__(self, filename, processing_word=None, processing_tag=None,
47 |                  max_iter=None, use_crf=True):
48 |         """
49 |         Args:
50 |             filename: path to the file
51 |             processing_word: (optional) function that takes a word as input
52 |             processing_tag: (optional) function that takes a tag as input (applied only when use_crf=True)
53 |             max_iter: (optional) max number of sentences to yield
54 | 
55 |         """
56 |         self.filename = filename
57 |         self.processing_word = processing_word
58 |         self.processing_tag = processing_tag
59 |         self.max_iter = max_iter
60 |         self.use_crf = use_crf
61 |         self.length = None
62 | 
63 | 
64 |     def __iter__(self):
65 |         niter = 0
66 |         with open(self.filename) as f:
67 |             words, tags = [], []
68 |             for line in f:
69 |                 line = line.strip()
70 |                 if (len(line) == 0 or line.startswith("-DOCSTART-")):
71 |                     if len(words) != 0:
72 |                         niter += 1
73 |                         if self.max_iter is not None and niter > self.max_iter:
74 |                             break
75 |                         yield words, tags
76 |                         words, tags = [], []
77 |                 else:
78 |                     ls = line.split(' ')
79 |                     word, tag = ls[0],ls[-1]
80 |                     if self.processing_word is not None:
81 |                         word = self.processing_word(word)
82 |                     if self.processing_tag is not None:
83 |                         if self.use_crf:
84 |                             tag = self.processing_tag(tag)
85 |                     words += [word]
86 |                     tags += [tag]
87 | 
88 | 
89 |     def __len__(self):
90 |         """Iterates once over the corpus to set and store length"""
91 |         if self.length is None:
92 |             self.length = 0
93 |             for _ in self:
94 |                 self.length += 1
95 | 
96 |         return self.length
97 | 
98 | 
99 | def get_vocabs(datasets):
100 |     """Build vocabulary from an iterable of dataset objects
101 | 
102 |     Args:
103 |         datasets: a list of dataset objects
104 | 
105 |     Returns:
106 |         a tuple (set of all words, set of all tags) found in the datasets
107 | 
108 |     """
109 |     print("Building vocab...")
110 |     vocab_words = set()
111 |     vocab_tags = set()
112 |     for dataset in datasets:
113 |         for words, tags in dataset:
114 |             vocab_words.update(words)
115 |             vocab_tags.update(tags)
116 |     print("- done. {} tokens".format(len(vocab_words)))
117 |     return vocab_words, vocab_tags
118 | 
119 | 
120 | def get_char_vocab(dataset):
121 |     """Build char vocabulary from a dataset object
122 | 
123 |     Args:
124 |         dataset: an iterator yielding tuples (sentence, tags)
125 | 
126 |     Returns:
127 |         a set of all the characters in the dataset
128 | 
129 |     """
130 |     print("Building char vocab...")
131 |     vocab_char = set()
132 |     for words, _ in dataset:
133 |         for word in words:
134 |             vocab_char.update(word)
135 |     print("- done. {} tokens".format(len(vocab_char)))
136 |     return vocab_char
137 | 
138 | 
139 | def get_glove_vocab(filename):
140 |     """Load vocab from file
141 | 
142 |     Args:
143 |         filename: path to the glove vectors
144 | 
145 |     Returns:
146 |         vocab: set() of strings
147 |     """
148 |     print("Building vocab...")
149 |     vocab = set()
150 |     with open(filename, encoding="utf8") as f:
151 |         for line in f:
152 |             word = line.strip().split(' ')[0]
153 |             vocab.add(word)
154 |     print("- done. {} tokens".format(len(vocab)))
155 |     return vocab
156 | 
157 | 
158 | def write_vocab(vocab, filename):
159 |     """Writes a vocab to a file
160 | 
161 |     Writes one word per line.
162 | 
163 |     Args:
164 |         vocab: iterable that yields words
165 |         filename: path to vocab file
166 | 
167 |     Returns:
168 |         None; the word's line number becomes its id
169 | 
170 |     """
171 |     print("Writing vocab...")
172 |     with open(filename, "w") as f:
173 |         for i, word in enumerate(vocab):
174 |             if i != len(vocab) - 1:
175 |                 f.write("{}\n".format(word))
176 |             else:
177 |                 f.write(word)
178 |     print("- done. {} tokens".format(len(vocab)))
179 | 
180 | 
181 | def load_vocab(filename):
182 |     """Loads vocab from a file
183 | 
184 |     Args:
185 |         filename: (string) the format of the file must be one word per line.
186 | 
187 |     Returns:
188 |         d: dict[word] = index
189 | 
190 |     """
191 |     try:
192 |         d = dict()
193 |         with open(filename) as f:
194 |             for idx, word in enumerate(f):
195 |                 word = word.strip()
196 |                 d[word] = idx
197 | 
198 |     except IOError:
199 |         raise MyIOError(filename)
200 |     return d
201 | 
202 | 
203 | def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
204 |     """Saves glove vectors in numpy array
205 | 
206 |     Args:
207 |         vocab: dictionary vocab[word] = index
208 |         glove_filename: a path to a glove file
209 |         trimmed_filename: a path where to store the matrix in npz format
210 |         dim: (int) dimension of embeddings
211 | 
212 |     """
213 |     embeddings = np.zeros([len(vocab), dim])
214 |     with open(glove_filename, encoding="utf8") as f:
215 |         for line in f:
216 |             line = line.strip().split(' ')
217 |             word = line[0]
218 |             embedding = [float(x) for x in line[1:]]
219 |             if word in vocab:
220 |                 word_idx = vocab[word]
221 |                 embeddings[word_idx] = np.asarray(embedding)
222 | 
223 |     np.savez_compressed(trimmed_filename, embeddings=embeddings)
224 | 
225 | 
226 | def get_trimmed_glove_vectors(filename):
227 |     """
228 |     Args:
229 |         filename: path to the npz file
230 | 
231 |     Returns:
232 |         matrix of embeddings (np array)
233 | 
234 |     """
235 |     try:
236 |         with np.load(filename) as data:
237 |             return data["embeddings"]
238 | 
239 |     except IOError:
240 |         raise MyIOError(filename)
241 | 
242 | 
243 | def get_processing_word(vocab_words=None, vocab_chars=None,
244 |                         lowercase=False, chars=False, allow_unk=True):
245 |     """Return a function that transforms a word (string) into an id,
246 |     or into a tuple (list of char ids, word id) when character ids are
247 |     requested.
248 | 
249 |     Args:
250 |         vocab_words: dict[word] = idx (and optionally vocab_chars: dict[char] = idx)
251 | 
252 |     Returns:
253 |         f("cat") = ([12, 4, 32], 12345)
254 |                  = (list of char ids, word id)
255 | 
256 |     """
257 |     def f(word):
258 |         # 0. get chars of words
259 |         if vocab_chars is not None and chars == True:
260 |             char_ids = []
261 |             for char in word:
262 |                 # ignore chars out of vocabulary
263 |                 if char in vocab_chars:
264 |                     char_ids += [vocab_chars[char]]
265 | 
266 |         # 1. preprocess word
267 |         if lowercase:
268 |             word = word.lower()
269 |         if word.isdigit():
270 |             word = NUM
271 | 
272 |         # 2. get id of word
273 |         if vocab_words is not None:
274 |             if word in vocab_words:
275 |                 word = vocab_words[word]
276 |             else:
277 |                 if allow_unk:
278 |                     word = vocab_words[UNK]
279 |                 else:
280 |                     raise Exception("Unknown key is not allowed. Check that "\
281 |                                     "your vocab (tags?) is correct")
282 | 
283 |         # 3.
return tuple char ids, word id 284 | if vocab_chars is not None and chars == True: 285 | return char_ids, word 286 | else: 287 | return word 288 | 289 | return f 290 | 291 | 292 | def _pad_sequences(sequences, pad_tok, max_length): 293 | """ 294 | Args: 295 | sequences: a generator of list or tuple 296 | pad_tok: the char to pad with 297 | 298 | Returns: 299 | a list of list where each sublist has same length 300 | """ 301 | sequence_padded, sequence_length = [], [] 302 | 303 | for seq in sequences: 304 | seq = list(seq) 305 | seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0) 306 | sequence_padded += [seq_] 307 | sequence_length += [min(len(seq), max_length)] 308 | 309 | return sequence_padded, sequence_length 310 | 311 | 312 | def pad_sequences(sequences, pad_tok, nlevels=1): 313 | """ 314 | Args: 315 | sequences: a generator of list or tuple 316 | pad_tok: the char to pad with 317 | nlevels: "depth" of padding, for the case where we have characters ids 318 | 319 | Returns: 320 | a list of list where each sublist has same length 321 | 322 | """ 323 | if nlevels == 1: 324 | max_length = max(map(lambda x : len(x), sequences)) 325 | sequence_padded, sequence_length = _pad_sequences(sequences, 326 | pad_tok, max_length) 327 | 328 | elif nlevels == 2: 329 | max_length_word = max([max(map(lambda x: len(x), seq)) 330 | for seq in sequences]) 331 | sequence_padded, sequence_length = [], [] 332 | for seq in sequences: 333 | # all words are same length now 334 | sp, sl = _pad_sequences(seq, pad_tok, max_length_word) 335 | sequence_padded += [sp] 336 | sequence_length += [sl] 337 | 338 | max_length_sentence = max(map(lambda x : len(x), sequences)) 339 | sequence_padded, _ = _pad_sequences(sequence_padded, 340 | [pad_tok]*max_length_word, max_length_sentence) 341 | sequence_length, _ = _pad_sequences(sequence_length, 0, 342 | max_length_sentence) 343 | 344 | return sequence_padded, sequence_length 345 | 346 | 347 | def minibatches(data, minibatch_size, use_crf=True): 348 | """ 349 | Args: 350 | data: generator of (sentence, tags) tuples 351 | minibatch_size: (int) 352 | 353 | Yields: 354 | list of tuples 355 | 356 | """ 357 | x_batch, y_batch = [], [] 358 | for (x, y) in data: 359 | if len(x_batch) == minibatch_size: 360 | yield x_batch, y_batch 361 | x_batch, y_batch = [], [] 362 | 363 | if type(x[0]) == tuple: 364 | x = zip(*x) 365 | x_batch += [x] 366 | if use_crf: 367 | y_batch += [y] 368 | else: 369 | if any([x.isdigit() for x in y]): 370 | y_batch.append([int(x) for x in y if x.isdigit()]) 371 | else: 372 | y_batch.append([0,0,0,0,0]) 373 | 374 | if len(x_batch) != 0: 375 | yield x_batch, y_batch 376 | 377 | 378 | def get_chunk_type(tok, idx_to_tag): 379 | """ 380 | Args: 381 | tok: id of token, ex 4 382 | idx_to_tag: dictionary {4: "B-PER", ...} 383 | 384 | Returns: 385 | tuple: "B", "PER" 386 | 387 | """ 388 | if isinstance(tok, torch.Tensor): tok = tok.item() 389 | tag_name = idx_to_tag[tok] 390 | 391 | tag_class = tag_name.split('-')[0] 392 | tag_type = tag_name.split('-')[-1] 393 | return tag_class, tag_type 394 | 395 | 396 | def get_chunks(seq, tags): 397 | """Given a sequence of tags, group entities and their position 398 | 399 | Args: 400 | seq: [4, 4, 0, 0, ...] 
sequence of labels
401 |         tags: dict[tag] = idx, e.g. tags["O"] = 0
402 | 
403 |     Returns:
404 |         list of (chunk_type, chunk_start, chunk_end)
405 | 
406 |     Example:
407 |         seq = [4, 5, 0, 3]
408 |         tags = {"O": 0, "B-PER": 4, "I-PER": 5, "B-LOC": 3}
409 |         result = [("PER", 0, 2), ("LOC", 3, 4)]
410 | 
411 |     """
412 |     default = tags[NONE]
413 |     idx_to_tag = {idx: tag for tag, idx in tags.items()}
414 |     chunks = []
415 |     chunk_type, chunk_start = None, None
416 |     for i, tok in enumerate(seq):
417 |         # End of a chunk
418 |         if tok == default and chunk_type is not None:
419 |             # Add a chunk.
420 |             chunk = (chunk_type, chunk_start, i)
421 |             chunks.append(chunk)
422 |             chunk_type, chunk_start = None, None
423 | 
424 |         # End of a chunk + start of a chunk!
425 |         elif tok != default:
426 |             tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
427 |             if chunk_type is None:
428 |                 chunk_type, chunk_start = tok_chunk_type, i
429 |             elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
430 |                 chunk = (chunk_type, chunk_start, i)
431 |                 chunks.append(chunk)
432 |                 chunk_type, chunk_start = tok_chunk_type, i
433 |         else:
434 |             pass
435 | 
436 |     # end condition
437 |     if chunk_type is not None:
438 |         chunk = (chunk_type, chunk_start, len(seq))
439 |         chunks.append(chunk)
440 | 
441 |     return chunks
442 | 
--------------------------------------------------------------------------------
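get_chunks is the core of the entity-level F1 computation in ner_learner.py: it converts a sequence of label ids into (type, start, end) spans. A runnable sketch using the docstring's example (the ids are illustrative):

```python
from model.data_utils import get_chunks

tags = {"O": 0, "B-PER": 4, "I-PER": 5, "B-LOC": 3}  # tag -> id, as loaded from data/tags.txt
seq = [4, 5, 0, 3]                                   # predicted ids for one sentence

print(get_chunks(seq, tags))  # -> [('PER', 0, 2), ('LOC', 3, 4)]
```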
/model/general_utils.py:
--------------------------------------------------------------------------------
1 | " Logger and Progress Bar from https://github.com/guillaumegenthial/sequence_tagging "
2 | 
3 | import time
4 | import sys
5 | import logging
6 | import numpy as np
7 | 
8 | 
9 | def get_logger(filename):
10 |     """Return a logger instance that writes in filename
11 | 
12 |     Args:
13 |         filename: (string) path to log.txt
14 | 
15 |     Returns:
16 |         logger: (instance of logger)
17 | 
18 |     """
19 |     logger = logging.getLogger('logger')
20 |     logger.setLevel(logging.DEBUG)
21 |     logging.basicConfig(format='%(message)s', level=logging.DEBUG)
22 |     handler = logging.FileHandler(filename)
23 |     handler.setLevel(logging.DEBUG)
24 |     handler.setFormatter(logging.Formatter(
25 |             '%(asctime)s:%(levelname)s: %(message)s'))
26 |     logging.getLogger().addHandler(handler)
27 | 
28 |     return logger
29 | 
30 | 
31 | class Progbar(object):
32 |     """Progbar class copied from keras (https://github.com/fchollet/keras/)
33 | 
34 |     Displays a progress bar.
35 |     Small edit: added a strict arg to update
36 |     # Arguments
37 |         target: Total number of steps expected.
38 |         width: Progress bar width on screen.
39 |     """
40 | 
41 |     def __init__(self, target, width=30, verbose=1):
42 |         self.width = width
43 |         self.target = target
44 |         self.sum_values = {}
45 |         self.unique_values = []
46 |         self.start = time.time()
47 |         self.total_width = 0
48 |         self.seen_so_far = 0
49 |         self.verbose = verbose
50 | 
51 |     def update(self, current, values=[], exact=[], strict=[]):
52 |         """
53 |         Updates the progress bar.
54 |         # Arguments
55 |             current: Index of current step.
56 |             values: List of tuples (name, value_for_last_step).
57 |                 The progress bar will display averages for these values.
58 |             exact: List of tuples (name, value_for_last_step).
59 |                 The progress bar will display these values directly.
60 |         """
61 | 
62 |         for k, v in values:
63 |             if k not in self.sum_values:
64 |                 self.sum_values[k] = [v * (current - self.seen_so_far),
65 |                                       current - self.seen_so_far]
66 |                 self.unique_values.append(k)
67 |             else:
68 |                 self.sum_values[k][0] += v * (current - self.seen_so_far)
69 |                 self.sum_values[k][1] += (current - self.seen_so_far)
70 |         for k, v in exact:
71 |             if k not in self.sum_values:
72 |                 self.unique_values.append(k)
73 |             self.sum_values[k] = [v, 1]
74 | 
75 |         for k, v in strict:
76 |             if k not in self.sum_values:
77 |                 self.unique_values.append(k)
78 |             self.sum_values[k] = v
79 | 
80 |         self.seen_so_far = current
81 | 
82 |         now = time.time()
83 |         if self.verbose == 1:
84 |             prev_total_width = self.total_width
85 |             sys.stdout.write("\b" * prev_total_width)
86 |             sys.stdout.write("\r")
87 | 
88 |             numdigits = int(np.floor(np.log10(self.target))) + 1
89 |             barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
90 |             bar = barstr % (current, self.target)
91 |             prog = float(current)/self.target
92 |             prog_width = int(self.width*prog)
93 |             if prog_width > 0:
94 |                 bar += ('='*(prog_width-1))
95 |                 if current < self.target:
96 |                     bar += '>'
97 |                 else:
98 |                     bar += '='
99 |             bar += ('.'*(self.width-prog_width))
100 |             bar += ']'
101 |             sys.stdout.write(bar)
102 |             self.total_width = len(bar)
103 | 
104 |             if current:
105 |                 time_per_unit = (now - self.start) / current
106 |             else:
107 |                 time_per_unit = 0
108 |             eta = time_per_unit*(self.target - current)
109 |             info = ''
110 |             if current < self.target:
111 |                 info += ' - ETA: %ds' % eta
112 |             else:
113 |                 info += ' - %ds' % (now - self.start)
114 |             for k in self.unique_values:
115 |                 if type(self.sum_values[k]) is list:
116 |                     info += ' - %s: %.4f' % (k,
117 |                         self.sum_values[k][0] / max(1, self.sum_values[k][1]))
118 |                 else:
119 |                     info += ' - %s: %s' % (k, self.sum_values[k])
120 | 
121 |             self.total_width += len(info)
122 |             if prev_total_width > self.total_width:
123 |                 info += ((prev_total_width-self.total_width) * " ")
124 | 
125 |             sys.stdout.write(info)
126 |             sys.stdout.flush()
127 | 
128 |             if current >= self.target:
129 |                 sys.stdout.write("\n")
130 | 
131 |         if self.verbose == 2:
132 |             if current >= self.target:
133 |                 info = '%ds' % (now - self.start)
134 |                 for k in self.unique_values:
135 |                     info += ' - %s: %.4f' % (k,
136 |                         self.sum_values[k][0] / max(1, self.sum_values[k][1]))
137 |                 sys.stdout.write(info + "\n")
138 | 
139 |     def add(self, n, values=[]):
140 |         self.update(self.seen_so_far+n, values)
141 | 
142 | 
--------------------------------------------------------------------------------
/model/ner_learner.py:
--------------------------------------------------------------------------------
1 | """ Works with pytorch 0.4.0 """
2 | 
3 | from .core import *
4 | from .data_utils import pad_sequences, minibatches, get_chunks
5 | from .crf import CRF
6 | from .general_utils import Progbar
7 | from torch.optim.lr_scheduler import StepLR
8 | 
9 | if os.name == "posix": from allennlp.modules.elmo import Elmo, batch_to_ids  # AllenNLP is currently only supported on linux
10 | 
11 | 
12 | class NERLearner(object):
13 |     """
14 |     NERLearner class that encapsulates a pytorch nn.Module model and ModelData class
15 |     Contains methods for training and testing the model
16 |     """
17 |     def __init__(self, config, model):
18 |         super().__init__()
19 |         self.config = config
20 |         self.logger = self.config.logger
21 |         self.model = model
22 |         self.model_path = config.dir_model
23 |         self.use_elmo = config.use_elmo
24 | 
25 | 
26 |         self.idx_to_tag = {idx: tag for tag, idx in
self.config.vocab_tags.items()} 28 | 29 | self.criterion = CRF(self.config.ntags) 30 | self.optimizer = optim.Adam(self.model.parameters()) 31 | 32 | if self.use_elmo: 33 | options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json" 34 | weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" 35 | self.elmo = Elmo(options_file, weight_file, 2, dropout=0) 36 | else: 37 | self.load_emb() 38 | 39 | if USE_GPU: 40 | self.use_cuda = True 41 | self.logger.info("GPU found.") 42 | self.model = model.cuda() 43 | self.criterion = self.criterion.cuda() 44 | if self.use_elmo: 45 | self.elmo = self.elmo.cuda() 46 | print("Moved elmo to cuda") 47 | else: 48 | self.model = model.cpu() 49 | self.use_cuda = False 50 | self.logger.info("No GPU found.") 51 | 52 | def get_model_path(self, name): 53 | return os.path.join(self.model_path,name)+'.h5' 54 | 55 | def get_layer_groups(self, do_fc=False): 56 | return children(self.model) 57 | 58 | def freeze_to(self, n): 59 | c=self.get_layer_groups() 60 | for l in c: 61 | set_trainable(l, False) 62 | for l in c[n:]: 63 | set_trainable(l, True) 64 | 65 | def unfreeze(self): 66 | self.freeze_to(0) 67 | 68 | def save(self, name=None): 69 | if not name: 70 | name = self.config.ner_model_path 71 | save_model(self.model, self.get_model_path(name)) 72 | self.logger.info(f"Saved model at {self.get_model_path(name)}") 73 | 74 | def load_emb(self): 75 | self.model.emb.weight = nn.Parameter(T(self.config.embeddings)) 76 | self.model.emb.weight.requires_grad = False 77 | self.logger.info('Loading pretrained word embeddings') 78 | 79 | def load(self, fn=None): 80 | if not fn: fn = self.config.ner_model_path 81 | fn = self.get_model_path(fn) 82 | load_ner_model(self.model, fn, strict=True) 83 | self.logger.info(f"Loaded model from {fn}") 84 | 85 | def batch_iter(self, train, batch_size, return_lengths=False, shuffle=False, sorter=False): 86 | """ 87 | Builds a generator from the given dataloader to be fed into the model 88 | 89 | Args: 90 | train: DataLoader 91 | batch_size: size of each batch 92 | return_lengths: if True, generator returns a list of sequence lengths for each 93 | sample in the batch 94 | ie. 
sequence_lengths = [8,7,4,3] 95 | shuffle: if True, shuffles the data for each epoch 96 | sorter: if True, uses a sorter to shuffle the data 97 | 98 | Returns: 99 | nbatches: (int) number of batches 100 | data_generator: batch generator yielding 101 | dict inputs:{'word_ids' : np.array([[padded word_ids in sent1], ...]) 102 | 'char_ids': np.array([[[padded char_ids in word1_sent1], ...], 103 | [padded char_ids in word1_sent2], ...], 104 | ...])} 105 | labels: np.array([[padded label_ids in sent1], ...]) 106 | sequence_lengths: list([len(sent1), len(sent2), ...]) 107 | 108 | """ 109 | nbatches = (len(train) + batch_size - 1) // batch_size 110 | 111 | def data_generator(): 112 | while True: 113 | if shuffle: train.shuffle() 114 | elif sorter==True and train.sorter: train.sort() 115 | 116 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 117 | 118 | # perform padding of the given data 119 | if self.config.use_chars: 120 | char_ids, word_ids = zip(*words) 121 | word_ids, sequence_lengths = pad_sequences(word_ids, 1) 122 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 123 | nlevels=2) 124 | 125 | else: 126 | word_ids, sequence_lengths = pad_sequences(words, 0) 127 | 128 | if self.use_elmo: 129 | word_ids = words 130 | 131 | if labels: 132 | labels, _ = pad_sequences(labels, 0) 133 | # if categorical 134 | ## labels = [to_categorical(label, num_classes=len(train.tag_itos)) for label in labels] 135 | 136 | # build dictionary 137 | inputs = { 138 | "word_ids": np.asarray(word_ids) 139 | } 140 | 141 | if self.config.use_chars: 142 | inputs["char_ids"] = np.asarray(char_ids) 143 | 144 | if return_lengths: 145 | yield(inputs, np.asarray(labels), sequence_lengths) 146 | 147 | else: 148 | yield (inputs, np.asarray(labels)) 149 | 150 | return (nbatches, data_generator()) 151 | 152 | 153 | def fine_tune(self, train, dev=None): 154 | """ 155 | Fine tune the NER model by freezing the pre-trained encoder and training the newly 156 | instantiated layers for 1 epochs 157 | """ 158 | self.logger.info("Fine Tuning Model") 159 | self.fit(train, dev, epochs=1, fine_tune=True) 160 | 161 | 162 | def fit(self, train, dev=None, epochs=None, fine_tune=False): 163 | """ 164 | Fits the model to the training dataset and evaluates on the validation set. 165 | Saves the model to disk 166 | """ 167 | if not epochs: 168 | epochs = self.config.nepochs 169 | batch_size = self.config.batch_size 170 | 171 | nbatches_train, train_generator = self.batch_iter(train, batch_size, 172 | return_lengths=True) 173 | if dev: 174 | nbatches_dev, dev_generator = self.batch_iter(dev, batch_size, 175 | return_lengths=True) 176 | 177 | scheduler = StepLR(self.optimizer, step_size=1, gamma=self.config.lr_decay) 178 | 179 | if not fine_tune: self.logger.info("Training Model") 180 | 181 | f1s = [] 182 | 183 | for epoch in range(epochs): 184 | scheduler.step() 185 | self.train(epoch, nbatches_train, train_generator, fine_tune=fine_tune) 186 | 187 | if dev: 188 | f1 = self.test(nbatches_dev, dev_generator, fine_tune=fine_tune) 189 | 190 | # Early stopping 191 | if len(f1s) > 0: 192 | if f1 < max(f1s[max(-self.config.nepoch_no_imprv, -len(f1s)):]): #if sum([f1 > f1s[max(-i, -len(f1s))] for i in range(1,self.config.nepoch_no_imprv+1)]) == 0: 193 | print("No improvement in the last 3 epochs. 
Stopping training") 194 | break 195 | else: 196 | f1s.append(f1) 197 | 198 | if fine_tune: 199 | self.save(self.config.ner_ft_path) 200 | else : 201 | self.save(self.config.ner_model_path) 202 | 203 | 204 | def train(self, epoch, nbatches_train, train_generator, fine_tune=False): 205 | self.logger.info('\nEpoch: %d' % epoch) 206 | self.model.train() 207 | if not self.use_elmo: self.model.emb.weight.requires_grad = False 208 | 209 | train_loss = 0 210 | correct = 0 211 | total = 0 212 | total_step = None 213 | 214 | prog = Progbar(target=nbatches_train) 215 | 216 | for batch_idx, (inputs, targets, sequence_lengths) in enumerate(train_generator): 217 | 218 | if batch_idx == nbatches_train: break 219 | if inputs['word_ids'].shape[0] == 1: 220 | self.logger.info('Skipping batch of size=1') 221 | continue 222 | 223 | total_step = batch_idx 224 | targets = T(targets, cuda=self.use_cuda).transpose(0,1).contiguous() 225 | self.optimizer.zero_grad() 226 | 227 | if self.use_elmo: 228 | sentences = inputs['word_ids'] 229 | character_ids = batch_to_ids(sentences) 230 | if self.use_cuda: 231 | character_ids = character_ids.cuda() 232 | embeddings = self.elmo(character_ids) 233 | word_input = embeddings['elmo_representations'][0] 234 | word_input, targets = Variable(word_input, requires_grad=False), \ 235 | Variable(targets) 236 | inputs = (word_input) 237 | 238 | else: 239 | word_input = T(inputs['word_ids'], cuda=self.use_cuda) 240 | char_input = T(inputs['char_ids'], cuda=self.use_cuda) 241 | word_input, char_input, targets = Variable(word_input, requires_grad=False), \ 242 | Variable(char_input, requires_grad=False),\ 243 | Variable(targets) 244 | inputs = (word_input, char_input) 245 | 246 | 247 | outputs = self.model(inputs) 248 | 249 | # Create mask 250 | if self.use_elmo: 251 | mask = Variable(embeddings['mask'].transpose(0,1)) 252 | if self.use_cuda: 253 | mask = mask.cuda() 254 | else: 255 | mask = create_mask(sequence_lengths, targets, cuda=self.use_cuda) 256 | 257 | # Get CRF Loss 258 | loss = -1*self.criterion(outputs, targets, mask=mask) 259 | loss.backward() 260 | self.optimizer.step() 261 | 262 | # Callbacks 263 | train_loss += loss.item() 264 | predictions = self.criterion.decode(outputs, mask=mask) 265 | masked_targets = mask_targets(targets, sequence_lengths) 266 | 267 | t_ = mask.type(torch.LongTensor).sum().item() 268 | total += t_ 269 | c_ = sum([1 if p[i] == mt[i] else 0 for p, mt in zip(predictions, masked_targets) for i in range(len(p))]) 270 | correct += c_ 271 | 272 | prog.update(batch_idx + 1, values=[("train loss", loss.item())], exact=[("Accuracy", 100*c_/t_)]) 273 | 274 | self.logger.info("Train Loss: %.3f, Train Accuracy: %.3f%% (%d/%d)" %(train_loss/(total_step+1), 100.*correct/total, correct, total) ) 275 | 276 | 277 | def test(self, nbatches_val, val_generator, fine_tune=False): 278 | self.model.eval() 279 | accs = [] 280 | test_loss = 0 281 | correct_preds = 0 282 | total_correct = 0 283 | total_preds = 0 284 | total_step = None 285 | 286 | for batch_idx, (inputs, targets, sequence_lengths) in enumerate(val_generator): 287 | if batch_idx == nbatches_val: break 288 | if inputs['word_ids'].shape[0] == 1: 289 | self.logger.info('Skipping batch of size=1') 290 | continue 291 | 292 | total_step = batch_idx 293 | targets = T(targets, cuda=self.use_cuda).transpose(0,1).contiguous() 294 | 295 | if self.use_elmo: 296 | sentences = inputs['word_ids'] 297 | character_ids = batch_to_ids(sentences) 298 | if self.use_cuda: 299 | character_ids = character_ids.cuda() 300 | embeddings 
= self.elmo(character_ids) 301 | word_input = embeddings['elmo_representations'][1] 302 | word_input, targets = Variable(word_input, requires_grad=False), \ 303 | Variable(targets) 304 | inputs = (word_input) 305 | 306 | else: 307 | word_input = T(inputs['word_ids'], cuda=self.use_cuda) 308 | char_input = T(inputs['char_ids'], cuda=self.use_cuda) 309 | word_input, char_input, targets = Variable(word_input, requires_grad=False), \ 310 | Variable(char_input, requires_grad=False),\ 311 | Variable(targets) 312 | inputs = (word_input, char_input) 313 | 314 | outputs = self.model(inputs) 315 | 316 | # Create mask 317 | if self.use_elmo: 318 | mask = Variable(embeddings['mask'].transpose(0,1)) 319 | if self.use_cuda: 320 | mask = mask.cuda() 321 | else: 322 | mask = create_mask(sequence_lengths, targets, cuda=self.use_cuda) 323 | 324 | # Get CRF Loss 325 | loss = -1*self.criterion(outputs, targets, mask=mask) 326 | 327 | # Callbacks 328 | test_loss += loss.item() 329 | predictions = self.criterion.decode(outputs, mask=mask) 330 | masked_targets = mask_targets(targets, sequence_lengths) 331 | 332 | for lab, lab_pred in zip(masked_targets, predictions): 333 | 334 | accs += [1 if a==b else 0 for (a, b) in zip(lab, lab_pred)] 335 | 336 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 337 | lab_pred_chunks = set(get_chunks(lab_pred, 338 | self.config.vocab_tags)) 339 | 340 | correct_preds += len(lab_chunks & lab_pred_chunks) 341 | total_preds += len(lab_pred_chunks) 342 | total_correct += len(lab_chunks) 343 | 344 | p = correct_preds / total_preds if correct_preds > 0 else 0 345 | r = correct_preds / total_correct if correct_preds > 0 else 0 346 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 347 | acc = np.mean(accs) 348 | 349 | self.logger.info("Val Loss : %.3f, Val Accuracy: %.3f%%, Val F1: %.3f%%" %(test_loss/(total_step+1), 100*acc, 100*f1)) 350 | return 100*f1 351 | 352 | def evaluate(self,test): 353 | batch_size = self.config.batch_size 354 | nbatches_test, test_generator = self.batch_iter(test, batch_size, 355 | return_lengths=True) 356 | self.logger.info('Evaluating on test set') 357 | self.test(nbatches_test, test_generator) 358 | 359 | def predict_batch(self, words): 360 | self.model.eval() 361 | if len(words) == 1: 362 | mult = np.ones(2).reshape(2, 1).astype(int) 363 | 364 | if self.use_elmo: 365 | sentences = words 366 | character_ids = batch_to_ids(sentences) 367 | if self.use_cuda: 368 | character_ids = character_ids.cuda() 369 | embeddings = self.elmo(character_ids) 370 | word_input = embeddings['elmo_representations'][1] 371 | word_input = Variable(word_input, requires_grad=False) 372 | 373 | if len(words) == 1: 374 | word_input = ((mult*word_input.transpose(0,1)).transpose(0,1).contiguous()).type(torch.FloatTensor) 375 | 376 | word_input = T(word_input, cuda=self.use_cuda) 377 | inputs = (word_input) 378 | 379 | else: 380 | #char_ids, word_ids = zip(*words) 381 | char_ids = [[c[0] for c in s] for s in words] 382 | word_ids = [[x[1] for x in s] for s in words] 383 | word_ids, sequence_lengths = pad_sequences(word_ids, 1) 384 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, 385 | nlevels=2) 386 | word_ids = np.asarray(word_ids) 387 | char_ids = np.asarray(char_ids) 388 | 389 | if len(words) == 1: 390 | word_ids = mult*word_ids 391 | char_ids = (mult*char_ids.transpose(1,0,2)).transpose(1,0,2) 392 | word_input = T(word_ids, cuda=self.use_cuda) 393 | char_input = T(char_ids, cuda=self.use_cuda) 394 | 395 | word_input, char_input = Variable(word_input, 
requires_grad=False), \
396 |                                  Variable(char_input, requires_grad=False)
397 | 
398 |             inputs = (word_input, char_input)
399 | 
400 | 
401 |         outputs = self.model(inputs)
402 | 
403 |         predictions = self.criterion.decode(outputs)
404 |         sequence_lengths = [len(s) for s in words]  # fix: previously only defined in the non-ELMo branch
405 |         predictions = [p[:i] for p, i in zip(predictions, sequence_lengths)]
406 | 
407 |         return predictions
408 | 
409 |     def predict(self, sentences):
410 |         """Returns list of tags
411 | 
412 |         Args:
413 |             sentences: a raw string; it is split into sentences and tokenized with spacy
414 | 
415 |         Returns:
416 |             preds: one list of tags (strings) per sentence, one tag per word
417 | 
418 |         """
419 |         nlp = spacy.load('en')
420 |         doc = nlp(sentences)
421 |         words_raw = [[token.text for token in sent] for sent in doc.sents]
422 |         if self.use_elmo:
423 |             words = words_raw
424 |         else:
425 |             words = [[self.config.processing_word(w) for w in s] for s in words_raw]
426 | 
427 |         pred_ids = self.predict_batch(words)
428 |         preds = [[self.idx_to_tag[idx.item() if isinstance(idx, torch.Tensor) else idx] for idx in s] for s in pred_ids]
429 | 
430 |         return preds
431 | 
432 | 
433 | def create_mask(sequence_lengths, targets, cuda, batch_first=False):
434 |     """ Creates binary mask """
435 |     mask = Variable(torch.ones(targets.size()).type(torch.ByteTensor))
436 |     if cuda: mask = mask.cuda()
437 | 
438 |     for i,l in enumerate(sequence_lengths):
439 |         if batch_first:
440 |             if l < targets.size(1):
441 |                 mask.data[i, l:] = 0
442 |         else:
443 |             if l < targets.size(0):
444 |                 mask.data[l:, i] = 0
445 | 
446 |     return mask
447 | 
448 | 
449 | def mask_targets(targets, sequence_lengths, batch_first=False):
450 |     """ Masks the targets """
451 |     if not batch_first:
452 |         targets = targets.transpose(0,1)
453 |     t = []
454 |     for l, p in zip(targets,sequence_lengths):
455 |         t.append(l[:p].data.tolist())
456 |     return t
457 | 
--------------------------------------------------------------------------------
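Tagging raw text end to end, mirroring test.py — a sketch that assumes a trained checkpoint already exists at config.ner_model_path and that spacy's English model is installed (predict uses it for sentence splitting):

```python
from model.config import Config
from model.ner_model import NERModel
from model.ner_learner import NERLearner

config = Config()
if config.use_elmo: config.processing_word = None

model = NERModel(config)
learn = NERLearner(config, model)
learn.load()  # loads the checkpoint saved by train.py (results/test/saves/ner_15e_glove.h5 by default)

print(learn.predict("Peter Johnson lives in Los Angeles"))
# e.g. [['B-PER', 'I-PER', 'O', 'O', 'B-LOC', 'I-LOC']]  (actual tags depend on training)
```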
/model/ner_model.py:
--------------------------------------------------------------------------------
1 | #from fastai.text import *
2 | from .core import *
3 | 
4 | class NERModel(nn.Module):
5 | 
6 |     def __init__(self, config):
7 |         super().__init__()
8 |         self.config = config
9 |         self.use_elmo = config.use_elmo
10 | 
11 |         if not self.use_elmo:
12 |             self.emb = nn.Embedding(self.config.nwords, self.config.dim_word, padding_idx=0)
13 |             self.char_embeddings = nn.Embedding(self.config.nchars, self.config.dim_char, padding_idx=0)
14 |             self.char_lstm = nn.LSTM(self.config.dim_char, self.config.hidden_size_char, bidirectional=True)
15 | 
16 |         self.dropout = nn.Dropout(p=self.config.dropout)
17 |         self.word_lstm = nn.LSTM(self.config.dim_elmo if self.use_elmo else self.config.dim_word+2*self.config.hidden_size_char,
18 |                                  self.config.hidden_size_lstm, bidirectional=True)
19 | 
20 |         self.linear = LinearClassifier(self.config, layers=[self.config.hidden_size_lstm*2, self.config.ntags], drops=[0.5])
21 | 
22 | 
23 |     def forward(self, input):
24 |         # word_ids dim = (batch_size x sent_length)
25 |         # char_ids dim = (batch_size x sent_length x word_length)
26 | 
27 |         if self.use_elmo:
28 |             word_emb = self.dropout(input.transpose(0,1))
29 | 
30 |         else:
31 |             word_input, char_input = input[0], input[1]
32 |             word_input.transpose_(0,1)
33 | 
34 |             # Word Embedding
35 |             word_emb = self.emb(word_input)  # shape = S*B*dim_word
36 | 
37 |             # Char LSTM
38 |             char_emb = self.char_embeddings(char_input.view(-1, char_input.size(2)))  # https://stackoverflow.com/questions/47205762/embedding-3d-data-in-pytorch
39 |             char_emb = char_emb.view(*char_input.size(), -1)  # dim = B*S*W*dim_char
40 | 
41 |             _, (h, c) = self.char_lstm(char_emb.view(-1, char_emb.size(2), char_emb.size(3)).transpose(0,1))  # h: (num_layers * num_directions, B*S, hidden_size_char)
42 |             char_output = torch.cat((h[0], h[1]), 1)  # shape = (B*S)*(2*hidden_size_char)
43 |             char_output = char_output.view(char_emb.size(0), char_emb.size(1), -1).transpose(0,1)  # shape = S*B*(2*hidden_size_char)
44 | 
45 |             # Concat char output and word output
46 |             word_emb = torch.cat((word_emb, char_output), 2)  # shape = S*B*(dim_word+2*hidden_size_char)
47 |             word_emb = self.dropout(word_emb)
48 | 
49 |         output, (h, c) = self.word_lstm(word_emb)  # shape = S*B*(2*hidden_size_lstm)
50 |         output = self.dropout(output)
51 | 
52 |         output = self.linear(output)
53 |         return output  # shape = S*B*ntags
54 | 
55 | class LinearBlock(nn.Module):
56 |     def __init__(self, ni, nf, drop):
57 |         super().__init__()
58 |         self.lin = nn.Linear(ni, nf)
59 |         self.drop = nn.Dropout(drop)
60 |         self.bn = nn.BatchNorm1d(ni)
61 | 
62 |     def forward(self, x):
63 |         return self.lin(self.drop(self.bn(x)))
64 | 
65 | 
66 | class LinearClassifier(nn.Module):
67 |     def __init__(self, config, layers, drops):
68 |         self.config = config
69 |         super().__init__()
70 |         self.layers = nn.ModuleList([
71 |             LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)])
72 | 
73 |     def forward(self, input):
74 |         output = input
75 |         sl,bs,_ = output.size()
76 |         x = output.view(-1, 2*self.config.hidden_size_lstm)
77 | 
78 |         for l in self.layers:
79 |             l_x = l(x)
80 |             x = F.relu(l_x)
81 |         return l_x.view(sl, bs, self.config.ntags)
82 | 
--------------------------------------------------------------------------------
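The tensor shapes implied by the comments in forward, as a sketch for the GloVe-plus-chars configuration (use_elmo=False); Config() needs the built data files, and the ids here are dummies:

```python
import torch
from torch.autograd import Variable
from model.config import Config
from model.ner_model import NERModel

config = Config()
model = NERModel(config)

batch_size, sent_len, word_len = 4, 9, 12
word_ids = Variable(torch.ones(batch_size, sent_len).long())
char_ids = Variable(torch.ones(batch_size, sent_len, word_len).long())

out = model((word_ids, char_ids))
print(out.size())  # -> (sent_len, batch_size, config.ntags)
```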
/test.py:
--------------------------------------------------------------------------------
1 | """ Command Line Usage
2 | Args:
3 |     eval: Evaluate F1 Score and Accuracy on test set
4 |     pred: Predict sentence.
5 |         (optional): Sentence to predict on. If none given, predicts on "Peter Johnson lives in Los Angeles"
6 | 
7 | Example:
8 |     > python test.py eval pred "Obama is from Hawaii"
9 | """
10 | 
11 | from model.data_utils import CoNLLDataset
12 | from model.config import Config
13 | from model.ner_model import NERModel
14 | from model.ner_learner import NERLearner
15 | import sys
16 | 
17 | 
18 | def main():
19 |     # create instance of config
20 |     config = Config()
21 |     if config.use_elmo: config.processing_word = None
22 | 
23 |     # build model
24 |     model = NERModel(config)
25 | 
26 |     learn = NERLearner(config, model)
27 |     learn.load()
28 | 
29 |     if len(sys.argv) == 1:
30 |         print("No arguments given. Running full test")
31 |         sys.argv.append("eval")
32 |         sys.argv.append("pred")
33 | 
34 |     if sys.argv[1] == "eval":
35 |         # create datasets
36 |         test = CoNLLDataset(config.filename_test, config.processing_word,
37 |                             config.processing_tag, config.max_iter)
38 |         learn.evaluate(test)
39 | 
40 |     if "pred" in sys.argv:  # avoids an IndexError when only "eval" is passed
41 |         try:
42 |             sent = sys.argv[sys.argv.index("pred") + 1]
43 |         except IndexError:
44 |             sent = "Peter Johnson lives in Los Angeles"  # predict() expects a raw string
45 | 
46 |         print("Predicting sentence: ", sent)
47 |         pred = learn.predict(sent)
48 |         print(pred)
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     main()
53 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | from model.data_utils import CoNLLDataset
2 | from model.config import Config
3 | from model.ner_model import NERModel
4 | from model.ner_learner import NERLearner
5 | 
6 | 
7 | def main():
8 |     # create instance of config
9 |     config = Config()
10 |     if config.use_elmo: config.processing_word = None
11 | 
12 |     # build model
13 |     model = NERModel(config)
14 | 
15 |     # create datasets
16 |     dev = CoNLLDataset(config.filename_dev, config.processing_word,
17 |                        config.processing_tag, config.max_iter, config.use_crf)
18 |     train = CoNLLDataset(config.filename_train, config.processing_word,
19 |                          config.processing_tag, config.max_iter, config.use_crf)
20 | 
21 |     learn = NERLearner(config, model)
22 |     learn.fit(train, dev)
23 | 
24 | 
25 | if __name__ == "__main__":
26 |     main()
27 | 
--------------------------------------------------------------------------------
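Putting the three entry points together — the whole pipeline from the README, run in order (a sketch; it assumes data/ and the GloVe file are already in place):

```python
import subprocess
import sys

# build vocabs/embeddings, then train, then evaluate + predict
for script in ("build_data.py", "train.py", "test.py"):
    subprocess.run([sys.executable, script], check=True)
```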