├── .gitignore
├── LICENSE.txt
├── README.md
├── build_data.py
├── data
    └── test.txt
├── evaluate.py
├── makefile
├── model
    ├── __init__.py
    ├── base_model.py
    ├── config.py
    ├── data_utils.py
    ├── general_utils.py
    └── ner_model.py
├── requirements.txt
└── train.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *sh
2 | *pyc
3 | /data
4 | !/data/.gitkeep
5 | /results
6 | *.DS_Store
7 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    Copyright 2017 Guillaume Genthial
179 | 
180 |    Licensed under the Apache License, Version 2.0 (the "License");
181 |    you may not use this file except in compliance with the License.
182 |    You may obtain a copy of the License at
183 | 
184 |        http://www.apache.org/licenses/LICENSE-2.0
185 | 
186 |    Unless required by applicable law or agreed to in writing, software
187 |    distributed under the License is distributed on an "AS IS" BASIS,
188 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189 |    See the License for the specific language governing permissions and
190 |    limitations under the License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Named Entity Recognition with Tensorflow
  2 | 
  3 | This repo implements a NER model using Tensorflow (LSTM + CRF + character embeddings).
  4 | 
  5 | __A [better implementation is available here, using `tf.data` and `tf.estimator`, and achieves an F1 of 91.21](https://github.com/guillaumegenthial/tf_ner)__
  6 | 
  7 | State-of-the-art performance (F1 score between 90 and 91).
  8 | 
  9 | Check the [blog post](https://guillaumegenthial.github.io/sequence-tagging-with-tensorflow.html)
 10 | 
 11 | ## Task
 12 | 
 13 | Given a sentence, give a tag to each word. A classical application is Named Entity Recognition (NER). Here is an example
 14 | 
 15 | ```
 16 | John   lives in New   York
 17 | B-PER  O     O  B-LOC I-LOC
 18 | ```
 19 | 
 20 | 
 21 | ## Model
 22 | 
 23 | Similar to [Lample et al.](https://arxiv.org/abs/1603.01360) and [Ma and Hovy](https://arxiv.org/pdf/1603.01354.pdf).
 24 | 
 25 | - concatenate final states of a bi-lstm on character embeddings to get a character-based representation of each word
 26 | - concatenate this representation to a standard word vector representation (GloVe here)
 27 | - run a bi-lstm on each sentence to extract contextual representation of each word
 28 | - decode with a linear chain CRF
 29 | 
 30 | 
 31 | 
 32 | ## Getting started
 33 | 
 34 | 
 35 | 1. Download the GloVe vectors with
 36 | 
 37 | ```
 38 | make glove
 39 | ```
 40 | 
 41 | Alternatively, you can download them manually [here](https://nlp.stanford.edu/projects/glove/) and update the `filename_glove` entry in `model/config.py`. You can also choose not to load pretrained word vectors by changing the entry `use_pretrained` to `False` in `model/config.py`.
 42 | 
 43 | 2. Build the training data, train and evaluate the model with
 44 | ```
 45 | make run
 46 | ```
 47 | 
 48 | 
 49 | ## Details
 50 | 
 51 | 
 52 | Here is the breakdown of the commands executed in `make run`:
 53 | 
 54 | 1. [DO NOT MISS THIS STEP] Build vocab from the data and extract trimmed glove vectors according to the config in `model/config.py`.
 55 | 
 56 | ```
 57 | python build_data.py
 58 | ```
 59 | 
 60 | 2. Train the model with
 61 | 
 62 | ```
 63 | python train.py
 64 | ```
 65 | 
 66 | 
 67 | 3. Evaluate and interact with the model with
 68 | ```
 69 | python evaluate.py
 70 | ```
 71 | 
 72 | 
 73 | Data iterators and utils are in `model/data_utils.py` and the model with training/test procedures is in `model/ner_model.py`
 74 | 
 75 | Training time on an NVIDIA Tesla K80 is 110 seconds per epoch on the CoNLL train set using character embeddings and CRF.
 76 | 
 77 | 
 78 | 
 79 | ## Training Data
 80 | 
 81 | 
 82 | The training data must be in the following format (identical to the CoNLL2003 dataset).
 83 | 
 84 | A default test file is provided to help you get started.
 85 | 
 86 | 
 87 | ```
 88 | John B-PER
 89 | lives O
 90 | in O
 91 | New B-LOC
 92 | York I-LOC
 93 | . O
 94 | 
 95 | This O
 96 | is O
 97 | another O
 98 | sentence O
 99 | ```
100 | 
101 | 
102 | Once you have produced your data files, change the dataset parameters in `model/config.py`, for example:
103 | 
104 | ```
105 | # dataset
106 | filename_dev = "data/coNLL/eng/eng.testa.iob"
107 | filename_test = "data/coNLL/eng/eng.testb.iob"
108 | filename_train = "data/coNLL/eng/eng.train.iob"
109 | ```
110 | 
111 | 
112 | 
113 | 
114 | ## License
115 | 
116 | This project is licensed under the terms of the Apache 2.0 license (as are TensorFlow and its derivatives). If you use it for research, a citation would be appreciated.
117 | 
118 | 


--------------------------------------------------------------------------------
/build_data.py:
--------------------------------------------------------------------------------
 1 | from model.config import Config
 2 | from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
 3 |     get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
 4 |     export_trimmed_glove_vectors, get_processing_word
 5 | 
 6 | 
 7 | def main():
 8 |     """Procedure to build data
 9 | 
10 |     You MUST RUN this procedure. It iterates over the whole dataset (train,
11 |     dev and test) and extracts the vocabularies of words and tags; the
12 |     character vocabulary is built from the train set only. The word vocab
13 |     is restricted to words that are also in the GloVe vocab, plus UNK and
14 |     NUM. Each vocabulary is written to the file named in the config, and
15 |     the GloVe vectors are then trimmed to the saved word vocabulary with
16 |     export_trimmed_glove_vectors.
17 | 
18 |     Takes no arguments: the config is created here with Config(load=False),
19 |     so the vocab files do not need to exist yet.
20 | 
21 |     """
22 |     # get config (without loading vocabs) and processing of words
23 |     config = Config(load=False)
24 |     processing_word = get_processing_word(lowercase=True)
25 | 
26 |     # Generators
27 |     dev   = CoNLLDataset(config.filename_dev, processing_word)
28 |     test  = CoNLLDataset(config.filename_test, processing_word)
29 |     train = CoNLLDataset(config.filename_train, processing_word)
30 | 
31 |     # Build Word and Tag vocab
32 |     vocab_words, vocab_tags = get_vocabs([train, dev, test])
33 |     vocab_glove = get_glove_vocab(config.filename_glove)
34 | 
35 |     vocab = vocab_words & vocab_glove  # intersection with the GloVe vocab
36 |     vocab.add(UNK)
37 |     vocab.add(NUM)
38 | 
39 |     # Save vocab
40 |     write_vocab(vocab, config.filename_words)
41 |     write_vocab(vocab_tags, config.filename_tags)
42 | 
43 |     # Trim GloVe Vectors (the vocab is re-read from the file just written)
44 |     vocab = load_vocab(config.filename_words)
45 |     export_trimmed_glove_vectors(vocab, config.filename_glove,
46 |                                 config.filename_trimmed, config.dim_word)
47 | 
48 |     # Build and save char vocab
49 |     train = CoNLLDataset(config.filename_train)  # no processing_word here
50 |     vocab_chars = get_char_vocab(train)
51 |     write_vocab(vocab_chars, config.filename_chars)
52 | 
53 | 
54 | if __name__ == "__main__":  # run only when executed as a script
55 |     main()
56 | 


--------------------------------------------------------------------------------
/data/test.txt:
--------------------------------------------------------------------------------
  1 | Jean B-PER
  2 | Pierre I-PER
  3 | lives O
  4 | in O
  5 | New B-LOC
  6 | York I-LOC
  7 | . O
  8 | 
  9 | The O
 10 | European B-ORG
 11 | Union I-ORG
 12 | is O
 13 | a O
 14 | political O
 15 | and O
 16 | economic O
 17 | union O
 18 | 
 19 | A O
 20 | French B-MISC
 21 | American I-MISC
 22 | actor O
 23 | won O
 24 | an O
 25 | oscar O
 26 | 
 27 | Jean B-PER
 28 | Pierre I-PER
 29 | lives O
 30 | in O
 31 | New B-LOC
 32 | York I-LOC
 33 | . O
 34 | 
 35 | The O
 36 | European B-ORG
 37 | Union I-ORG
 38 | is O
 39 | a O
 40 | political O
 41 | and O
 42 | economic O
 43 | union O
 44 | 
 45 | A O
 46 | French B-MISC
 47 | American I-MISC
 48 | actor O
 49 | won O
 50 | an O
 51 | oscar O
 52 | 
 53 | Jean B-PER
 54 | Pierre I-PER
 55 | lives O
 56 | in O
 57 | New B-LOC
 58 | York I-LOC
 59 | . O
 60 | 
 61 | The O
 62 | European B-ORG
 63 | Union I-ORG
 64 | is O
 65 | a O
 66 | political O
 67 | and O
 68 | economic O
 69 | union O
 70 | 
 71 | A O
 72 | French B-MISC
 73 | American I-MISC
 74 | actor O
 75 | won O
 76 | an O
 77 | oscar O
 78 | 
 79 | Jean B-PER
 80 | Pierre I-PER
 81 | lives O
 82 | in O
 83 | New B-LOC
 84 | York I-LOC
 85 | . O
 86 | 
 87 | The O
 88 | European B-ORG
 89 | Union I-ORG
 90 | is O
 91 | a O
 92 | political O
 93 | and O
 94 | economic O
 95 | union O
 96 | 
 97 | A O
 98 | French B-MISC
 99 | American I-MISC
100 | actor O
101 | won O
102 | an O
103 | oscar O
104 | 


--------------------------------------------------------------------------------
/evaluate.py:
--------------------------------------------------------------------------------
 1 | from model.data_utils import CoNLLDataset
 2 | from model.ner_model import NERModel
 3 | from model.config import Config
 4 | 
 5 | 
 6 | def align_data(data):
 7 |     """Given dict with lists, creates aligned strings
 8 | 
 9 |     Adapted from Assignment 3 of CS224N
10 | 
11 |     Args:
12 |         data: (dict) data["x"] = ["I", "love", "you"]
13 |               (dict) data["y"] = ["O", "O", "O"]
14 | 
15 |     Returns:
16 |         data_aligned: (dict) data_aligned["x"] = "I love you "
17 |                              data_aligned["y"] = "O O    O   "
18 | 
19 |     """
20 |     spacings = [max([len(seq[i]) for seq in data.values()])  # column widths
21 |                 for i in range(len(data[list(data.keys())[0]]))]
22 |     data_aligned = dict()
23 | 
24 |     # for each entry, pad every token to its column width plus one space
25 |     for key, seq in data.items():
26 |         str_aligned = ""
27 |         for token, spacing in zip(seq, spacings):
28 |             str_aligned += token + " " * (spacing - len(token) + 1)
29 | 
30 |         data_aligned[key] = str_aligned
31 | 
32 |     return data_aligned
33 | 
34 | 
35 | 
36 | def interactive_shell(model):
37 |     """Creates interactive shell to play with model
38 | 
39 |     Args:
40 |         model: instance of NERModel; model.predict tags each sentence
41 |             and model.logger prints it, until the input is "exit"
42 |     """
43 |     model.logger.info("""
44 | This is an interactive mode.
45 | To exit, enter 'exit'.
46 | You can enter a sentence like
47 | input> I love Paris""")
48 | 
49 |     while True:
50 |         try:
51 |             # for python 2
52 |             sentence = raw_input("input> ")
53 |         except NameError:
54 |             # for python 3 (raw_input does not exist there)
55 |             sentence = input("input> ")
56 | 
57 |         words_raw = sentence.strip().split(" ")  # split on single spaces
58 | 
59 |         if words_raw == ["exit"]:
60 |             break
61 | 
62 |         preds = model.predict(words_raw)
63 |         to_print = align_data({"input": words_raw, "output": preds})
64 | 
65 |         for key, seq in to_print.items():  # one log line per aligned row
66 |             model.logger.info(seq)
67 | 
68 | 
69 | def main():
70 |     # create config; default load=True reads files made by build_data.py
71 |     config = Config()
72 | 
73 |     # build model and reload the weights saved under config.dir_model
74 |     model = NERModel(config)
75 |     model.build()
76 |     model.restore_session(config.dir_model)
77 | 
78 |     # create dataset
79 |     test  = CoNLLDataset(config.filename_test, config.processing_word,
80 |                          config.processing_tag, config.max_iter)
81 | 
82 |     # log the test metrics, then start the interactive shell
83 |     model.evaluate(test)
84 |     interactive_shell(model)
85 | 
86 | 
87 | if __name__ == "__main__":  # run only when executed as a script
88 |     main()
89 | 


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
 1 | glove:
 2 | 	wget -P ./data/ "http://nlp.stanford.edu/data/glove.6B.zip"
 3 | 	unzip ./data/glove.6B.zip -d data/glove.6B/
 4 | 	rm ./data/glove.6B.zip
 5 | 
 6 | run:
 7 | 	python build_data.py
 8 | 	python train.py
 9 | 	python evaluate.py
10 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/guillaumegenthial/sequence_tagging/5cb9890e7c52e35a1239378da43a8de7be508003/model/__init__.py


--------------------------------------------------------------------------------
/model/base_model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tensorflow as tf
  3 | 
  4 | 
  5 | class BaseModel(object):
  6 |     """Generic class for general methods that are not specific to NER"""
  7 | 
  8 |     def __init__(self, config):
  9 |         """Stores config and its logger; session and saver start as None
 10 | 
 11 |         Args:
 12 |             config: (Config instance) class with hyper parameters,
 13 |                 vocab and embeddings
 14 | 
 15 |         """
 16 |         self.config = config
 17 |         self.logger = config.logger
 18 |         self.sess   = None  # set by initialize_session
 19 |         self.saver  = None  # set by initialize_session
 20 | 
 21 | 
 22 |     def reinitialize_weights(self, scope_name):
 23 |         """Re-runs the initializers of the variables under scope_name"""
 24 |         variables = tf.contrib.framework.get_variables(scope_name)
 25 |         init = tf.variables_initializer(variables)
 26 |         self.sess.run(init)
 27 | 
 28 | 
 29 |     def add_train_op(self, lr_method, lr, loss, clip=-1):
 30 |         """Defines self.train_op that performs an update on a batch
 31 | 
 32 |         Args:
 33 |             lr_method: (string) "adam", "adagrad", "sgd" or "rmsprop" (case
 34 |                 is ignored); anything else raises NotImplementedError
 35 |             lr: (tf.placeholder) tf.float32, learning rate
 36 |             loss: (tensor) tf.float32 loss to minimize
 37 |             clip: (python float) global-norm clip; if <= 0, no clipping
 38 |         """
 39 |         _lr_m = lr_method.lower() # lower to make sure
 40 | 
 41 |         with tf.variable_scope("train_step"):
 42 |             if _lr_m == 'adam': # sgd method
 43 |                 optimizer = tf.train.AdamOptimizer(lr)
 44 |             elif _lr_m == 'adagrad':
 45 |                 optimizer = tf.train.AdagradOptimizer(lr)
 46 |             elif _lr_m == 'sgd':
 47 |                 optimizer = tf.train.GradientDescentOptimizer(lr)
 48 |             elif _lr_m == 'rmsprop':
 49 |                 optimizer = tf.train.RMSPropOptimizer(lr)
 50 |             else:
 51 |                 raise NotImplementedError("Unknown method {}".format(_lr_m))
 52 | 
 53 |             if clip > 0: # gradient clipping if clip is positive
 54 |                 grads, vs     = zip(*optimizer.compute_gradients(loss))
 55 |                 grads, gnorm  = tf.clip_by_global_norm(grads, clip)
 56 |                 self.train_op = optimizer.apply_gradients(zip(grads, vs))
 57 |             else:
 58 |                 self.train_op = optimizer.minimize(loss)
 59 | 
 60 | 
 61 |     def initialize_session(self):
 62 |         """Defines self.sess and self.saver and initializes the variables"""
 63 |         self.logger.info("Initializing tf session")
 64 |         self.sess = tf.Session()
 65 |         self.sess.run(tf.global_variables_initializer())
 66 |         self.saver = tf.train.Saver()
 67 | 
 68 | 
 69 |     def restore_session(self, dir_model):
 70 |         """Reload weights into session
 71 | 
 72 |         Args:
 73 |             dir_model: path given to self.saver.restore; self.saver and
 74 |                 self.sess are the ones created by initialize_session
 75 | 
 76 |         """
 77 |         self.logger.info("Reloading the latest trained model...")
 78 |         self.saver.restore(self.sess, dir_model)
 79 | 
 80 | 
 81 |     def save_session(self):
 82 |         """Saves weights to config.dir_model (directory created if missing)"""
 83 |         if not os.path.exists(self.config.dir_model):
 84 |             os.makedirs(self.config.dir_model)
 85 |         self.saver.save(self.sess, self.config.dir_model)
 86 | 
 87 | 
 88 |     def close_session(self):
 89 |         """Closes the session"""
 90 |         self.sess.close()
 91 | 
 92 | 
 93 |     def add_summary(self):
 94 |         """Defines self.merged and self.file_writer for Tensorboard
 95 | 
 96 |         Takes no arguments: summaries are written to
 97 |         self.config.dir_output, using the graph of self.sess.
 98 | 
 99 |         """
100 |         self.merged      = tf.summary.merge_all()
101 |         self.file_writer = tf.summary.FileWriter(self.config.dir_output,
102 |                 self.sess.graph)
103 | 
104 | 
105 |     def train(self, train, dev):
106 |         """Performs training with early stopping and lr exponential decay
107 | 
108 |         Args:
109 |             train: dataset that yields tuple of (sentences, tags)
110 |             dev: dataset; both are handed to self.run_epoch, which this
111 |                 class does not define and whose return value is the score
112 |         """
113 |         best_score = 0
114 |         nepoch_no_imprv = 0 # for early stopping
115 |         self.add_summary() # tensorboard
116 | 
117 |         for epoch in range(self.config.nepochs):
118 |             self.logger.info("Epoch {:} out of {:}".format(epoch + 1,
119 |                         self.config.nepochs))
120 | 
121 |             score = self.run_epoch(train, dev, epoch)
122 |             self.config.lr *= self.config.lr_decay # decay learning rate
123 | 
124 |             # early stopping and saving best parameters
125 |             if score >= best_score:  # a tie also counts as a new best
126 |                 nepoch_no_imprv = 0
127 |                 self.save_session()
128 |                 best_score = score
129 |                 self.logger.info("- new best score!")
130 |             else:
131 |                 nepoch_no_imprv += 1
132 |                 if nepoch_no_imprv >= self.config.nepoch_no_imprv:
133 |                     self.logger.info("- early stopping {} epochs without "\
134 |                             "improvement".format(nepoch_no_imprv))
135 |                     break
136 | 
137 | 
138 |     def evaluate(self, test):
139 |         """Evaluate model on test set
140 | 
141 |         Args:
142 |             test: dataset handed to self.run_evaluate (not defined in this
143 |                 class), whose metrics dict is logged as "name value" pairs
144 |         """
145 |         self.logger.info("Testing model over test set")
146 |         metrics = self.run_evaluate(test)
147 |         msg = " - ".join(["{} {:04.2f}".format(k, v)
148 |                 for k, v in metrics.items()])
149 |         self.logger.info(msg)
150 | 


--------------------------------------------------------------------------------
/model/config.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | 
  4 | from .general_utils import get_logger
  5 | from .data_utils import get_trimmed_glove_vectors, load_vocab, \
  6 |         get_processing_word
  7 | 
  8 | 
  9 | class Config():
 10 |     def __init__(self, load=True):
 11 |         """Initialize hyperparameters and load vocabs
 12 | 
 13 |         Args:
 14 |             load: (bool) if True (default), call self.load() to read the
 15 |                 vocabs, processing functions and embeddings
 16 | 
 17 |         """
 18 |         # directory for training outputs (dir_output is a class attribute)
 19 |         if not os.path.exists(self.dir_output):
 20 |             os.makedirs(self.dir_output)
 21 | 
 22 |         # create instance of logger
 23 |         self.logger = get_logger(self.path_log)
 24 | 
 25 |         # load if requested (default)
 26 |         if load:
 27 |             self.load()
 28 | 
 29 | 
 30 |     def load(self):
 31 |         """Loads vocabulary, processing functions and embeddings
 32 | 
 33 |         Supposes that build_data.py has been run successfully and that
 34 |         the corresponding files have been created (vocab and trimmed GloVe
 35 |         vectors)
 36 | 
 37 |         """
 38 |         # 1. vocabulary
 39 |         self.vocab_words = load_vocab(self.filename_words)
 40 |         self.vocab_tags  = load_vocab(self.filename_tags)
 41 |         self.vocab_chars = load_vocab(self.filename_chars)
 42 | 
 43 |         self.nwords     = len(self.vocab_words)
 44 |         self.nchars     = len(self.vocab_chars)
 45 |         self.ntags      = len(self.vocab_tags)
 46 | 
 47 |         # 2. get processing functions that map str -> id
 48 |         self.processing_word = get_processing_word(self.vocab_words,
 49 |                 self.vocab_chars, lowercase=True, chars=self.use_chars)
 50 |         self.processing_tag  = get_processing_word(self.vocab_tags,
 51 |                 lowercase=False, allow_unk=False)
 52 | 
 53 |         # 3. get pre-trained embeddings (None when use_pretrained is False)
 54 |         self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
 55 |                 if self.use_pretrained else None)
 56 | 
 57 | 
 58 |     # general config
 59 |     dir_output = "results/test/"
 60 |     dir_model  = dir_output + "model.weights/"
 61 |     path_log   = dir_output + "log.txt"
 62 | 
 63 |     # embeddings
 64 |     dim_word = 300
 65 |     dim_char = 100
 66 | 
 67 |     # glove files
 68 |     filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(dim_word)
 69 |     # trimmed embeddings (created from filename_glove with build_data.py)
 70 |     filename_trimmed = "data/glove.6B.{}d.trimmed.npz".format(dim_word)
 71 |     use_pretrained = True
 72 | 
 73 |     # dataset
 74 |     # filename_dev = "data/coNLL/eng/eng.testa.iob"
 75 |     # filename_test = "data/coNLL/eng/eng.testb.iob"
 76 |     # filename_train = "data/coNLL/eng/eng.train.iob"
 77 | 
 78 |     filename_dev = filename_test = filename_train = "data/test.txt" # test
 79 | 
 80 |     max_iter = None # if not None, max number of examples in Dataset
 81 | 
 82 |     # vocab (created from dataset with build_data.py)
 83 |     filename_words = "data/words.txt"
 84 |     filename_tags = "data/tags.txt"
 85 |     filename_chars = "data/chars.txt"
 86 | 
 87 |     # training
 88 |     train_embeddings = False
 89 |     nepochs          = 15
 90 |     dropout          = 0.5
 91 |     batch_size       = 20
 92 |     lr_method        = "adam"
 93 |     lr               = 0.001
 94 |     lr_decay         = 0.9
 95 |     clip             = -1 # if negative, no clipping
 96 |     nepoch_no_imprv  = 3
 97 | 
 98 |     # model hyperparameters
 99 |     hidden_size_char = 100 # lstm on chars
100 |     hidden_size_lstm = 300 # lstm on word embeddings
101 | 
102 |     # NOTE: if both chars and crf, only 1.6x slower on GPU
103 |     use_crf = True # if crf, training is 1.7x slower on CPU
104 |     use_chars = True # if char embedding, training is 3.5x slower on CPU
105 | 


--------------------------------------------------------------------------------
/model/data_utils.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | 
  4 | 
# shared global variables to be imported from model also
UNK = "$UNK$"  # vocab key looked up for words that are not in the vocabulary
NUM = "$NUM$"  # token substituted for words made only of digits
NONE = "O"  # tag of tokens that do not belong to any chunk
  9 | 
 10 | 
 11 | # special error message
 12 | class MyIOError(Exception):
 13 |     def __init__(self, filename):
 14 |         # custom error message
 15 |         message = """
 16 | ERROR: Unable to locate file {}.
 17 | 
 18 | FIX: Have you tried running python build_data.py first?
 19 | This will build vocab file from your train, test and dev sets and
 20 | trimm your word vectors.
 21 | """.format(filename)
 22 |         super(MyIOError, self).__init__(message)
 23 | 
 24 | 
 25 | class CoNLLDataset(object):
 26 |     """Class that iterates over CoNLL Dataset
 27 | 
 28 |     __iter__ method yields a tuple (words, tags)
 29 |         words: list of raw words
 30 |         tags: list of raw tags
 31 | 
 32 |     If processing_word and processing_tag are not None,
 33 |     optional preprocessing is appplied
 34 | 
 35 |     Example:
 36 |         ```python
 37 |         data = CoNLLDataset(filename)
 38 |         for sentence, tags in data:
 39 |             pass
 40 |         ```
 41 | 
 42 |     """
 43 |     def __init__(self, filename, processing_word=None, processing_tag=None,
 44 |                  max_iter=None):
 45 |         """
 46 |         Args:
 47 |             filename: path to the file
 48 |             processing_words: (optional) function that takes a word as input
 49 |             processing_tags: (optional) function that takes a tag as input
 50 |             max_iter: (optional) max number of sentences to yield
 51 | 
 52 |         """
 53 |         self.filename = filename
 54 |         self.processing_word = processing_word
 55 |         self.processing_tag = processing_tag
 56 |         self.max_iter = max_iter
 57 |         self.length = None
 58 | 
 59 | 
 60 |     def __iter__(self):
 61 |         niter = 0
 62 |         with open(self.filename) as f:
 63 |             words, tags = [], []
 64 |             for line in f:
 65 |                 line = line.strip()
 66 |                 if (len(line) == 0 or line.startswith("-DOCSTART-")):
 67 |                     if len(words) != 0:
 68 |                         niter += 1
 69 |                         if self.max_iter is not None and niter > self.max_iter:
 70 |                             break
 71 |                         yield words, tags
 72 |                         words, tags = [], []
 73 |                 else:
 74 |                     ls = line.split(' ')
 75 |                     word, tag = ls[0],ls[1]
 76 |                     if self.processing_word is not None:
 77 |                         word = self.processing_word(word)
 78 |                     if self.processing_tag is not None:
 79 |                         tag = self.processing_tag(tag)
 80 |                     words += [word]
 81 |                     tags += [tag]
 82 | 
 83 | 
 84 |     def __len__(self):
 85 |         """Iterates once over the corpus to set and store length"""
 86 |         if self.length is None:
 87 |             self.length = 0
 88 |             for _ in self:
 89 |                 self.length += 1
 90 | 
 91 |         return self.length
 92 | 
 93 | 
 94 | def get_vocabs(datasets):
 95 |     """Build vocabulary from an iterable of datasets objects
 96 | 
 97 |     Args:
 98 |         datasets: a list of dataset objects
 99 | 
100 |     Returns:
101 |         a set of all the words in the dataset
102 | 
103 |     """
104 |     print("Building vocab...")
105 |     vocab_words = set()
106 |     vocab_tags = set()
107 |     for dataset in datasets:
108 |         for words, tags in dataset:
109 |             vocab_words.update(words)
110 |             vocab_tags.update(tags)
111 |     print("- done. {} tokens".format(len(vocab_words)))
112 |     return vocab_words, vocab_tags
113 | 
114 | 
def get_char_vocab(dataset):
    """Collect every character used by the words of a dataset

    Args:
        dataset: an iterable yielding tuples (sentence, tags), where
            sentence is an iterable of words

    Returns:
        a set of all the characters in the dataset

    """
    return {char
            for words, _ in dataset
            for word in words
            for char in word}
131 | 
132 | 
def get_glove_vocab(filename):
    """Read the words of a GloVe vectors file

    The word is the first space-separated field of each stripped line.

    Args:
        filename: path to the glove vectors

    Returns:
        vocab: set() of strings
    """
    print("Building vocab...")
    with open(filename) as glove_file:
        tokens = {row.strip().split(' ')[0] for row in glove_file}
    print("- done. {} tokens".format(len(tokens)))
    return tokens
150 | 
151 | 
def write_vocab(vocab, filename):
    """Writes a vocab to a file

    One word per line; the last word is written without a trailing
    newline.

    Args:
        vocab: sized iterable that yields words (len(vocab) is used)
        filename: path to vocab file

    Returns:
        None, the words are written to filename

    """
    print("Writing vocab...")
    with open(filename, "w") as out:
        last = len(vocab) - 1
        for position, token in enumerate(vocab):
            # every entry but the last one is newline-terminated
            out.write(token if position == last else "{}\n".format(token))
    print("- done. {} tokens".format(len(vocab)))
173 | 
174 | 
def load_vocab(filename):
    """Loads vocab from a file

    Each line is stripped and mapped to its 0-based line number; when a
    word appears several times the last line number is kept.

    Args:
        filename: (string) the format of the file must be one word per line.

    Returns:
        d: dict[word] = index

    Raises:
        MyIOError: if the file cannot be opened or read

    """
    try:
        with open(filename) as vocab_file:
            mapping = {entry.strip(): position
                       for position, entry in enumerate(vocab_file)}
    except IOError:
        raise MyIOError(filename)
    return mapping
195 | 
196 | 
def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    """Saves glove vectors in numpy array

    Only the vectors of words present in vocab are kept; rows of words
    that never appear in the glove file stay at zero.

    Args:
        vocab: dictionary vocab[word] = index
        glove_filename: a path to a glove file
        trimmed_filename: a path where to store a matrix in npz
        dim: (int) dimension of embeddings

    """
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            fields = line.strip().split(' ')
            word = fields[0]
            # parse the floats only for words we keep: most glove lines are
            # discarded, so converting them first is wasted work
            if word in vocab:
                embedding = [float(x) for x in fields[1:]]
                embeddings[vocab[word]] = np.asarray(embedding)

    np.savez_compressed(trimmed_filename, embeddings=embeddings)
218 | 
219 | 
def get_trimmed_glove_vectors(filename):
    """Read the "embeddings" array stored in an npz file

    Args:
        filename: path to the npz file

    Returns:
        matrix of embeddings (np array)

    Raises:
        MyIOError: if the file cannot be opened or read

    """
    try:
        with np.load(filename) as archive:
            matrix = archive["embeddings"]
    except IOError:
        raise MyIOError(filename)

    return matrix
235 | 
236 | 
def get_processing_word(vocab_words=None, vocab_chars=None,
                    lowercase=False, chars=False, allow_unk=True):
    """Build a function that maps a word (string) to its id, or to a tuple
    (list of char ids, word id) when character ids are requested.

    Args:
        vocab_words: dict[word] = idx, or None to leave the word as a string
        vocab_chars: dict[char] = idx, or None
        lowercase: if True, the word is lowercased before lookup
        chars: if True and vocab_chars is given, char ids are returned too
        allow_unk: if True, words missing from vocab_words get the id of
            UNK; otherwise an Exception is raised

    Returns:
        f("cat") = ([12, 4, 32], 12345)
                 = (list of char ids, word id)

    """
    # decided once for the returned function
    with_chars = vocab_chars is not None and chars == True

    def f(word):
        # 0. char ids come from the raw word (before lowercasing);
        #    chars out of vocabulary are ignored
        if with_chars:
            char_ids = [vocab_chars[char] for char in word
                        if char in vocab_chars]

        # 1. preprocess word
        token = word.lower() if lowercase else word
        if token.isdigit():
            token = NUM

        # 2. get id of word
        if vocab_words is not None:
            if token in vocab_words:
                token = vocab_words[token]
            elif allow_unk:
                token = vocab_words[UNK]
            else:
                raise Exception("Unknow key is not allowed. Check that "\
                                "your vocab (tags?) is correct")

        # 3. return tuple char ids, word id
        if with_chars:
            return char_ids, token
        return token

    return f
284 | 
285 | 
286 | def _pad_sequences(sequences, pad_tok, max_length):
287 |     """
288 |     Args:
289 |         sequences: a generator of list or tuple
290 |         pad_tok: the char to pad with
291 | 
292 |     Returns:
293 |         a list of list where each sublist has same length
294 |     """
295 |     sequence_padded, sequence_length = [], []
296 | 
297 |     for seq in sequences:
298 |         seq = list(seq)
299 |         seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0)
300 |         sequence_padded +=  [seq_]
301 |         sequence_length += [min(len(seq), max_length)]
302 | 
303 |     return sequence_padded, sequence_length
304 | 
305 | 
def pad_sequences(sequences, pad_tok, nlevels=1):
    """Pad a batch of sequences to a common length

    Args:
        sequences: a list of lists or tuples; it is iterated several times,
            so it must not be a one-shot generator
        pad_tok: the value to pad with
        nlevels: "depth" of padding, 2 for the case where we have characters
            ids (sentences made of words made of char ids)

    Returns:
        (sequence_padded, sequence_length): the padded batch and the
            original lengths (a list per sentence when nlevels is 2)

    """
    if nlevels == 1:
        longest = max(len(seq) for seq in sequences)
        sequence_padded, sequence_length = _pad_sequences(
                sequences, pad_tok, longest)

    elif nlevels == 2:
        # first make all the words the same length ...
        longest_word = max(max(len(word) for word in seq)
                           for seq in sequences)
        padded_words, word_lengths = [], []
        for seq in sequences:
            sp, sl = _pad_sequences(seq, pad_tok, longest_word)
            padded_words.append(sp)
            word_lengths.append(sl)

        # ... then make all the sentences the same length, padding with
        # fully padded words (and with 0 for the word lengths)
        longest_sentence = max(len(seq) for seq in sequences)
        sequence_padded, _ = _pad_sequences(
                padded_words, [pad_tok]*longest_word, longest_sentence)
        sequence_length, _ = _pad_sequences(
                word_lengths, 0, longest_sentence)

    return sequence_padded, sequence_length
339 | 
340 | 
def minibatches(data, minibatch_size):
    """Group the examples of data into batches

    Args:
        data: iterable of (sentence, tags) tuples
        minibatch_size: (int)

    Yields:
        (x_batch, y_batch): two lists with at most minibatch_size entries;
            a sentence whose first item is a tuple is transposed with zip

    """
    batch_inputs, batch_labels = [], []
    for x, y in data:
        # a full batch is emitted right before the next example is added
        if len(batch_inputs) == minibatch_size:
            yield batch_inputs, batch_labels
            batch_inputs, batch_labels = [], []

        if type(x[0]) is tuple:
            x = zip(*x)
        batch_inputs.append(x)
        batch_labels.append(y)

    # leftover examples form a last, smaller batch
    if batch_inputs:
        yield batch_inputs, batch_labels
364 | 
365 | 
def get_chunk_type(tok, idx_to_tag):
    """Split the tag of a token id into its class and its type

    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}

    Returns:
        tuple: "B", "PER" (first and last '-'-separated parts of the tag)

    """
    parts = idx_to_tag[tok].split('-')
    return parts[0], parts[-1]
380 | 
381 | 
def get_chunks(seq, tags):
    """Given a sequence of tags, group entities and their position

    Args:
        seq: [4, 4, 0, 0, ...] sequence of labels
        tags: dict["O"] = 4, must contain the NONE tag

    Returns:
        list of (chunk_type, chunk_start, chunk_end), chunk_end exclusive

    Example:
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3, "O": 0}
        result = [("PER", 0, 2), ("LOC", 3, 4)]

    """
    outside = tags[NONE]
    idx_to_tag = {index: name for name, index in tags.items()}
    found = []
    current_type, current_start = None, None

    for position, tok in enumerate(seq):
        if tok == outside:
            # an outside token closes the chunk in progress, if any
            if current_type is not None:
                found.append((current_type, current_start, position))
                current_type, current_start = None, None
            continue

        tok_class, tok_type = get_chunk_type(tok, idx_to_tag)
        if current_type is None:
            # start of a chunk
            current_type, current_start = tok_type, position
        elif tok_type != current_type or tok_class == "B":
            # end of a chunk + start of a new one
            found.append((current_type, current_start, position))
            current_type, current_start = tok_type, position

    # a chunk still open at the end runs up to the end of the sequence
    if current_type is not None:
        found.append((current_type, current_start, len(seq)))

    return found
428 | 


--------------------------------------------------------------------------------
/model/general_utils.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import sys
  3 | import logging
  4 | import numpy as np
  5 | 
  6 | 
  7 | def get_logger(filename):
  8 |     """Return a logger instance that writes in filename
  9 | 
 10 |     Args:
 11 |         filename: (string) path to log.txt
 12 | 
 13 |     Returns:
 14 |         logger: (instance of logger)
 15 | 
 16 |     """
 17 |     logger = logging.getLogger('logger')
 18 |     logger.setLevel(logging.DEBUG)
 19 |     logging.basicConfig(format='%(message)s', level=logging.DEBUG)
 20 |     handler = logging.FileHandler(filename)
 21 |     handler.setLevel(logging.DEBUG)
 22 |     handler.setFormatter(logging.Formatter(
 23 |             '%(asctime)s:%(levelname)s: %(message)s'))
 24 |     logging.getLogger().addHandler(handler)
 25 | 
 26 |     return logger
 27 | 
 28 | 
 29 | class Progbar(object):
 30 |     """Progbar class copied from keras (https://github.com/fchollet/keras/)
 31 | 
 32 |     Displays a progress bar.
 33 |     Small edit : added strict arg to update
 34 |     # Arguments
 35 |         target: Total number of steps expected.
 36 |         interval: Minimum visual progress update interval (in seconds).
 37 |     """
 38 | 
 39 |     def __init__(self, target, width=30, verbose=1):
 40 |         self.width = width
 41 |         self.target = target
 42 |         self.sum_values = {}
 43 |         self.unique_values = []
 44 |         self.start = time.time()
 45 |         self.total_width = 0
 46 |         self.seen_so_far = 0
 47 |         self.verbose = verbose
 48 | 
 49 |     def update(self, current, values=[], exact=[], strict=[]):
 50 |         """
 51 |         Updates the progress bar.
 52 |         # Arguments
 53 |             current: Index of current step.
 54 |             values: List of tuples (name, value_for_last_step).
 55 |                 The progress bar will display averages for these values.
 56 |             exact: List of tuples (name, value_for_last_step).
 57 |                 The progress bar will display these values directly.
 58 |         """
 59 | 
 60 |         for k, v in values:
 61 |             if k not in self.sum_values:
 62 |                 self.sum_values[k] = [v * (current - self.seen_so_far),
 63 |                                       current - self.seen_so_far]
 64 |                 self.unique_values.append(k)
 65 |             else:
 66 |                 self.sum_values[k][0] += v * (current - self.seen_so_far)
 67 |                 self.sum_values[k][1] += (current - self.seen_so_far)
 68 |         for k, v in exact:
 69 |             if k not in self.sum_values:
 70 |                 self.unique_values.append(k)
 71 |             self.sum_values[k] = [v, 1]
 72 | 
 73 |         for k, v in strict:
 74 |             if k not in self.sum_values:
 75 |                 self.unique_values.append(k)
 76 |             self.sum_values[k] = v
 77 | 
 78 |         self.seen_so_far = current
 79 | 
 80 |         now = time.time()
 81 |         if self.verbose == 1:
 82 |             prev_total_width = self.total_width
 83 |             sys.stdout.write("\b" * prev_total_width)
 84 |             sys.stdout.write("\r")
 85 | 
 86 |             numdigits = int(np.floor(np.log10(self.target))) + 1
 87 |             barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
 88 |             bar = barstr % (current, self.target)
 89 |             prog = float(current)/self.target
 90 |             prog_width = int(self.width*prog)
 91 |             if prog_width > 0:
 92 |                 bar += ('='*(prog_width-1))
 93 |                 if current < self.target:
 94 |                     bar += '>'
 95 |                 else:
 96 |                     bar += '='
 97 |             bar += ('.'*(self.width-prog_width))
 98 |             bar += ']'
 99 |             sys.stdout.write(bar)
100 |             self.total_width = len(bar)
101 | 
102 |             if current:
103 |                 time_per_unit = (now - self.start) / current
104 |             else:
105 |                 time_per_unit = 0
106 |             eta = time_per_unit*(self.target - current)
107 |             info = ''
108 |             if current < self.target:
109 |                 info += ' - ETA: %ds' % eta
110 |             else:
111 |                 info += ' - %ds' % (now - self.start)
112 |             for k in self.unique_values:
113 |                 if type(self.sum_values[k]) is list:
114 |                     info += ' - %s: %.4f' % (k,
115 |                         self.sum_values[k][0] / max(1, self.sum_values[k][1]))
116 |                 else:
117 |                     info += ' - %s: %s' % (k, self.sum_values[k])
118 | 
119 |             self.total_width += len(info)
120 |             if prev_total_width > self.total_width:
121 |                 info += ((prev_total_width-self.total_width) * " ")
122 | 
123 |             sys.stdout.write(info)
124 |             sys.stdout.flush()
125 | 
126 |             if current >= self.target:
127 |                 sys.stdout.write("\n")
128 | 
129 |         if self.verbose == 2:
130 |             if current >= self.target:
131 |                 info = '%ds' % (now - self.start)
132 |                 for k in self.unique_values:
133 |                     info += ' - %s: %.4f' % (k,
134 |                         self.sum_values[k][0] / max(1, self.sum_values[k][1]))
135 |                 sys.stdout.write(info + "\n")
136 | 
137 |     def add(self, n, values=[]):
138 |         self.update(self.seen_so_far+n, values)
139 | 
140 | 
141 | 


--------------------------------------------------------------------------------
/model/ner_model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import os
  3 | import tensorflow as tf
  4 | 
  5 | 
  6 | from .data_utils import minibatches, pad_sequences, get_chunks
  7 | from .general_utils import Progbar
  8 | from .base_model import BaseModel
  9 | 
 10 | 
 11 | class NERModel(BaseModel):
 12 |     """Specialized class of Model for NER"""
 13 | 
 14 |     def __init__(self, config):
 15 |         super(NERModel, self).__init__(config)
 16 |         self.idx_to_tag = {idx: tag for tag, idx in
 17 |                            self.config.vocab_tags.items()}
 18 | 
 19 | 
    def add_placeholders(self):
        """Define placeholders = entries to computational graph

        Creates the int32 inputs word_ids, sequence_lengths, char_ids,
        word_lengths and labels, and the scalar float32 hyper parameters
        dropout and lr. All non-scalar dimensions are left unspecified
        (None).
        """
        # shape = (batch size, max length of sentence in batch)
        self.word_ids = tf.placeholder(tf.int32, shape=[None, None],
                        name="word_ids")

        # shape = (batch size)
        self.sequence_lengths = tf.placeholder(tf.int32, shape=[None],
                        name="sequence_lengths")

        # shape = (batch size, max length of sentence, max length of word)
        self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None],
                        name="char_ids")

        # shape = (batch_size, max_length of sentence)
        self.word_lengths = tf.placeholder(tf.int32, shape=[None, None],
                        name="word_lengths")

        # shape = (batch size, max length of sentence in batch)
        self.labels = tf.placeholder(tf.int32, shape=[None, None],
                        name="labels")

        # hyper parameters (scalars fed at every run)
        self.dropout = tf.placeholder(dtype=tf.float32, shape=[],
                        name="dropout")
        self.lr = tf.placeholder(dtype=tf.float32, shape=[],
                        name="lr")
 47 | 
 48 | 
 49 |     def get_feed_dict(self, words, labels=None, lr=None, dropout=None):
 50 |         """Given some data, pad it and build a feed dictionary
 51 | 
 52 |         Args:
 53 |             words: list of sentences. A sentence is a list of ids of a list of
 54 |                 words. A word is a list of ids
 55 |             labels: list of ids
 56 |             lr: (float) learning rate
 57 |             dropout: (float) keep prob
 58 | 
 59 |         Returns:
 60 |             dict {placeholder: value}
 61 | 
 62 |         """
 63 |         # perform padding of the given data
 64 |         if self.config.use_chars:
 65 |             char_ids, word_ids = zip(*words)
 66 |             word_ids, sequence_lengths = pad_sequences(word_ids, 0)
 67 |             char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0,
 68 |                 nlevels=2)
 69 |         else:
 70 |             word_ids, sequence_lengths = pad_sequences(words, 0)
 71 | 
 72 |         # build feed dictionary
 73 |         feed = {
 74 |             self.word_ids: word_ids,
 75 |             self.sequence_lengths: sequence_lengths
 76 |         }
 77 | 
 78 |         if self.config.use_chars:
 79 |             feed[self.char_ids] = char_ids
 80 |             feed[self.word_lengths] = word_lengths
 81 | 
 82 |         if labels is not None:
 83 |             labels, _ = pad_sequences(labels, 0)
 84 |             feed[self.labels] = labels
 85 | 
 86 |         if lr is not None:
 87 |             feed[self.lr] = lr
 88 | 
 89 |         if dropout is not None:
 90 |             feed[self.dropout] = dropout
 91 | 
 92 |         return feed, sequence_lengths
 93 | 
 94 | 
    def add_word_embeddings_op(self):
        """Defines self.word_embeddings

        If self.config.embeddings is None, a variable with shape
        (nwords, dim_word) is created with the default initializer.
        Otherwise the variable is initialized with self.config.embeddings
        and is trained only if self.config.train_embeddings is True.

        If self.config.use_chars is True, the final states of a
        bidirectional LSTM run over the characters of each word are
        concatenated to the word vectors. Dropout (keep prob self.dropout)
        is applied to the result.
        """
        with tf.variable_scope("words"):
            if self.config.embeddings is None:
                self.logger.info("WARNING: randomly initializing word vectors")
                _word_embeddings = tf.get_variable(
                        name="_word_embeddings",
                        dtype=tf.float32,
                        shape=[self.config.nwords, self.config.dim_word])
            else:
                _word_embeddings = tf.Variable(
                        self.config.embeddings,
                        name="_word_embeddings",
                        dtype=tf.float32,
                        trainable=self.config.train_embeddings)

            word_embeddings = tf.nn.embedding_lookup(_word_embeddings,
                    self.word_ids, name="word_embeddings")

        with tf.variable_scope("chars"):
            if self.config.use_chars:
                # get char embeddings matrix
                _char_embeddings = tf.get_variable(
                        name="_char_embeddings",
                        dtype=tf.float32,
                        shape=[self.config.nchars, self.config.dim_char])
                char_embeddings = tf.nn.embedding_lookup(_char_embeddings,
                        self.char_ids, name="char_embeddings")

                # put the time dimension on axis=1: batch and sentence axes
                # are merged so that every word becomes one sequence of chars
                s = tf.shape(char_embeddings)
                char_embeddings = tf.reshape(char_embeddings,
                        shape=[s[0]*s[1], s[-2], self.config.dim_char])
                word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]])

                # bi lstm on chars
                cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                        state_is_tuple=True)
                cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char,
                        state_is_tuple=True)
                _output = tf.nn.bidirectional_dynamic_rnn(
                        cell_fw, cell_bw, char_embeddings,
                        sequence_length=word_lengths, dtype=tf.float32)

                # read and concat output: the per-step outputs are dropped,
                # only the second element of each direction's final state
                # tuple is kept
                _, ((_, output_fw), (_, output_bw)) = _output
                output = tf.concat([output_fw, output_bw], axis=-1)

                # shape = (batch size, max sentence length,
                #          2 * char hidden size)
                output = tf.reshape(output,
                        shape=[s[0], s[1], 2*self.config.hidden_size_char])
                word_embeddings = tf.concat([word_embeddings, output], axis=-1)

        self.word_embeddings =  tf.nn.dropout(word_embeddings, self.dropout)
155 | 
156 | 
    def add_logits_op(self):
        """Defines self.logits

        For each word in each sentence of the batch, it corresponds to a vector
        of scores, of dimension equal to the number of tags.

        The scores are a linear projection of the (dropped-out) outputs of a
        bidirectional LSTM run over self.word_embeddings.
        """
        with tf.variable_scope("bi-lstm"):
            cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm)
            # keep the per-step outputs of both directions, drop final states
            (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw, cell_bw, self.word_embeddings,
                    sequence_length=self.sequence_lengths, dtype=tf.float32)
            output = tf.concat([output_fw, output_bw], axis=-1)
            output = tf.nn.dropout(output, self.dropout)

        with tf.variable_scope("proj"):
            W = tf.get_variable("W", dtype=tf.float32,
                    shape=[2*self.config.hidden_size_lstm, self.config.ntags])

            b = tf.get_variable("b", shape=[self.config.ntags],
                    dtype=tf.float32, initializer=tf.zeros_initializer())

            # flatten batch and time axes for the matmul, then restore them
            nsteps = tf.shape(output)[1]
            output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm])
            pred = tf.matmul(output, W) + b
            self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags])
183 | 
184 | 
185 |     def add_pred_op(self):
186 |         """Defines self.labels_pred
187 | 
188 |         This op is defined only in the case where we don't use a CRF since in
189 |         that case we can make the prediction "in the graph" (thanks to tf
190 |         functions in other words). With theCRF, as the inference is coded
191 |         in python and not in pure tensroflow, we have to make the prediciton
192 |         outside the graph.
193 |         """
194 |         if not self.config.use_crf:
195 |             self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1),
196 |                     tf.int32)
197 | 
198 | 
    def add_loss_op(self):
        """Defines the loss

        With the CRF, self.loss is the mean negative log-likelihood of the
        label sequences and self.trans_params is also defined. Otherwise it
        is the mean softmax cross entropy over the positions selected by the
        sequence-length mask.
        """
        if self.config.use_crf:
            log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood(
                    self.logits, self.labels, self.sequence_lengths)
            self.trans_params = trans_params # need to evaluate it for decoding
            self.loss = tf.reduce_mean(-log_likelihood)
        else:
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.logits, labels=self.labels)
            # keep only the losses of positions inside each sequence length
            mask = tf.sequence_mask(self.sequence_lengths)
            losses = tf.boolean_mask(losses, mask)
            self.loss = tf.reduce_mean(losses)

        # for tensorboard
        tf.summary.scalar("loss", self.loss)
215 | 
216 | 
    def build(self):
        """Build the graph, add the training op and initialize the session

        The ops are added in dependency order: placeholders, embeddings,
        logits, prediction, loss.
        """
        # NER specific functions
        self.add_placeholders()
        self.add_word_embeddings_op()
        self.add_logits_op()
        self.add_pred_op()
        self.add_loss_op()

        # Generic functions that add training op and initialize session
        self.add_train_op(self.config.lr_method, self.lr, self.loss,
                self.config.clip)
        self.initialize_session() # now self.sess is defined and vars are init
229 | 
230 | 
231 |     def predict_batch(self, words):
232 |         """
233 |         Args:
234 |             words: list of sentences
235 | 
236 |         Returns:
237 |             labels_pred: list of labels for each sentence
238 |             sequence_length
239 | 
240 |         """
241 |         fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0)
242 | 
243 |         if self.config.use_crf:
244 |             # get tag scores and transition params of CRF
245 |             viterbi_sequences = []
246 |             logits, trans_params = self.sess.run(
247 |                     [self.logits, self.trans_params], feed_dict=fd)
248 | 
249 |             # iterate over the sentences because no batching in vitervi_decode
250 |             for logit, sequence_length in zip(logits, sequence_lengths):
251 |                 logit = logit[:sequence_length] # keep only the valid steps
252 |                 viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode(
253 |                         logit, trans_params)
254 |                 viterbi_sequences += [viterbi_seq]
255 | 
256 |             return viterbi_sequences, sequence_lengths
257 | 
258 |         else:
259 |             labels_pred = self.sess.run(self.labels_pred, feed_dict=fd)
260 | 
261 |             return labels_pred, sequence_lengths
262 | 
263 | 
264 |     def run_epoch(self, train, dev, epoch):
265 |         """Performs one complete pass over the train set and evaluate on dev
266 | 
267 |         Args:
268 |             train: dataset that yields tuple of sentences, tags
269 |             dev: dataset
270 |             epoch: (int) index of the current epoch
271 | 
272 |         Returns:
273 |             f1: (python float), score to select model on, higher is better
274 | 
275 |         """
276 |         # progbar stuff for logging
277 |         batch_size = self.config.batch_size
278 |         nbatches = (len(train) + batch_size - 1) // batch_size
279 |         prog = Progbar(target=nbatches)
280 | 
281 |         # iterate over dataset
282 |         for i, (words, labels) in enumerate(minibatches(train, batch_size)):
283 |             fd, _ = self.get_feed_dict(words, labels, self.config.lr,
284 |                     self.config.dropout)
285 | 
286 |             _, train_loss, summary = self.sess.run(
287 |                     [self.train_op, self.loss, self.merged], feed_dict=fd)
288 | 
289 |             prog.update(i + 1, [("train loss", train_loss)])
290 | 
291 |             # tensorboard
292 |             if i % 10 == 0:
293 |                 self.file_writer.add_summary(summary, epoch*nbatches + i)
294 | 
295 |         metrics = self.run_evaluate(dev)
296 |         msg = " - ".join(["{} {:04.2f}".format(k, v)
297 |                 for k, v in metrics.items()])
298 |         self.logger.info(msg)
299 | 
300 |         return metrics["f1"]
301 | 
302 | 
303 |     def run_evaluate(self, test):
304 |         """Evaluates performance on test set
305 | 
306 |         Args:
307 |             test: dataset that yields tuple of (sentences, tags)
308 | 
309 |         Returns:
310 |             metrics: (dict) metrics["acc"] = 98.4, ...
311 | 
312 |         """
313 |         accs = []
314 |         correct_preds, total_correct, total_preds = 0., 0., 0.
315 |         for words, labels in minibatches(test, self.config.batch_size):
316 |             labels_pred, sequence_lengths = self.predict_batch(words)
317 | 
318 |             for lab, lab_pred, length in zip(labels, labels_pred,
319 |                                              sequence_lengths):
320 |                 lab      = lab[:length]
321 |                 lab_pred = lab_pred[:length]
322 |                 accs    += [a==b for (a, b) in zip(lab, lab_pred)]
323 | 
324 |                 lab_chunks      = set(get_chunks(lab, self.config.vocab_tags))
325 |                 lab_pred_chunks = set(get_chunks(lab_pred,
326 |                                                  self.config.vocab_tags))
327 | 
328 |                 correct_preds += len(lab_chunks & lab_pred_chunks)
329 |                 total_preds   += len(lab_pred_chunks)
330 |                 total_correct += len(lab_chunks)
331 | 
332 |         p   = correct_preds / total_preds if correct_preds > 0 else 0
333 |         r   = correct_preds / total_correct if correct_preds > 0 else 0
334 |         f1  = 2 * p * r / (p + r) if correct_preds > 0 else 0
335 |         acc = np.mean(accs)
336 | 
337 |         return {"acc": 100*acc, "f1": 100*f1}
338 | 
339 | 
340 |     def predict(self, words_raw):
341 |         """Returns list of tags
342 | 
343 |         Args:
344 |             words_raw: list of words (string), just one sentence (no batch)
345 | 
346 |         Returns:
347 |             preds: list of tags (string), one for each word in the sentence
348 | 
349 |         """
350 |         words = [self.config.processing_word(w) for w in words_raw]
351 |         if type(words[0]) == tuple:
352 |             words = zip(*words)
353 |         pred_ids, _ = self.predict_batch([words])
354 |         preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])]
355 | 
356 |         return preds
357 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=1.0
2 | numpy
3 | logging
4 | 


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
 1 | from model.data_utils import CoNLLDataset
 2 | from model.ner_model import NERModel
 3 | from model.config import Config
 4 | 
 5 | 
 6 | def main():
 7 |     # create instance of config
 8 |     config = Config()
 9 | 
10 |     # build model
11 |     model = NERModel(config)
12 |     model.build()
13 |     # model.restore_session("results/crf/model.weights/") # optional, restore weights
14 |     # model.reinitialize_weights("proj")
15 | 
16 |     # create datasets
17 |     dev   = CoNLLDataset(config.filename_dev, config.processing_word,
18 |                          config.processing_tag, config.max_iter)
19 |     train = CoNLLDataset(config.filename_train, config.processing_word,
20 |                          config.processing_tag, config.max_iter)
21 | 
22 |     # train model
23 |     model.train(train, dev)
24 | 
25 | if __name__ == "__main__":
26 |     main()
27 | 


--------------------------------------------------------------------------------