├── .gitignore ├── LICENSE ├── README.md ├── data ├── cdr_corpus.rar └── chemdner_corpus.rar ├── models ├── chem_Att-BiLSTM-CRF_word_char.rar ├── chemdner_word2vec.rar └── chemner_BiLSTM-CRF_word_char.rar └── src ├── AttenTrain.py ├── Atten_tagger.py ├── activations.py ├── backend ├── __init__.py ├── common.py └── theano_backend.py ├── evaluation └── conlleval ├── initializations.py ├── loader.py ├── model.py ├── nn.py ├── optimization.py ├── tagger.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Att-ChemdNER 2 | *** 3 | This repo contains the source code and dataset for the following paper: 4 | - [Ling Luo, Zhihao Yang, Pei Yang, Yin Zhang, Lei Wang, Hongfei Lin, Jian Wang. An Attention-based BiLSTM-CRF Approach to Document-level Chemical Named Entity Recognition. Bioinformatics, 2017.](https://academic.oup.com/bioinformatics/advance-article-abstract/doi/10.1093/bioinformatics/btx761/4657076?redirectedFrom=fulltext) 5 | ## Dependency packages 6 | 7 | Att-ChemdNER uses the following dependencies: 8 | 9 | - [Python 2.7](https://www.python.org/) 10 | - [Theano 0.9.0](http://www.deeplearning.net/software/theano/) 11 | - [numpy 1.12.1](http://www.numpy.org/) 12 | 13 | 14 | ## Content 15 | - data 16 | - CHEMDNER corpus 17 | - CDR corpus 18 | - models 19 | - The basic BiLSTM-CRF model 20 | - The Att-BiLSTM-CRF model 21 | - The 50-dimensional word embedding 22 | - src 23 | - backend: Theano backend functions 24 | - evaluation: evaluate the results of the NER task 25 | - activations.py: activation functions 26 | - initializations.py: weight initialization functions 27 | - loader.py: load the data set 28 | - model.py: build the model 29 | - nn.py: the layers of the network architecture 30 | - optimization.py: optimization methods 31 | - utils.py: utility functions 32 | - train.py: train a basic BiLSTM-CRF model 33 | - AttenTrain.py: train an Att-BiLSTM-CRF model 34 | - tagger.py: tag the documents using the BiLSTM-CRF model 35 | - Atten_tagger.py: tag the documents using the Att-BiLSTM-CRF model 36 | 37 | ## Train a basic BiLSTM-CRF model 38 | To train a basic BiLSTM-CRF model, you need to provide the files of the training set, development set, and testing set, together with the word embedding model, and run the train.py script: 39 | 40 | ``` 41 | python train.py --train trainfile --dev devfile --test testfile --pre_emb word_embedding.model 42 | ``` 43 | ## Train an Att-BiLSTM-CRF model 44 | To train our Att-BiLSTM-CRF model, you need to provide the files of the training set, development set, and testing set, together with the word embedding model, and run the AttenTrain.py script: 45 | 46 | ``` 47 | python AttenTrain.py --train trainfile --dev devfile --test testfile --pre_emb word_embedding.model 48 | ``` 49 | ## Tag the documents using the BiLSTM-CRF model 50 | To recognize the chemical entities in the documents using the pretrained BiLSTM-CRF model, you need to provide the pretrained model, the input file, and the output file: 51 | 52 | ``` 53 | python tagger.py --model BiLSTM-CRF.model --input inputfile --output outputfile 54 | ``` 55 | The input file should contain one tokenized document per line. 56 | 57 | ## Tag the documents using the Att-BiLSTM-CRF model 58 | To recognize the chemical entities in the documents using the pretrained Att-BiLSTM-CRF model, you need to provide the pretrained model, the input file, and the output file: 59 | 60 | ``` 61 | python Atten_tagger.py --model Att-BiLSTM-CRF.model --input inputfile --output outputfile 62 | ``` 63 | 64 | The input file should contain one tokenized document per line.
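For illustration, a minimal sketch of the two file formats is given below. The document text and the Chemical label are only examples (the actual entity labels depend on the tag set of the pretrained model). A hypothetical inputfile holds one whitespace-tokenized document per line:

```
Aspirin inhibits prostaglandin synthesis .
```

The tagger then writes the outputfile with one token and its predicted IOB tag per line, separated by a tab, and a blank line after each document:

```
Aspirin	B-Chemical
inhibits	O
prostaglandin	B-Chemical
synthesis	O
.	O
```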
65 | 66 | 67 | *** 68 | 69 | -------------------------------------------------------------------------------- /data/cdr_corpus.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/data/cdr_corpus.rar -------------------------------------------------------------------------------- /data/chemdner_corpus.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/data/chemdner_corpus.rar -------------------------------------------------------------------------------- /models/chem_Att-BiLSTM-CRF_word_char.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/models/chem_Att-BiLSTM-CRF_word_char.rar -------------------------------------------------------------------------------- /models/chemdner_word2vec.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/models/chemdner_word2vec.rar -------------------------------------------------------------------------------- /models/chemner_BiLSTM-CRF_word_char.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/models/chemner_BiLSTM-CRF_word_char.rar -------------------------------------------------------------------------------- /src/AttenTrain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import numpy as np 5 | SEED=1234; 6 | np.random.seed(1234); 7 | import optparse 8 | import itertools 9 | import time 10 | import subprocess 11 | from collections import OrderedDict 12 | from utils import create_input 13 | import loader 14 | 15 | from utils import models_path, evaluate, eval_script, eval_temp,create_mapping; 16 | from loader import word_mapping, char_mapping, tag_mapping 17 | from loader import update_tag_scheme, prepare_dataset 18 | from loader import augment_with_pretrained,feature_mapping; 19 | from model import Model 20 | from utils import generateDocSentLen; 21 | #import random ; 22 | #for bash color 23 | BASH_RED="\033[0;31m"; 24 | BASH_GREEN="\033[0;32m" 25 | BASH_YELLOW="\033[0;33m" 26 | BASH_CYAN="\033[0;36m" 27 | BASH_CLEAR="\033[0m" 28 | 29 | #prepare for model 30 | #{{{ 31 | # Read parameters from command line 32 | #{{{ 33 | optparser = optparse.OptionParser() 34 | optparser.add_option( 35 | "-T", "--train", default="training.ner.doc.token4.BIO", 36 | help="Train set location" 37 | ) 38 | optparser.add_option( 39 | "-d", "--dev", default="development.ner.doc.token4.BIO", 40 | help="Dev set location" 41 | ) 42 | optparser.add_option( 43 | "-t", "--test", default="evaluation.ner.doc.token4.BIO", 44 | help="Test set location" 45 | ) 46 | optparser.add_option( 47 | "-s", "--tag_scheme", default="iob", 48 | help="Tagging scheme (IOB or IOBES)" 49 | ) 50 | optparser.add_option( 51 | "-l", "--lower", default="0", 52 | type='int', help="Lowercase words (this will not affect character inputs)" 53 | ) 54 | optparser.add_option( 55 | "-z", "--zeros", default="0", 56 | type='int', help="Replace digits with 0" 57 | ) 58 | 
optparser.add_option( 59 | "-c", "--char_dim", default="25", 60 | type='int', help="Char embedding dimension" 61 | ) 62 | optparser.add_option( 63 | "-C", "--char_lstm_dim", default="25", 64 | type='int', help="Char LSTM hidden layer size" 65 | ) 66 | optparser.add_option( 67 | "-b", "--char_bidirect", default="1", 68 | type='int', help="Use a bidirectional LSTM for chars" 69 | ) 70 | optparser.add_option( 71 | "-w", "--word_dim", default="50", 72 | type='int', help="Token embedding dimension" 73 | ) 74 | optparser.add_option( 75 | "-W", "--word_lstm_dim", default="100", 76 | type='int', help="Token LSTM hidden layer size" 77 | ) 78 | optparser.add_option( 79 | "-B", "--word_bidirect", default="1", 80 | type='int', help="Use a bidirectional LSTM for words" 81 | ) 82 | optparser.add_option( 83 | "-p", "--pre_emb", default="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50", 84 | help="Location of pretrained embeddings" 85 | ) 86 | optparser.add_option( 87 | "-A", "--all_emb", default="0", 88 | type='int', help="Load all embeddings" 89 | ) 90 | optparser.add_option( 91 | "-a", "--cap_dim", default="1", 92 | type='int', help="Capitalization feature dimension (0 to disable)" 93 | ) 94 | optparser.add_option( 95 | "-f", "--crf", default="1", 96 | type='int', help="Use CRF (0 to disable)" 97 | ) 98 | optparser.add_option( 99 | "-D", "--dropout", default="0.5", 100 | type='float', help="Droupout on the input (0 = no dropout)" 101 | ) 102 | optparser.add_option( 103 | "-L", "--lr_method", default="sgd-lr_.001", 104 | help="Learning method (SGD, Adadelta, Adam..)" 105 | ) 106 | optparser.add_option( 107 | "-r", "--reload", default="0", 108 | type='int', help="Reload the last saved model" 109 | ) 110 | optparser.add_option( 111 | "-S","--String",default="", 112 | help="some about this model" 113 | ) 114 | opts = optparser.parse_args()[0] 115 | #}}} 116 | 117 | 118 | #according corpus to set some parameter for loading file 119 | CORPUS="chem"; 120 | tagFilter=None; 121 | attenScoreFunTotal=['Euclidean','forwardNN','Cosine','Manhatten']; 122 | attenScoreFun=attenScoreFunTotal[0] 123 | if CORPUS == "chem": 124 | #{{{ 125 | opts.train="./chemdner_corpus/chemdner_training.ner.doc.token4.BIO_allfea"; 126 | opts.dev="./chemdner_corpus/chemdner_development.ner.doc.token4.BIO_allfea"; 127 | opts.test="./chemdner_corpus/chemdner_evaluation.ner.doc.token4.BIO_allfea"; 128 | opts.pre_emb="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50"; 129 | ssplitTrainFName="./chemdner_corpus/training.ner.ssplit.token4.BIO"; 130 | ssplitDevFName="./chemdner_corpus/development.ner.ssplit.token4.BIO"; 131 | ssplitTestFName="./chemdner_corpus/evaluation.ner.ssplit.token4.BIO"; 132 | tagFilter=None; 133 | #}}} 134 | elif CORPUS == "CDR": 135 | #{{{ 136 | opts.train="./cdr_corpus/cdr_training.ner.doc.token4.BIO_allfea_drug"; 137 | opts.dev="./chemdner_corpus/cdr_development.ner.doc.token4.BIO_allfea_drug"; 138 | opts.test="./chemdner_corpus/cdr_test.ner.doc.token4.BIO_allfea_drug"; 139 | opts.pre_emb="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50"; 140 | ssplitTrainFName="./chemdner_corpus/cdr_training.ner.sen.token4.BIO_allfea_drug"; 141 | ssplitDevFName="./chemdner_corpus/cdr_development.ner.sen.token4.BIO_allfea_drug"; 142 | ssplitTestFName="./chemdner_corpus/cdr_dtest.ner.sen.token4.BIO_allfea_drug"; 143 | tagFilter=['Disease']; 144 | #}}} 145 | 146 | else: 147 | assert 0,"unknown corpus"; 148 | 149 | #read word_dim from word2vec_model 150 | #{{{ 151 | with open(opts.pre_emb) as 
file: 152 | first_line = file.readline() 153 | #create vec_table 154 | frequency = int(first_line.split()[0]); 155 | vec_size = int(first_line.split()[1]); 156 | opts.word_dim=vec_size; 157 | opts.word_lstm_dim=vec_size; 158 | #}}} 159 | 160 | # Parse parameters 161 | #{{{ 162 | parameters = OrderedDict() 163 | parameters['tag_scheme'] = opts.tag_scheme 164 | parameters['lower'] = opts.lower == 1 165 | parameters['zeros'] = opts.zeros == 1 166 | parameters['char_dim'] = opts.char_dim 167 | parameters['char_lstm_dim'] = opts.char_lstm_dim 168 | parameters['char_bidirect'] = opts.char_bidirect == 1 169 | parameters['word_dim'] = opts.word_dim 170 | parameters['word_lstm_dim'] = opts.word_lstm_dim 171 | parameters['word_bidirect'] = opts.word_bidirect == 1 172 | parameters['pre_emb'] = opts.pre_emb 173 | parameters['all_emb'] = opts.all_emb == 1 174 | parameters['cap_dim'] = opts.cap_dim 175 | parameters['crf'] = opts.crf == 1 176 | parameters['dropout'] = opts.dropout 177 | parameters['lr_method'] = opts.lr_method 178 | #}}} 179 | 180 | # Check parameters validity 181 | #{{{ 182 | assert os.path.isfile(opts.train) 183 | assert os.path.isfile(opts.dev) 184 | assert os.path.isfile(opts.test) 185 | assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0 186 | assert 0. <= parameters['dropout'] < 1.0 187 | assert parameters['tag_scheme'] in ['iob', 'iobes'] 188 | assert not parameters['all_emb'] or parameters['pre_emb'] 189 | assert not parameters['pre_emb'] or parameters['word_dim'] > 0 190 | assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb']) 191 | #}}} 192 | # Check evaluation script / folders 193 | if not os.path.isfile(eval_script): 194 | raise Exception('CoNLL evaluation script not found at "%s"' % eval_script) 195 | if not os.path.exists(eval_temp): 196 | os.makedirs(eval_temp) 197 | if not os.path.exists(models_path): 198 | os.makedirs(models_path) 199 | #}}} 200 | #prepare for train 201 | #{{{ 202 | 203 | # Data parameters 204 | lower = parameters['lower'] 205 | zeros = parameters['zeros'] 206 | tag_scheme = parameters['tag_scheme'] 207 | 208 | # Load sentences 209 | train_sentences = loader.load_sentences(opts.train, lower, zeros) 210 | dev_sentences = loader.load_sentences(opts.dev, lower, zeros) 211 | test_sentences = loader.load_sentences(opts.test, lower, zeros) 212 | 213 | #check 1 word sentences 214 | def check1word(sentences): 215 | Lens=[]; 216 | for elem in sentences: 217 | Lens.append(len(elem)); 218 | if min(Lens)==1: 219 | assert 0; 220 | #check1word(train_sentences); 221 | #check1word(dev_sentences); 222 | #check1word(test_sentences); 223 | 224 | #get doc Len for calcuate loss at sentences level 225 | train_Lens=generateDocSentLen(opts.train,ssplitTrainFName); 226 | dev_Lens=generateDocSentLen(opts.dev,ssplitDevFName); 227 | test_Lens=generateDocSentLen(opts.test,ssplitTestFName); 228 | 229 | #merge dev to train 230 | totalSentences=train_sentences+dev_sentences; 231 | totalLens=train_Lens+dev_Lens; 232 | #redefine train and dev 233 | #corpus are already random genergated, so no need to shuffly 234 | #random.seed(SEED); 235 | #random.shuffle(totalSentences); 236 | #random.seed(SEED); 237 | #random.shuffle(totalLens); 238 | devRatio=0.1; 239 | devBoundary=int(len(totalSentences)*(1-devRatio)) 240 | train_sentences=totalSentences[:devBoundary]; 241 | train_Lens=totalLens[:devBoundary]; 242 | dev_sentences=totalSentences[devBoundary:]; 243 | dev_Lens=totalLens[devBoundary:]; 244 | 245 | # Use selected tagging scheme (IOB / IOBES) 246 | 
update_tag_scheme(train_sentences, tag_scheme,tagFilter); 247 | update_tag_scheme(dev_sentences, tag_scheme,tagFilter); 248 | update_tag_scheme(test_sentences, tag_scheme,tagFilter); 249 | 250 | # Create a dictionary / mapping of words 251 | # If we use pretrained embeddings, we add them to the dictionary. 252 | if parameters['pre_emb']: 253 | dico_words_train = word_mapping(train_sentences, lower)[0] 254 | dico_words, word_to_id, id_to_word = augment_with_pretrained( 255 | dico_words_train.copy(), 256 | parameters['pre_emb'], 257 | list(itertools.chain.from_iterable( 258 | [[w[0] for w in s] for s in dev_sentences + test_sentences]) 259 | ) if not parameters['all_emb'] else None 260 | ) 261 | else: 262 | dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower) 263 | dico_words_train = dico_words 264 | 265 | # Create a dictionary and a mapping for words / POS tags / tags 266 | dico_chars, char_to_id, id_to_char = char_mapping(train_sentences) 267 | dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences) 268 | 269 | #feature mapping 270 | #{{{ 271 | featureMap={#{{{ 272 | 'word':{ 273 | 'index':1, 274 | 'isUsed':1, 275 | 'lstm-input':1, 276 | 'attended':1, 277 | }, 278 | 'char':{ 279 | 'index':0, 280 | 'isUsed':0, 281 | 'lstm-input':1, 282 | 'attended':1, 283 | }, 284 | 'lemma':{ 'index':1, 285 | 'isUsed':0, 286 | 'num':0, 287 | 'dim':25, 288 | 'lstm-input':0, 289 | 'attended':0, 290 | 'pre_emb':''}, 291 | 'pos':{ 'index':2, 292 | 'isUsed':0, 293 | 'num':0, 294 | 'dim':50, 295 | 'lstm-input':0, 296 | 'attended':0, 297 | 'pre_emb':''}, 298 | 'chunk':{ 'index':3, 299 | 'isUsed':0, 300 | 'num':0, 301 | 'lstm-input':0, 302 | 'attended':0, 303 | 'dim':10}, 304 | 'dic':{ 'index':4, 305 | 'isUsed':0, 306 | 'num':3, 307 | 'lstm-input':0, 308 | 'attended':0, 309 | 'dim':5}, 310 | }#}}} 311 | def featureMapCheck(featureMap): 312 | for item in featureMap: 313 | assert (not featureMap[item]['isUsed']) or \ 314 | (featureMap[item]['lstm-input'] or featureMap[item]['attended']) 315 | feature2IdMap={'word':word_to_id, 316 | 'char':char_to_id, 317 | 'tag':tag_to_id}; 318 | featureMapCheck(featureMap); 319 | if featureMap['lemma']['isUsed'] : 320 | dico_lemma,lemma_to_id,id_to_lemma=feature_mapping(train_sentences, 321 | featureMap['lemma']['index'],'lemma'); 322 | featureMap['lemma']['num']=len(dico_lemma) 323 | feature2IdMap['lemma']=lemma_to_id; 324 | 325 | if featureMap['pos']['isUsed'] : 326 | dico_pos,pos_to_id,id_to_pos=feature_mapping(train_sentences, 327 | featureMap['pos']['index'],'pos'); 328 | featureMap['pos']['num']=len(dico_pos) 329 | feature2IdMap['pos']=pos_to_id; 330 | if featureMap['chunk']['isUsed']: 331 | dico_chunk,chunk_to_id,id_to_chunk=feature_mapping(train_sentences, 332 | featureMap['chunk']['index'],'chunk'); 333 | featureMap['chunk']['num']=len(dico_chunk) 334 | feature2IdMap['chunk']=chunk_to_id; 335 | 336 | if featureMap['dic']['isUsed']: 337 | dico_NER={'B':0,'I':1,'O':2}; 338 | NER_to_id,id_to_NER=create_mapping(dico_NER); 339 | feature2IdMap['dic']=NER_to_id; 340 | print BASH_YELLOW+str(featureMap)+BASH_CLEAR; 341 | featureMap['feature2IdMap']=feature2IdMap; 342 | parameters['features']=featureMap; 343 | #}}} 344 | 345 | 346 | 347 | # Build the model 348 | parameters['loading']=False; 349 | parameters['loading_path']="./models/bilstm-crf-chemdner50d/"; 350 | parameters['sentencesLevelLoss']=False; 351 | saveModel=False; 352 | parameters['training']=True; 353 | parameters['attenScoreFun']=attenScoreFun; 354 | parameters['useAttend']=True; 355 | 
useEarlyStopping=False; 356 | # Initialize model 357 | model = Model(parameters=parameters, models_path=models_path,model_path="./models/attention_test/",Training=True) 358 | # Save the mappings to disk 359 | print 'Saving the mappings to disk...' 360 | model.save_mappings(id_to_word, id_to_char, id_to_tag) 361 | print BASH_YELLOW+"Model location: "+BASH_CLEAR+ "%s" % model.model_path 362 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 363 | if parameters['loading']: 364 | print BASH_YELLOW+"loading:"+BASH_CLEAR,parameters['loading_path']; 365 | print BASH_YELLOW+'save model:'+BASH_CLEAR,saveModel; 366 | print BASH_YELLOW+"sentences Level Loss:"+BASH_CLEAR,parameters['sentencesLevelLoss']; 367 | 368 | # Index data 369 | train_data = prepare_dataset( 370 | train_sentences,train_Lens, parameters, lower 371 | ) 372 | dev_data = prepare_dataset( 373 | dev_sentences,dev_Lens,parameters, lower 374 | ) 375 | test_data = prepare_dataset( 376 | test_sentences,test_Lens, parameters, lower 377 | ) 378 | 379 | print "%i / %i / %i sentences in train / dev / test." % ( 380 | len(train_data), len(dev_data), len(test_data)) 381 | 382 | #load pre-train word_embending 383 | f_train, f_eval = model.build4(parameters) 384 | 385 | 386 | # Reload previous model values 387 | if opts.reload: 388 | print 'Reloading previous model...' 389 | model.reload() 390 | #}}} 391 | # 392 | # Train network 393 | # 394 | singletons = set([word_to_id[k] for k, v 395 | in dico_words_train.items() if v == 1]) 396 | freq_eval = int(len(train_data)*0.3) # evaluate on dev every freq_eval steps 397 | count = 0 398 | limitPrint=0; 399 | param = { 400 | #'lr':0.005, 401 | 'lr':0.001, 402 | 'verbose':1, 403 | 'decay':True, # decay on the learning rate if improvement stops 404 | 'bs':5, # number of backprop through time steps 405 | 'seed':345, 406 | 'epochs':30, 407 | 'crf':True, 408 | 'shuffle':True}; 409 | folder_out = '../log/Attention/' 410 | print BASH_YELLOW+"folder_out:"+BASH_CLEAR,folder_out; 411 | best_f1=-np.inf; 412 | 413 | def attenVisualFun(words,energy,index): 414 | #{{{ 415 | print "energy should:",energy[index][index],words[index]; 416 | print "filter energy:"; 417 | energyInd=energy[index].argsort()[::-1][:10]; 418 | attenVisual=[]; 419 | for i in energyInd: 420 | attenVisual.append([words[i],energy[index][i]]); 421 | print attenVisual; 422 | 423 | #print energyInd; 424 | #for i in range(len(words)): 425 | # attenVisual.append([words[i],energy[0][i]]); 426 | #print attenVisual; 427 | 428 | return ; 429 | #}}} 430 | 431 | #generate FILE NAME PREFIX 432 | fileNamePrefix=""; 433 | if opts.String != "": 434 | fileNamePrefix=opts.String; 435 | fileNamePrefix.replace(",","_"); 436 | fileNamePrefix.replace(" ","_"); 437 | #train model 438 | if useEarlyStopping: 439 | #{{{ 440 | from utils import EarlyStopping; 441 | eStop=EarlyStopping(mode='max'); 442 | eStop.on_train_begin(); 443 | 444 | #start train our model 445 | for epoch in xrange(param['epochs']): 446 | epoch_costs = [] 447 | startTime=time.time(); 448 | 449 | #decide whether early stop 450 | if eStop.stop_training: 451 | break; 452 | 453 | print "Starting epoch %i..." 
% epoch 454 | for i, index in enumerate(np.random.permutation(len(train_data))): 455 | count += 1 456 | input = create_input(train_data[index], parameters, True, singletons) 457 | new_cost = f_train(*input) 458 | if np.isnan(new_cost): 459 | print index,"nan" 460 | epoch_costs.append(new_cost) 461 | #validation 462 | res_dev = evaluate(parameters, f_eval, dev_sentences, 463 | dev_data, id_to_tag, dico_tags, 464 | folder_out+fileNamePrefix+'.dev.txt') 465 | eStop.on_epoch_end(epoch,res_dev['f1']) ; 466 | print BASH_YELLOW+"avg error:"+BASH_CLEAR,np.mean(epoch_costs),\ 467 | " dev F1:",res_dev['f1']; 468 | print BASH_YELLOW+"One epch espliced:"+BASH_CLEAR,time.time()-startTime; 469 | 470 | #start evaluate on test 471 | res_test = evaluate(parameters, f_eval, test_sentences, 472 | test_data, id_to_tag, dico_tags, 473 | folder_out+fileNamePrefix+'.test.txt') 474 | if saveModel: 475 | print "Saving model to disk..." 476 | model.save() 477 | print BASH_RED+'TEST: epoch'+BASH_CLEAR, epoch, 'F1', res_test['f1'],'p:',res_test['p'],'r:',res_test['r'], ' '*15 478 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 479 | #}}} 480 | else: 481 | for epoch in xrange(param['epochs']): 482 | epoch_costs = [] 483 | startTime=time.time(); 484 | print "Starting epoch %i..." % epoch 485 | for i, index in enumerate(np.random.permutation(len(train_data))): 486 | count += 1 487 | input = create_input(train_data[index], parameters, True, singletons) 488 | new_cost,energy = f_train(*input) 489 | #print attention energy for test 490 | if epoch>=limitPrint and count %freq_eval==0: 491 | attenVisualFun(train_data[index]['str_words'], 492 | energy, 493 | np.random.randint(0,len(train_data[index]))); 494 | if np.isnan(new_cost): 495 | print "NaN,index:",index; 496 | epoch_costs.append(new_cost) 497 | if count % freq_eval == 0 and epoch>=limitPrint: 498 | res_dev = evaluate(parameters, f_eval, dev_sentences, 499 | dev_data, id_to_tag, dico_tags, 500 | folder_out+fileNamePrefix+'.dev.txt') 501 | #new F1 value on dev 502 | if res_dev['f1'] > best_f1: 503 | best_f1 = res_dev['f1'] 504 | if param['verbose']: 505 | print BASH_CYAN+'NEW DEV BEST: epoch'+BASH_CLEAR, epoch, 'best dev F1', res_dev['f1'],'p:',res_dev['p'],'r:',res_dev['r'], ' '*15 506 | 507 | #new F1 value on dev, so evaluate on test 508 | res_test = evaluate(parameters, f_eval, test_sentences, 509 | test_data, id_to_tag, dico_tags, 510 | folder_out+fileNamePrefix+'.test.txt') 511 | if saveModel: 512 | print "Saving model to disk..." 
513 | model.save() 514 | print BASH_RED+'THIS TEST: epoch'+BASH_CLEAR, epoch, 'F1', res_test['f1'],'p:',res_test['p'],'r:',res_test['r'], ' '*15 515 | param['tf1'], param['tp'], param['tr'] = res_test['f1'], res_test['p'], res_test['r'] 516 | param['be'] = epoch 517 | print BASH_YELLOW+"avg error:"+BASH_CLEAR,np.mean(epoch_costs); 518 | print BASH_YELLOW+"One epch espliced:"+BASH_CLEAR,time.time()-startTime; 519 | print BASH_GREEN+'FINAL TEST RESULT: epoch'+BASH_CLEAR, param['be'], 'final test F1', param['tf1'],'best p:',param['tp'],'best r:',param['tr'] 520 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 521 | 522 | 523 | -------------------------------------------------------------------------------- /src/Atten_tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import time 5 | import codecs 6 | import optparse 7 | import numpy as np 8 | from loader import prepare_dataset; 9 | from utils import create_input, iobes_iob; 10 | from model import Model 11 | 12 | optparser = optparse.OptionParser() 13 | optparser.add_option( 14 | "-m", "--model", default="../models/chemAtten_word_char/", 15 | help="Model location" 16 | ) 17 | optparser.add_option( 18 | "-i", "--input", default="../data/chemnder_test.txt", 19 | help="Input file location" 20 | ) 21 | optparser.add_option( 22 | "-o", "--output", default="./chemdner_test.tsv", 23 | help="Output file location" 24 | ) 25 | optparser.add_option( 26 | "-d", "--delimiter", default="__", 27 | help="Delimiter to separate words from their tags" 28 | ) 29 | opts = optparser.parse_args()[0] 30 | 31 | # Check parameters validity 32 | assert opts.delimiter 33 | assert os.path.isdir(opts.model) 34 | assert os.path.isfile(opts.input) 35 | 36 | # Load existing model 37 | print "Loading model..." 
38 | model = Model(model_path=opts.model) 39 | 40 | # Load reverse mappings 41 | word_to_id, char_to_id, tag_to_id = [ 42 | {v: k for k, v in x.items()} 43 | for x in [model.id_to_word, model.id_to_char, model.id_to_tag] 44 | ] 45 | parameters = model.parameters 46 | 47 | # Load the model 48 | _, f_eval = model.build4(parameters) 49 | model.reload() 50 | 51 | #load test sentence 52 | def load_sentences(path): 53 | sentences = [] 54 | for line in codecs.open(path, 'r', 'utf8'): 55 | sentence =[]; 56 | line = line.rstrip() 57 | if line: 58 | word = line.split() 59 | for elem in word: 60 | sentence.append([elem]); 61 | sentences.append(sentence) 62 | return sentences 63 | 64 | opts.train="../data/chemdner_training.ner.doc.token4.BIO_allfea"; 65 | opts.dev="../data/chemdner_development.ner.doc.token4.BIO_allfea"; 66 | opts.test="../data/chemdner_evaluation.ner.doc.token4.BIO_allfea"; 67 | ssplitTrainFName="../data/training.ner.ssplit.token4.BIO"; 68 | ssplitDevFName="../data/development.ner.ssplit.token4.BIO"; 69 | ssplitTestFName="../data/evaluation.ner.ssplit.token4.BIO"; 70 | from utils import generateDocSentLen; 71 | #get doc Len for calcuate loss at sentences level 72 | train_Lens=generateDocSentLen(opts.train,ssplitTrainFName); 73 | dev_Lens=generateDocSentLen(opts.dev,ssplitDevFName); 74 | test_Lens=generateDocSentLen(opts.test,ssplitTestFName); 75 | 76 | test_sentences=load_sentences(opts.input); 77 | test_data=prepare_dataset(test_sentences,test_Lens,parameters,parameters['lower'],isTest=True); 78 | f_output = codecs.open(opts.output, 'w', 'utf-8') 79 | start = time.time() 80 | 81 | def xmlformat(sentence,tags): 82 | #{{{ 83 | assert len(sentence)==len(tags); 84 | res=[]; 85 | preTag=""; 86 | for i in range(len(tags)): 87 | if tags[i][0]=='B': 88 | if len(preTag): 89 | res.append(""); 90 | preTag=""; 91 | res.append("<"+tags[i][2:]+">"); 92 | preTag=tags[i][2:]; 93 | if tags[i][0]=='I': 94 | if preTag!=tags[i][2:]: 95 | if len(preTag): 96 | res.append(""); 97 | preTag=""; 98 | 99 | if tags[i][0]=='O': 100 | if len(preTag): 101 | res.append(""); 102 | preTag=""; 103 | res.append(sentence[i]); 104 | if len(preTag): 105 | res.append(""); 106 | return res; 107 | #}}} 108 | print 'Tagging...' 
109 | for line in test_data: 110 | # Prepare input 111 | input = create_input(line, parameters, False,useAttend=parameters['useAttend']); 112 | words=line['str_words']; 113 | # Decoding 114 | if parameters['crf']: 115 | y_preds = np.array(f_eval(*input)) 116 | else: 117 | y_preds = f_eval(*input).argmax(axis=1) 118 | y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds] 119 | # Output tags in the IOB2 format 120 | if parameters['tag_scheme'] == 'iobes': 121 | y_preds = iobes_iob(y_preds) 122 | # Write tags 123 | assert len(y_preds) == len(words) 124 | for i in range(len(words)): 125 | f_output.write(words[i]+'\t'+y_preds[i]+'\n') 126 | f_output.write('\n') 127 | # for elem in xmlformat(words,y_preds): 128 | # f_output.write(elem+" "); 129 | # f_output.write("\n"); 130 | 131 | print '---- lines tagged in %.4fs ----' % ( time.time() - start) 132 | f_output.close() 133 | -------------------------------------------------------------------------------- /src/activations.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import backend as K 3 | from utils import get_from_module 4 | 5 | 6 | def softmax(x): 7 | ndim = K.ndim(x) 8 | if ndim == 2: 9 | return K.softmax(x) 10 | elif ndim == 3: 11 | e = K.exp(x - K.max(x, axis=-1, keepdims=True)) 12 | s = K.sum(e, axis=-1, keepdims=True) 13 | return e / s 14 | else: 15 | raise ValueError('Cannot apply softmax to a tensor ' 16 | 'that is not 2D or 3D. ' 17 | 'Here, ndim=' + str(ndim)) 18 | 19 | 20 | def elu(x, alpha=1.0): 21 | return K.elu(x, alpha) 22 | 23 | 24 | def softplus(x): 25 | return K.softplus(x) 26 | 27 | 28 | def softsign(x): 29 | return K.softsign(x) 30 | 31 | 32 | def relu(x, alpha=0., max_value=None): 33 | return K.relu(x, alpha=alpha, max_value=max_value) 34 | 35 | 36 | def tanh(x): 37 | return K.tanh(x) 38 | 39 | 40 | def sigmoid(x): 41 | return K.sigmoid(x) 42 | 43 | 44 | def hard_sigmoid(x): 45 | return K.hard_sigmoid(x) 46 | 47 | 48 | def linear(x): 49 | return x 50 | 51 | 52 | def get(identifier): 53 | if identifier is None: 54 | return linear 55 | return get_from_module(identifier, globals(), 'activation function') 56 | -------------------------------------------------------------------------------- /src/backend/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import os 4 | import json 5 | import sys 6 | from .common import epsilon 7 | from .common import floatx 8 | from .common import set_epsilon 9 | from .common import set_floatx 10 | from .common import get_uid 11 | from .common import cast_to_floatx 12 | from .common import image_dim_ordering 13 | from .common import set_image_dim_ordering 14 | from .common import is_keras_tensor 15 | from .common import legacy_weight_ordering 16 | from .common import set_legacy_weight_ordering 17 | 18 | _keras_base_dir = os.path.expanduser('~') 19 | if not os.access(_keras_base_dir, os.W_OK): 20 | _keras_base_dir = '/tmp' 21 | 22 | _keras_dir = os.path.join(_keras_base_dir, '.keras') 23 | if not os.path.exists(_keras_dir): 24 | os.makedirs(_keras_dir) 25 | 26 | # Default backend: TensorFlow. 
27 | _BACKEND = 'tensorflow' 28 | 29 | _config_path = os.path.expanduser(os.path.join(_keras_dir, 'keras.json')) 30 | if os.path.exists(_config_path): 31 | _config = json.load(open(_config_path)) 32 | _floatx = _config.get('floatx', floatx()) 33 | assert _floatx in {'float16', 'float32', 'float64'} 34 | _epsilon = _config.get('epsilon', epsilon()) 35 | assert isinstance(_epsilon, float) 36 | _backend = _config.get('backend', _BACKEND) 37 | assert _backend in {'theano', 'tensorflow'} 38 | _image_dim_ordering = _config.get('image_dim_ordering', 39 | image_dim_ordering()) 40 | assert _image_dim_ordering in {'tf', 'th'} 41 | 42 | set_floatx(_floatx) 43 | set_epsilon(_epsilon) 44 | set_image_dim_ordering(_image_dim_ordering) 45 | _BACKEND = _backend 46 | 47 | # save config file 48 | if not os.path.exists(_config_path): 49 | _config = {'floatx': floatx(), 50 | 'epsilon': epsilon(), 51 | 'backend': _BACKEND, 52 | 'image_dim_ordering': image_dim_ordering()} 53 | with open(_config_path, 'w') as f: 54 | f.write(json.dumps(_config, indent=4)) 55 | 56 | if 'KERAS_BACKEND' in os.environ: 57 | _backend = os.environ['KERAS_BACKEND'] 58 | assert _backend in {'theano', 'tensorflow'} 59 | _BACKEND = _backend 60 | 61 | # import backend 62 | if _BACKEND == 'theano': 63 | sys.stderr.write('Using Theano backend.\n') 64 | from .theano_backend import * 65 | elif _BACKEND == 'tensorflow': 66 | sys.stderr.write('Using TensorFlow backend.\n') 67 | from .tensorflow_backend import * 68 | else: 69 | raise ValueError('Unknown backend: ' + str(_BACKEND)) 70 | 71 | 72 | def backend(): 73 | '''Publicly accessible method 74 | for determining the current backend. 75 | ''' 76 | return _BACKEND 77 | -------------------------------------------------------------------------------- /src/backend/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from collections import defaultdict 4 | 5 | # the type of float to use throughout the session. 6 | _FLOATX = 'float32' 7 | _EPSILON = 10e-8 8 | _UID_PREFIXES = defaultdict(int) 9 | _IMAGE_DIM_ORDERING = 'tf' 10 | _LEGACY_WEIGHT_ORDERING = False 11 | 12 | 13 | def epsilon(): 14 | '''Returns the value of the fuzz 15 | factor used in numeric expressions. 16 | 17 | # Returns 18 | A float. 19 | 20 | # Example 21 | ```python 22 | >>> keras.backend.epsilon() 23 | 1e-08 24 | ``` 25 | ''' 26 | return _EPSILON 27 | 28 | 29 | def set_epsilon(e): 30 | '''Sets the value of the fuzz 31 | factor used in numeric expressions. 32 | 33 | # Arguments 34 | e: float. New value of epsilon. 35 | 36 | # Example 37 | ```python 38 | >>> from keras import backend as K 39 | >>> K.epsilon() 40 | 1e-08 41 | >>> K.set_epsilon(1e-05) 42 | >>> K.epsilon() 43 | 1e-05 44 | ``` 45 | ''' 46 | global _EPSILON 47 | _EPSILON = e 48 | 49 | 50 | def floatx(): 51 | '''Returns the default float type, as a string 52 | (e.g. 'float16', 'float32', 'float64'). 53 | 54 | # Returns 55 | String, the current default float type. 56 | 57 | # Example 58 | ```python 59 | >>> keras.backend.floatx() 60 | 'float32' 61 | ``` 62 | ''' 63 | return _FLOATX 64 | 65 | 66 | def set_floatx(floatx): 67 | '''Sets the default float type. 68 | 69 | # Arguments 70 | String: 'float16', 'float32', or 'float64'. 
71 | 72 | # Example 73 | ```python 74 | >>> from keras import backend as K 75 | >>> K.floatx() 76 | 'float32' 77 | >>> K.set_floatx('float16') 78 | >>> K.floatx() 79 | 'float16' 80 | ``` 81 | ''' 82 | global _FLOATX 83 | if floatx not in {'float16', 'float32', 'float64'}: 84 | raise ValueError('Unknown floatx type: ' + str(floatx)) 85 | _FLOATX = str(floatx) 86 | 87 | 88 | def cast_to_floatx(x): 89 | '''Cast a Numpy array to the default Keras float type. 90 | 91 | # Arguments 92 | x: Numpy array. 93 | 94 | # Returns 95 | The same Numpy array, cast to its new type. 96 | 97 | # Example 98 | ```python 99 | >>> from keras import backend as K 100 | >>> K.floatx() 101 | 'float32' 102 | >>> arr = numpy.array([1.0, 2.0], dtype='float64') 103 | >>> arr.dtype 104 | dtype('float64') 105 | >>> new_arr = K.cast_to_floatx(arr) 106 | >>> new_arr 107 | array([ 1., 2.], dtype=float32) 108 | >>> new_arr.dtype 109 | dtype('float32') 110 | ``` 111 | ''' 112 | return np.asarray(x, dtype=_FLOATX) 113 | 114 | 115 | def image_dim_ordering(): 116 | '''Returns the default image dimension ordering 117 | convention ('th' or 'tf'). 118 | 119 | # Returns 120 | A string, either `'th'` or `'tf'` 121 | 122 | # Example 123 | ```python 124 | >>> keras.backend.image_dim_ordering() 125 | 'th' 126 | ``` 127 | ''' 128 | return _IMAGE_DIM_ORDERING 129 | 130 | 131 | def set_image_dim_ordering(dim_ordering): 132 | '''Sets the value of the image dimension 133 | ordering convention ('th' or 'tf'). 134 | 135 | # Arguments 136 | dim_ordering: string. `'th'` or `'tf'`. 137 | 138 | # Example 139 | ```python 140 | >>> from keras import backend as K 141 | >>> K.image_dim_ordering() 142 | 'th' 143 | >>> K.set_image_dim_ordering('tf') 144 | >>> K.image_dim_ordering() 145 | 'tf' 146 | ``` 147 | ''' 148 | global _IMAGE_DIM_ORDERING 149 | if dim_ordering not in {'tf', 'th'}: 150 | raise ValueError('Unknown dim_ordering:', dim_ordering) 151 | _IMAGE_DIM_ORDERING = str(dim_ordering) 152 | 153 | 154 | def get_uid(prefix=''): 155 | '''Provides a unique UID given a string prefix. 156 | 157 | # Arguments 158 | prefix: string. 159 | 160 | # Returns 161 | An integer. 162 | 163 | # Example 164 | ``` 165 | >>> keras.backend.get_uid('dense') 166 | >>> 1 167 | >>> keras.backend.get_uid('dense') 168 | >>> 2 169 | ``` 170 | 171 | ''' 172 | _UID_PREFIXES[prefix] += 1 173 | return _UID_PREFIXES[prefix] 174 | 175 | 176 | def reset_uids(): 177 | global _UID_PREFIXES 178 | _UID_PREFIXES = defaultdict(int) 179 | 180 | 181 | def is_keras_tensor(x): 182 | '''Returns whether `x` is a Keras tensor. 183 | 184 | # Arguments 185 | x: a potential tensor. 186 | 187 | # Returns 188 | A boolean: whether the argument is a Keras tensor. 189 | 190 | # Examples 191 | ```python 192 | >>> from keras import backend as K 193 | >>> np_var = numpy.array([1, 2]) 194 | >>> K.is_keras_tensor(np_var) 195 | False 196 | >>> keras_var = K.variable(np_var) 197 | >>> K.is_keras_tensor(keras_var) # A variable is not a Tensor. 198 | False 199 | >>> keras_placeholder = K.placeholder(shape=(2, 4, 5)) 200 | >>> K.is_keras_tensor(keras_placeholder) # A placeholder is a Tensor. 
201 | True 202 | ``` 203 | ''' 204 | if hasattr(x, '_keras_shape'): 205 | return True 206 | else: 207 | return False 208 | 209 | 210 | def set_legacy_weight_ordering(value): 211 | global _LEGACY_WEIGHT_ORDERING 212 | assert value in {True, False} 213 | _LEGACY_WEIGHT_ORDERING = value 214 | 215 | 216 | def legacy_weight_ordering(): 217 | return _LEGACY_WEIGHT_ORDERING 218 | -------------------------------------------------------------------------------- /src/evaluation/conlleval: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 
32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = 
$features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 265 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 266 | 267 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 269 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 270 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 271 | 272 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 273 | $chunkEnd = $true; 274 | } 275 | 276 | # corrected 1998-12-22: these chunks are assumed to have length 1 277 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 279 | 280 | return($chunkEnd); 281 | } 282 | 283 | # startOfChunk: checks if a chunk started between the previous and current word 284 | # arguments: previous and current chunk tags, previous and current types 285 | # note: this code is capable of handling other chunk representations 286 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 287 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 288 | 289 | sub startOfChunk { 290 | my $prevTag = shift(@_); 291 | my $tag = shift(@_); 292 | my $prevType = shift(@_); 293 | my $type = shift(@_); 294 | my $chunkStart = $false; 295 | 296 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 297 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 298 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 299 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 300 | 301 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 302 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 303 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 304 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 305 | 306 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 307 | $chunkStart = $true; 308 | } 309 | 310 | # corrected 1998-12-22: these chunks are assumed to have length 1 311 | if ( $tag eq "[" ) { $chunkStart = $true; } 312 | if ( $tag eq "]" ) { $chunkStart = $true; } 313 | 314 | return($chunkStart); 315 | } 316 | -------------------------------------------------------------------------------- /src/initializations.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import backend as K 4 | from utils import get_from_module 5 | 6 | def get_fans(shape, dim_ordering='th'): 7 | if len(shape) == 2: 8 | fan_in = shape[0] 9 | fan_out = shape[1] 10 | elif len(shape) == 4 or len(shape) == 5: 11 | # assuming convolution kernels (2D or 3D). 12 | # TH kernel shape: (depth, input_depth, ...) 13 | # TF kernel shape: (..., input_depth, depth) 14 | if dim_ordering == 'th': 15 | receptive_field_size = np.prod(shape[2:]) 16 | fan_in = shape[1] * receptive_field_size 17 | fan_out = shape[0] * receptive_field_size 18 | elif dim_ordering == 'tf': 19 | receptive_field_size = np.prod(shape[:2]) 20 | fan_in = shape[-2] * receptive_field_size 21 | fan_out = shape[-1] * receptive_field_size 22 | else: 23 | raise ValueError('Invalid dim_ordering: ' + dim_ordering) 24 | else: 25 | # no specific assumptions 26 | fan_in = np.sqrt(np.prod(shape)) 27 | fan_out = np.sqrt(np.prod(shape)) 28 | return fan_in, fan_out 29 | 30 | 31 | def uniform(shape, scale=0.05, name=None): 32 | return K.random_uniform_variable(shape, -scale, scale, name=name) 33 | 34 | 35 | def normal(shape, scale=0.05, name=None): 36 | return K.random_normal_variable(shape, 0.0, scale, name=name) 37 | 38 | 39 | def lecun_uniform(shape, name=None, dim_ordering='th'): 40 | ''' Reference: LeCun 98, Efficient Backprop 41 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 42 | ''' 43 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 44 | scale = np.sqrt(3. 
/ fan_in) 45 | return uniform(shape, scale, name=name) 46 | 47 | 48 | def glorot_normal(shape, name=None, dim_ordering='th'): 49 | ''' Reference: Glorot & Bengio, AISTATS 2010 50 | ''' 51 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 52 | s = np.sqrt(2. / (fan_in + fan_out)) 53 | return normal(shape, s, name=name) 54 | 55 | 56 | def glorot_uniform(shape, name=None, dim_ordering='th'): 57 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 58 | s = np.sqrt(6. / (fan_in + fan_out)) 59 | return uniform(shape, s, name=name) 60 | 61 | 62 | def he_normal(shape, name=None, dim_ordering='th'): 63 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852 64 | ''' 65 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 66 | s = np.sqrt(2. / fan_in) 67 | return normal(shape, s, name=name) 68 | 69 | 70 | def he_uniform(shape, name=None, dim_ordering='th'): 71 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 72 | s = np.sqrt(6. / fan_in) 73 | return uniform(shape, s, name=name) 74 | 75 | 76 | def orthogonal(shape, scale=1.1, name=None): 77 | ''' From Lasagne. Reference: Saxe et al., http://arxiv.org/abs/1312.6120 78 | ''' 79 | flat_shape = (shape[0], np.prod(shape[1:])) 80 | a = np.random.normal(0.0, 1.0, flat_shape) 81 | u, _, v = np.linalg.svd(a, full_matrices=False) 82 | # pick the one with the correct shape 83 | q = u if u.shape == flat_shape else v 84 | q = q.reshape(shape) 85 | return K.variable(scale * q[:shape[0], :shape[1]], name=name) 86 | 87 | 88 | def identity(shape, scale=1, name=None): 89 | if len(shape) != 2 or shape[0] != shape[1]: 90 | raise ValueError('Identity matrix initialization can only be used ' 91 | 'for 2D square matrices.') 92 | else: 93 | return K.variable(scale * np.identity(shape[0]), name=name) 94 | 95 | 96 | def zero(shape, name=None): 97 | return K.zeros(shape, name=name) 98 | 99 | 100 | def one(shape, name=None): 101 | return K.ones(shape, name=name) 102 | 103 | 104 | def get(identifier, **kwargs): 105 | return get_from_module(identifier, globals(), 106 | 'initialization', kwargs=kwargs) 107 | -------------------------------------------------------------------------------- /src/loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import codecs 4 | from utils import create_dico, create_mapping, zero_digits 5 | from utils import iob2, iob_iobes 6 | unknown_word=''; 7 | 8 | def load_sentences(path, lower, zeros): 9 | #{{{ 10 | """ 11 | Load sentences. A line must contain at least a word and its tag. 12 | Sentences are separated by empty lines. 13 | """ 14 | sentences = [] 15 | sentence = [] 16 | for line in codecs.open(path, 'r', 'utf8'): 17 | line = zero_digits(line.rstrip()) if zeros else line.rstrip() 18 | if not line: 19 | if len(sentence) > 0: 20 | if 'DOCSTART' not in sentence[0][0]: 21 | sentences.append(sentence) 22 | sentence = [] 23 | else: 24 | word = line.split() 25 | assert len(word) >= 2 26 | sentence.append(word) 27 | if len(sentence) > 0: 28 | if 'DOCSTART' not in sentence[0][0]: 29 | sentences.append(sentence) 30 | return sentences 31 | #}}} 32 | 33 | def update_tag_scheme(sentences, tag_scheme,removeTag=None): 34 | #{{{ 35 | """ 36 | Check and update sentences tagging scheme to IOB2. 37 | Only IOB1 and IOB2 schemes are accepted. 
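As a quick numeric illustration of the fan-based scales defined in get_fans(), glorot_uniform() and he_uniform() above, here is a standalone numpy sketch (not code from this repository; the layer shape is a made-up example):

import numpy as np

# For a 2-D weight matrix, get_fans() returns the two dimensions directly,
# so the Glorot-uniform limit is sqrt(6 / (fan_in + fan_out)) and the
# He-uniform limit is sqrt(6 / fan_in).
fan_in, fan_out = 100, 50                            # hypothetical layer shape
glorot_limit = np.sqrt(6.0 / (fan_in + fan_out))     # = 0.2
he_limit = np.sqrt(6.0 / fan_in)                     # ~ 0.245
W = np.random.uniform(-glorot_limit, glorot_limit, size=(fan_in, fan_out))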
38 | """ 39 | for i, s in enumerate(sentences): 40 | tags = [w[-1] for w in s] 41 | # Check that tags are given in the IOB format 42 | if not iob2(tags): 43 | s_str = '\n'.join(' '.join(w) for w in s) 44 | raise Exception('Sentences should be given in IOB format! ' + 45 | 'Please check sentence %i:\n%s' % (i, s_str)) 46 | if tag_scheme == 'iob': 47 | # If format was IOB1, we convert to IOB2 48 | for word, new_tag in zip(s, tags): 49 | if removeTag is not None: 50 | if new_tag[2:] in removeTag: 51 | word[-1]='O'; 52 | else: 53 | word[-1]=new_tag; 54 | elif tag_scheme == 'iobes': 55 | new_tags = iob_iobes(tags) 56 | for word, new_tag in zip(s, new_tags): 57 | word[-1] = new_tag 58 | else: 59 | raise Exception('Unknown tagging scheme!') 60 | #}}} 61 | 62 | def word_mapping(sentences, lower): 63 | #{{{ 64 | """ 65 | Create a dictionary and a mapping of words, sorted by frequency. 66 | """ 67 | words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences] 68 | dico = create_dico(words) 69 | dico[''] = 10000000 70 | word_to_id, id_to_word = create_mapping(dico) 71 | print "Found %i unique words (%i in total)" % ( 72 | len(dico), sum(len(x) for x in words) 73 | ) 74 | return dico, word_to_id, id_to_word 75 | #}}} 76 | 77 | def feature_mapping(sentences,index,featureName="",isPos=False): 78 | #{{{ 79 | """ 80 | Create a dictionary and mapping of characters, sorted by frequency. 81 | """ 82 | if isPos: 83 | features = [[w[0].lower()+"_"+w[index] for w in s] for s in sentences] 84 | else: 85 | features = [[w[index] for w in s] for s in sentences] 86 | dico = create_dico(features) 87 | dico[unknown_word]=10000000 88 | feature_to_id, id_to_feature = create_mapping(dico) 89 | print "Found %i unique %s features" % (len(dico),featureName) 90 | return dico, feature_to_id, id_to_feature 91 | #}}} 92 | 93 | def char_mapping(sentences): 94 | #{{{ 95 | """ 96 | Create a dictionary and mapping of characters, sorted by frequency. 97 | """ 98 | chars = ["".join([w[0] for w in s]) for s in sentences] 99 | dico = create_dico(chars) 100 | char_to_id, id_to_char = create_mapping(dico) 101 | print "Found %i unique characters" % len(dico) 102 | return dico, char_to_id, id_to_char 103 | #}}} 104 | 105 | def tag_mapping(sentences): 106 | #{{{ 107 | """ 108 | Create a dictionary and a mapping of tags, sorted by frequency. 109 | """ 110 | tags = [[word[-1] for word in s] for s in sentences] 111 | dico = create_dico(tags) 112 | tag_to_id, id_to_tag = create_mapping(dico) 113 | print "Found %i unique named entity tags" % len(dico) 114 | return dico, tag_to_id, id_to_tag 115 | #}}} 116 | 117 | def cap_feature(s): 118 | #{{{ 119 | """ 120 | Capitalization feature: 121 | 0 = low caps 122 | 1 = all caps 123 | 2 = first letter caps 124 | 3 = one capital (not first letter) 125 | """ 126 | if s.lower() == s: 127 | return 0 128 | elif s.upper() == s: 129 | return 1 130 | elif s[0].upper() == s[0]: 131 | return 2 132 | else: 133 | return 3 134 | #}}} 135 | 136 | def prepare_sentence(str_words, word_to_id, char_to_id, lower=False): 137 | #{{{ 138 | """ 139 | Prepare a sentence for evaluation. 
140 | """ 141 | def f(x,flag=lower): return x.lower() if flag else x 142 | words = [word_to_id[f(w) if f(w) in word_to_id else ''] 143 | for w in str_words] 144 | charLower=False; 145 | if charLower: 146 | chars = [[char_to_id[c] for c in w.lower() if c in char_to_id] 147 | for w in str_words] 148 | else: 149 | chars = [[char_to_id[c] for c in w if c in char_to_id] 150 | for w in str_words] 151 | caps = [cap_feature(w) for w in str_words] 152 | return { 153 | 'str_words': str_words, 154 | 'words': words, 155 | 'chars': chars, 156 | 'caps': caps 157 | } 158 | #}}} 159 | 160 | def prepare_dataset(sentences,docLen,parameters, 161 | lower=False,isTest=False): 162 | #{{{ 163 | """ 164 | Prepare the dataset. Return a list of lists of dictionaries containing: 165 | - word indexes 166 | - word char indexes 167 | - tag indexes 168 | """ 169 | def f(x): return x.lower() if lower else x 170 | #get mapping 171 | #{{{ 172 | features=parameters['features']; 173 | feature2IdMap=features['feature2IdMap']; 174 | word_to_id=feature2IdMap['word']; 175 | char_to_id=feature2IdMap['char']; 176 | tag_to_id=feature2IdMap['tag']; 177 | if features['lemma']['isUsed']: 178 | lemma_to_id=feature2IdMap['lemma']; 179 | if features['pos']['isUsed']: 180 | pos_to_id=feature2IdMap['pos']; 181 | if features['chunk']['isUsed']: 182 | chunk_to_id=feature2IdMap['chunk']; 183 | if features['dic']['isUsed']: 184 | dic_to_id=feature2IdMap['dic']; 185 | #}}} 186 | data = [] 187 | if docLen is not None and len(sentences) != len(docLen): 188 | print "len(doc) != len(docLen)"; 189 | assert 0; 190 | i=0; 191 | for s in sentences: 192 | str_words = [w[0] for w in s] 193 | elem=prepare_sentence(str_words,word_to_id,char_to_id,lower); 194 | words = elem['words'] 195 | # Skip characters that are not in the training set 196 | chars = elem['chars'] 197 | caps = elem['caps']; 198 | if not isTest: 199 | tags = [tag_to_id[w[-1]] for w in s] 200 | 201 | e={ 202 | 'str_words': str_words, 203 | 'words': words, 204 | 'chars': chars, 205 | 'caps': caps, 206 | } 207 | 208 | #add features 209 | #{{{ 210 | if features['lemma']['isUsed']: 211 | lemma=[lemma_to_id[w[1]] 212 | if w[1] in lemma_to_id 213 | else lemma_to_id[unknown_word] for w in s]; 214 | e['lemma']=lemma; 215 | if features['pos']['isUsed']: 216 | pos=[pos_to_id[w[2]] 217 | if w[2] in pos_to_id 218 | else pos_to_id[unknown_word] for w in s]; 219 | e['pos']=pos; 220 | if features['chunk']['isUsed']: 221 | chunk=[chunk_to_id[w[3]] 222 | if w[3] in chunk_to_id 223 | else chunk_to_id[unknown_word] for w in s]; 224 | e['chunk']=chunk; 225 | if features['dic']['isUsed']: 226 | ner=[dic_to_id[w[4]] for w in s]; 227 | e['dic']=ner; 228 | #}}} 229 | 230 | #append doc len to data 231 | if parameters.has_key('sentencesLevelLoss') \ 232 | and parameters['sentencesLevelLoss']: 233 | lens=docLen[i]; 234 | i+=1; 235 | e['lens']=lens; 236 | 237 | if not isTest: 238 | e['tags']=tags; 239 | 240 | 241 | 242 | data.append(e); 243 | return data 244 | #}}} 245 | 246 | def augment_with_pretrained(dictionary, ext_emb_path, words): 247 | #{{{ 248 | """ 249 | Augment the dictionary with words that have a pretrained embedding. 250 | If `words` is None, we add every word that has a pretrained embedding 251 | to the dictionary, otherwise, we only add the words that are given by 252 | `words` (typically the words in the development and test sets.) 253 | """ 254 | print 'Loading pretrained embeddings from %s...' 
% ext_emb_path 255 | assert os.path.isfile(ext_emb_path) 256 | 257 | # Load pretrained embeddings from file 258 | pretrained = set([ 259 | line.rstrip().split()[0].strip() 260 | for line in codecs.open(ext_emb_path, 'r', 'utf-8') 261 | if len(ext_emb_path) > 0 262 | ]) 263 | 264 | # We either add every word in the pretrained file, 265 | # or only words given in the `words` list to which 266 | # we can assign a pretrained embedding 267 | if words is None: 268 | for word in pretrained: 269 | if word not in dictionary: 270 | dictionary[word] = 0 271 | else: 272 | for word in words: 273 | if any(x in pretrained for x in [ 274 | word, 275 | word.lower(), 276 | re.sub('\d', '0', word.lower()) 277 | ]) and word not in dictionary: 278 | dictionary[word] = 0 279 | 280 | word_to_id, id_to_word = create_mapping(dictionary) 281 | return dictionary, word_to_id, id_to_word 282 | #}}} 283 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | import scipy.io 5 | import theano 6 | import theano.tensor as T 7 | import codecs 8 | import cPickle 9 | 10 | from utils import shared, set_values, get_name 11 | from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, forward 12 | from nn import LSTM; 13 | #from nn import LSTM_normal as LSTM; 14 | from nn import AttentionLayer; 15 | from optimization import Optimization 16 | 17 | def loadPreEmbFeatures(fName,feature_to_id,weights,lower=False): 18 | #{{{ 19 | def f(x): return x.lower() if lower else x 20 | #to lower 21 | feature_to_id_=feature_to_id; 22 | if lower: 23 | feature_to_id_lower={}; 24 | for elem in feature_to_id.items(): 25 | feature_to_id_lower[elem[0].lower()]=elem[1]; 26 | feature_to_id_=feature_to_id_lower; 27 | feature_dim=weights.shape[1]; 28 | 29 | invalid_count=0; 30 | valid_count=0; 31 | for line in codecs.open(fName,'r','utf-8'): 32 | line=line.rstrip().split(); 33 | if len(line) == feature_dim+1 and line[0] in feature_to_id_: 34 | weights[feature_to_id_[line[0]]]=np.array( 35 | [float(x) for x in line[1:]] 36 | ).astype(theano.config.floatX) 37 | valid_count+=1; 38 | else: 39 | invalid_count+=1; 40 | print "when loading %s ,%d Invalid line,%d valid line" %(fName,invalid_count,valid_count); 41 | #}}} 42 | 43 | class Model(object): 44 | """ 45 | Network architecture. 46 | """ 47 | def __init__(self, parameters=None, models_path=None, 48 | model_path=None,Training=False): 49 | #{{{ 50 | """ 51 | Initialize the model. We either provide the parameters and a path where 52 | we store the models, or the location of a trained model. 
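augment_with_pretrained() and loadPreEmbFeatures() above both read a plain-text embedding file with one token per line followed by its vector components; a minimal hypothetical example with a 5-dimensional embedding (tokens and values are made up, and the real dimension is whatever word_dim or the feature dimension is set to):

benzene  0.12 -0.03  0.44  0.91 -0.27
ethanol -0.05  0.31  0.08 -0.66  0.10

loadPreEmbFeatures() counts any line whose token count differs from the vector dimension plus one as invalid and skips it.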
53 | """ 54 | if Training: 55 | #{{{ 56 | assert parameters and models_path 57 | # Create a name based on the parameters 58 | self.parameters = parameters 59 | self.name = get_name(parameters) 60 | # Model location 61 | if model_path is None: 62 | model_path = os.path.join(models_path, self.name) 63 | self.model_path = model_path 64 | self.parameters_path = os.path.join(model_path, 'parameters.pkl') 65 | self.mappings_path = os.path.join(model_path, 'mappings.pkl') 66 | # Create directory for the model if it does not exist 67 | if not os.path.exists(self.model_path): 68 | os.makedirs(self.model_path) 69 | # Save the parameters to disk 70 | with open(self.parameters_path, 'wb') as f: 71 | cPickle.dump(parameters, f) 72 | #}}} 73 | else: 74 | #{{{ 75 | # Model location 76 | self.model_path = model_path 77 | self.parameters_path = os.path.join(model_path, 'parameters.pkl') 78 | self.mappings_path = os.path.join(model_path, 'mappings.pkl') 79 | # Create directory for the model if it does not exist 80 | if not os.path.exists(self.model_path): 81 | os.makedirs(self.model_path) 82 | # Save the parameters to disk 83 | with open(self.parameters_path, 'rb') as f: 84 | self.parameters=cPickle.load(f); 85 | self.reload_mappings(); 86 | self.components = {} 87 | #}}} 88 | #}}} 89 | 90 | def save_mappings(self, id_to_word, id_to_char, id_to_tag): 91 | #{{{ 92 | """ 93 | We need to save the mappings if we want to use the model later. 94 | """ 95 | self.id_to_word = id_to_word 96 | self.id_to_char = id_to_char 97 | self.id_to_tag = id_to_tag 98 | with open(self.mappings_path, 'wb') as f: 99 | mappings = { 100 | 'id_to_word': self.id_to_word, 101 | 'id_to_char': self.id_to_char, 102 | 'id_to_tag': self.id_to_tag, 103 | } 104 | cPickle.dump(mappings, f) 105 | #}}} 106 | 107 | def reload_mappings(self): 108 | #{{{ 109 | """ 110 | Load mappings from disk. 111 | """ 112 | with open(self.mappings_path, 'rb') as f: 113 | mappings = cPickle.load(f) 114 | self.id_to_word = mappings['id_to_word'] 115 | self.id_to_char = mappings['id_to_char'] 116 | self.id_to_tag = mappings['id_to_tag'] 117 | #}}} 118 | 119 | def add_component(self, param): 120 | """ 121 | Add a new parameter to the network. 122 | """ 123 | if param.name in self.components: 124 | raise Exception('The network already has a parameter "%s"!' 125 | % param.name) 126 | self.components[param.name] = param 127 | 128 | def modelScore(self,tag_ids,scores,s_len): 129 | #{{{ 130 | """ 131 | ATTENTATION THIS FUNCTION IS SYMBOL PROGRAMMING 132 | this function is to return the score of our model at a fixed sentence label 133 | @param: 134 | scores: the scores matrix ,the output of our model 135 | tag: a numpy array, which represent one sentence label 136 | sent_lens: a scalar number, the length of sentence. 137 | because our sentence label will be expand to max sentence length, 138 | so we will use this to get the original sentence label. 
139 | @return: 140 | a scalar number ,the score; 141 | """ 142 | #{{{ 143 | n_tags=self.output_dim; 144 | transitions=self.transitions; 145 | #score from tags_scores 146 | real_path_score = scores[T.arange(s_len), tag_ids].sum() 147 | 148 | # Score from transitions 149 | b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) 150 | e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) 151 | padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) 152 | real_path_score += transitions[ 153 | padded_tags_ids[T.arange(s_len + 1)], 154 | padded_tags_ids[T.arange(s_len + 1) + 1] 155 | ].sum() 156 | #to prevent T.exp(real_path_score) to be inf 157 | #return real_path_score; 158 | return real_path_score/s_len; 159 | #}}} 160 | #}}} 161 | 162 | def save(self): 163 | #{{{ 164 | """ 165 | Write components values to disk. 166 | """ 167 | for name, param in self.components.items(): 168 | param_path = os.path.join(self.model_path, "%s.mat" % name) 169 | if hasattr(param, 'params'): 170 | param_values = {p.name: p.get_value() for p in param.params} 171 | else: 172 | param_values = {name: param.get_value()} 173 | scipy.io.savemat(param_path, param_values) 174 | #}}} 175 | 176 | def reload(self,features=None): 177 | #{{{ 178 | """ 179 | Load components values from disk. 180 | """ 181 | featureLayerNameMap=['pos_layer','lemma_layer', 182 | 'chunk_layer','dic_layer']; 183 | for name, param in self.components.items(): 184 | #when feature is use to attended and not lstm-input, 185 | #we will not reload the param 186 | if features is not None and name in featureLayerNameMap: 187 | featuresName=name[:name.find('_')]; 188 | if features[featuresName]['attended']==1 and \ 189 | features[featuresName]['lstm-input']==0: 190 | continue; 191 | param_path = os.path.join(self.model_path, "%s.mat" % name) 192 | param_values = scipy.io.loadmat(param_path) 193 | if hasattr(param, 'params'): 194 | for p in param.params: 195 | set_values(p.name, p, param_values[p.name]) 196 | else: 197 | set_values(name, param, param_values[name]) 198 | #}}} 199 | 200 | def build4(self,parameters): 201 | #{{{ 202 | """ 203 | Build the network. 
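For intuition, the score that modelScore() above assigns to a fixed tag sequence can be restated in plain numpy. This sketch mirrors the emission and transition terms, including the extra begin state n_tags and end state n_tags + 1 and the division by sentence length; it assumes a transition matrix large enough to index those two extra states:

import numpy as np

def path_score(scores, tag_ids, transitions, n_tags):
    # emission term: score of the chosen tag at every position
    s_len = len(tag_ids)
    score = scores[np.arange(s_len), tag_ids].sum()
    # transition term over the sequence padded with the begin/end states
    padded = np.concatenate([[n_tags], tag_ids, [n_tags + 1]])
    score += transitions[padded[:-1], padded[1:]].sum()
    # normalised by length, as in modelScore(), to keep exp(score) finite
    return score / float(s_len)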
204 | """ 205 | #some parameters 206 | dropout=parameters['dropout'] ; 207 | char_dim=parameters['char_dim']; 208 | char_lstm_dim=parameters['char_lstm_dim']; 209 | char_bidirect=parameters['char_bidirect']; 210 | word_dim=parameters['word_dim']; 211 | word_lstm_dim=parameters['word_lstm_dim']; 212 | word_bidirect=parameters['word_bidirect']; 213 | lr_method=parameters['lr_method']; 214 | pre_emb=parameters['pre_emb']; 215 | crf=parameters['crf']; 216 | cap_dim=parameters['cap_dim']; 217 | training=parameters['training']; 218 | features=parameters['features']; 219 | useAttend=parameters['useAttend']; 220 | if useAttend: 221 | reloadParam=parameters['loading']; 222 | else: 223 | reloadParam=None; 224 | if reloadParam is not None: 225 | reloadPath=parameters['loading_path']; 226 | sentencesLevelLoss=parameters['sentencesLevelLoss']; 227 | 228 | # Training parameters 229 | n_words = len(self.id_to_word) 230 | n_chars = len(self.id_to_char) 231 | n_tags = len(self.id_to_tag) 232 | self.output_dim = len(self.id_to_tag); 233 | self.transitions = shared((self.output_dim+ 1, self.output_dim ), 'transitions') 234 | 235 | # Number of capitalization features 236 | if cap_dim: 237 | n_cap = 4 238 | 239 | # Network variables 240 | is_train = T.iscalar('is_train') 241 | word_ids = T.ivector(name='word_ids') 242 | wordTrue_ids=T.ivector(name='wordTrue_ids'); 243 | char_for_ids = T.imatrix(name='char_for_ids') 244 | char_rev_ids = T.imatrix(name='char_rev_ids') 245 | char_pos_ids = T.ivector(name='char_pos_ids') 246 | docLen=T.ivector(name='docLen'); 247 | tag_ids = T.ivector(name='tag_ids') 248 | if cap_dim: 249 | cap_ids = T.ivector(name='cap_ids') 250 | 251 | #some features 252 | if features is not None and features['lemma']['isUsed']: 253 | lemma_ids=T.ivector(name='lemma_ids'); 254 | if features is not None and features['pos']['isUsed']: 255 | pos_ids=T.ivector(name='pos_ids'); 256 | if features is not None and features['chunk']['isUsed']: 257 | chunk_ids=T.ivector(name='chunk_ids'); 258 | if features is not None and features['dic']['isUsed']: 259 | dic_ids=T.ivector(name='dic_ids'); 260 | 261 | # Sentence length 262 | s_len = (word_ids if word_dim else char_pos_ids).shape[0] 263 | 264 | # Final input (all word features) 265 | input_dim = 0 266 | inputs = [] 267 | 268 | # Word inputs 269 | #{{{ 270 | if word_dim: 271 | input_dim += word_dim 272 | word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') 273 | word_input = word_layer.link(word_ids) 274 | wordTrue_input=word_layer.link(wordTrue_ids); 275 | inputs.append(word_input) 276 | # Initialize with pretrained embeddings 277 | if pre_emb and training: 278 | new_weights = word_layer.embeddings.get_value() 279 | print 'Loading pretrained embeddings from %s...' 
% pre_emb 280 | pretrained = {} 281 | emb_invalid = 0 282 | for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): 283 | line = line.rstrip().split() 284 | if len(line) == word_dim + 1: 285 | pretrained[line[0]] = np.array( 286 | [float(x) for x in line[1:]] 287 | ).astype(np.float32) 288 | else: 289 | emb_invalid += 1 290 | if emb_invalid > 0: 291 | print 'WARNING: %i invalid lines' % emb_invalid 292 | c_found = 0 293 | c_lower = 0 294 | c_zeros = 0 295 | # Lookup table initialization 296 | for i in xrange(n_words): 297 | word = self.id_to_word[i] 298 | if word in pretrained: 299 | new_weights[i] = pretrained[word] 300 | c_found += 1 301 | elif word.lower() in pretrained: 302 | new_weights[i] = pretrained[word.lower()] 303 | c_lower += 1 304 | elif re.sub('\d', '0', word.lower()) in pretrained: 305 | new_weights[i] = pretrained[ 306 | re.sub('\d', '0', word.lower()) 307 | ] 308 | c_zeros += 1 309 | word_layer.embeddings.set_value(new_weights) 310 | print 'Loaded %i pretrained embeddings.' % len(pretrained) 311 | print ('%i / %i (%.4f%%) words have been initialized with ' 312 | 'pretrained embeddings.') % ( 313 | c_found + c_lower + c_zeros, n_words, 314 | 100. * (c_found + c_lower + c_zeros) / n_words 315 | ) 316 | print ('%i found directly, %i after lowercasing, ' 317 | '%i after lowercasing + zero.') % ( 318 | c_found, c_lower, c_zeros 319 | )#}}} 320 | 321 | # Chars inputs 322 | #{{{ 323 | if char_dim: 324 | input_dim += char_lstm_dim 325 | char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') 326 | 327 | char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, 328 | name='char_lstm_for') 329 | char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, 330 | name='char_lstm_rev') 331 | 332 | char_lstm_for.link(char_layer.link(char_for_ids)) 333 | char_lstm_rev.link(char_layer.link(char_rev_ids)) 334 | 335 | char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ 336 | T.arange(s_len), char_pos_ids 337 | ] 338 | char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ 339 | T.arange(s_len), char_pos_ids 340 | ] 341 | char_output=T.concatenate([char_for_output,char_rev_output],axis=-1); 342 | inputs.append(char_for_output) 343 | if char_bidirect: 344 | inputs.append(char_rev_output) 345 | input_dim += char_lstm_dim 346 | #}}} 347 | 348 | # Capitalization feature 349 | # 350 | if cap_dim: 351 | input_dim += cap_dim 352 | cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') 353 | inputs.append(cap_layer.link(cap_ids)) 354 | 355 | #add feature 356 | #{{{ 357 | if features is not None and features['lemma']['isUsed']: 358 | lemma_layer=EmbeddingLayer(features['lemma']['num'], 359 | features['lemma']['dim'], 360 | name='lemma_layer'); 361 | if features['lemma']['pre_emb'] is not "": 362 | new_weights=lemma_layer.embeddings.get_value(); 363 | loadPreEmbFeatures(features['lemma']['pre_emb'], 364 | features['feature_to_id_map']['lemma'], 365 | new_weights, 366 | lower=True); 367 | lemma_layer.embeddings.set_value(new_weights); 368 | lemma_output=lemma_layer.link(lemma_ids); 369 | if features['lemma']['lstm-input']: 370 | input_dim+=features['lemma']['dim']; 371 | inputs.append(lemma_output); 372 | if features is not None and features['pos']['isUsed']: 373 | pos_layer=EmbeddingLayer(features['pos']['num'], 374 | features['pos']['dim'], 375 | name='pos_layer'); 376 | if features['pos']['pre_emb'] is not "": 377 | new_weights=pos_layer.embeddings.get_value(); 378 | loadPreEmbFeatures(features['pos']['pre_emb'], 379 | features['feature_to_id_map']['pos'], 380 | 
new_weights); 381 | pos_layer.embeddings.set_value(new_weights); 382 | pos_output=pos_layer.link(pos_ids); 383 | if features['pos']['lstm-input']: 384 | input_dim+=features['pos']['dim']; 385 | inputs.append(pos_output); 386 | if features is not None and features['chunk']['isUsed']: 387 | chunk_layer=EmbeddingLayer(features['chunk']['num'], 388 | features['chunk']['dim'], 389 | name='chunk_layer'); 390 | chunk_output=chunk_layer.link(chunk_ids); 391 | if features['chunk']['lstm-input']: 392 | input_dim+=features['chunk']['dim']; 393 | inputs.append(chunk_output) 394 | if features is not None and features['dic']['isUsed']: 395 | dic_layer=EmbeddingLayer(features['dic']['num'], 396 | features['dic']['dim'], 397 | name='dic_layer'); 398 | dic_output=dic_layer.link(dic_ids); 399 | if features['dic']['lstm-input']: 400 | input_dim+=features['dic']['dim']; 401 | inputs.append(dic_output); 402 | #}}} 403 | 404 | # Prepare final input 405 | if len(inputs) != 1: 406 | inputs = T.concatenate(inputs, axis=1) 407 | 408 | # 409 | # Dropout on final input 410 | # 411 | if dropout: 412 | dropout_layer = DropoutLayer(p=dropout) 413 | input_train = dropout_layer.link(inputs) 414 | input_test = (1 - dropout) * inputs 415 | inputs = T.switch(T.neq(is_train, 0), input_train,input_test); 416 | 417 | # LSTM for words 418 | word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, 419 | name='word_lstm_for') 420 | word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, 421 | name='word_lstm_rev') 422 | if sentencesLevelLoss: 423 | def sentLSTM(i,output,input,lenVec): 424 | #{{{ 425 | Len=lenVec[i]; 426 | accLen=lenVec[:i].sum(); 427 | currentInput=input[accLen:accLen+Len]; 428 | word_lstm_for.link(currentInput); 429 | word_lstm_rev.link(currentInput[::-1,:]); 430 | wordForOutput=word_lstm_for.h; 431 | wordRevOutput=word_lstm_rev.h[::-1,:]; 432 | finalOutput=T.concatenate( 433 | [wordForOutput,wordRevOutput],axis=-1 434 | ) 435 | output=T.set_subtensor(output[accLen:accLen+Len], 436 | finalOutput); 437 | return output; 438 | #}}} 439 | result,update=theano.scan(fn=sentLSTM, 440 | outputs_info=T.zeros((inputs.shape[0],word_lstm_dim*2),dtype='float32'), 441 | sequences=[T.arange(docLen.shape[0])], 442 | non_sequences=[inputs,docLen]); 443 | 444 | word_lstm_for.link(inputs) 445 | word_lstm_rev.link(inputs[::-1, :]) 446 | word_for_output = word_lstm_for.h 447 | word_for_c=word_lstm_for.c; 448 | word_rev_output = word_lstm_rev.h[::-1, :] 449 | word_rev_c=word_lstm_rev.c[::-1,:]; 450 | 451 | final_c=T.concatenate( 452 | [word_for_c,word_rev_c], 453 | axis=-1 454 | ) 455 | final_output=result[-1] 456 | else : 457 | word_lstm_for.link(inputs) 458 | word_lstm_rev.link(inputs[::-1, :]) 459 | word_for_output = word_lstm_for.h 460 | word_for_c=word_lstm_for.c; 461 | word_rev_output = word_lstm_rev.h[::-1, :] 462 | word_rev_c=word_lstm_rev.c[::-1,:]; 463 | final_output = T.concatenate( 464 | [word_for_output, word_rev_output], 465 | axis=-1 466 | ) 467 | final_c=T.concatenate( 468 | [word_for_c,word_rev_c], 469 | axis=-1 470 | ) 471 | 472 | if useAttend: 473 | #attention layer 474 | attended=[]; 475 | attendedDim=0; 476 | if features is not None and features['word']['attended']: 477 | attended.append(wordTrue_input); 478 | attendedDim+=word_dim; 479 | if features is not None and features['char']['attended']: 480 | attended.append(char_output); 481 | attendedDim+=char_lstm_dim*2; 482 | if features is not None and features['lemma']['attended']: 483 | attended.append(lemma_output); 484 | 
attendedDim+=features['lemma']['dim']; 485 | if features is not None and features['pos']['attended']: 486 | attended.append(pos_output); 487 | attendedDim+=features['pos']['dim']; 488 | if features is not None and features['chunk']['attended']: 489 | attended.append(chunk_output); 490 | attendedDim+=features['chunk']['dim']; 491 | if features is not None and features['dic']['attended']: 492 | attended.append(dic_output); 493 | attendedDim+=features['dic']['dim']; 494 | 495 | attention_layer=AttentionLayer(attended_dim=attendedDim, 496 | state_dim=attendedDim, 497 | #attention_layer=AttentionLayer(attended_dim=word_lstm_dim*2, 498 | # state_dim=word_lstm_dim*2, 499 | source_dim=word_lstm_dim*2, 500 | scoreFunName=parameters['attenScoreFun'], 501 | name='attention_layer'); 502 | 503 | if len(attended)>1: 504 | attendedInput=T.concatenate(attended,axis=-1); 505 | else: 506 | attendedInput=attended[0]; 507 | 508 | final_output=attention_layer.link(attendedInput,attendedInput,final_output); 509 | #using lstm_state to compute attention 510 | #final_output=attention_layer.link(final_output,final_c,final_output); 511 | self.energy=attention_layer.energy; 512 | else: 513 | final_output=final_output; 514 | 515 | tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, 516 | name='tanh_layer', activation='tanh') 517 | final_output = tanh_layer.link(final_output) 518 | 519 | # Sentence to Named Entity tags - Score 520 | final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', 521 | activation=(None if crf else 'softmax')) 522 | tags_scores = final_layer.link(final_output) 523 | 524 | # No CRF 525 | if not crf: 526 | cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() 527 | # CRF 528 | else: 529 | if sentencesLevelLoss: 530 | #calcuate loss according to sentence instead of docLen 531 | def sentLoss(i,scores,trueIds,transitions,lenVec): 532 | #{{{ 533 | Len=lenVec[i]; 534 | accLen=lenVec[:i].sum(); 535 | currentTagsScores=scores[accLen:accLen+Len]; 536 | currentIds=trueIds[accLen:accLen+Len]; 537 | real_path_score = currentTagsScores[T.arange(Len), 538 | currentIds].sum() 539 | # Score from transitions 540 | padded_tags_ids = T.concatenate([[n_tags],currentIds], axis=0) 541 | real_path_score += transitions[ 542 | padded_tags_ids[T.arange(Len )], 543 | padded_tags_ids[T.arange(Len ) + 1] 544 | ].sum() 545 | 546 | all_paths_scores = forward(currentTagsScores,transitions) 547 | cost = - (real_path_score - all_paths_scores) 548 | return cost; 549 | #}}} 550 | result,update=theano.scan(fn=sentLoss, 551 | outputs_info=None, 552 | sequences=[T.arange(docLen.shape[0])], 553 | non_sequences=[tags_scores,tag_ids,self.transitions,docLen]) 554 | cost=result.sum(); 555 | else: 556 | real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() 557 | 558 | # Score from transitions 559 | padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0) 560 | real_path_score += self.transitions[ 561 | padded_tags_ids[T.arange(s_len )], 562 | padded_tags_ids[T.arange(s_len ) + 1] 563 | ].sum() 564 | 565 | all_paths_scores = forward(tags_scores, self.transitions) 566 | cost = - (real_path_score - all_paths_scores) 567 | 568 | # Network parameters 569 | params = [] 570 | if word_dim: 571 | self.add_component(word_layer) 572 | params.extend(word_layer.params) 573 | if char_dim: 574 | self.add_component(char_layer) 575 | self.add_component(char_lstm_for) 576 | params.extend(char_layer.params) 577 | params.extend(char_lstm_for.params) 578 | if char_bidirect: 579 | self.add_component(char_lstm_rev) 
580 | params.extend(char_lstm_rev.params) 581 | self.add_component(word_lstm_for) 582 | params.extend(word_lstm_for.params) 583 | if word_bidirect: 584 | self.add_component(word_lstm_rev) 585 | params.extend(word_lstm_rev.params) 586 | if cap_dim: 587 | self.add_component(cap_layer) 588 | params.extend(cap_layer.params) 589 | self.add_component(final_layer) 590 | params.extend(final_layer.params) 591 | if crf: 592 | self.add_component(self.transitions) 593 | params.append(self.transitions) 594 | if word_bidirect: 595 | self.add_component(tanh_layer) 596 | params.extend(tanh_layer.params) 597 | #add feature layer 598 | if features is not None and features['lemma']['isUsed']: 599 | self.add_component(lemma_layer); 600 | params.extend(lemma_layer.params); 601 | if features is not None and features['pos']['isUsed']: 602 | self.add_component(pos_layer); 603 | params.extend(pos_layer.params); 604 | if features is not None and features['chunk']['isUsed']: 605 | self.add_component(chunk_layer); 606 | params.extend(chunk_layer.params); 607 | if features is not None and features['dic']['isUsed']: 608 | self.add_component(dic_layer); 609 | params.extend(dic_layer.params); 610 | 611 | if useAttend and reloadParam: 612 | #reload pre-train params 613 | model_path=self.model_path; 614 | self.model_path=reloadPath; 615 | print "loading:",self.model_path; 616 | self.reload(features); 617 | self.model_path=model_path; 618 | 619 | if useAttend: 620 | #add attention_layer 621 | self.add_component(attention_layer); 622 | params.extend(attention_layer.params); 623 | 624 | # Prepare train and eval inputs 625 | eval_inputs = [] 626 | if word_dim: 627 | eval_inputs.append(word_ids) 628 | if char_dim: 629 | eval_inputs.append(char_for_ids) 630 | if char_bidirect: 631 | eval_inputs.append(char_rev_ids) 632 | eval_inputs.append(char_pos_ids) 633 | if cap_dim: 634 | eval_inputs.append(cap_ids) 635 | if useAttend: 636 | eval_inputs.append(wordTrue_ids); 637 | if sentencesLevelLoss: 638 | eval_inputs.append(docLen); 639 | #add feature input 640 | if features is not None and features['lemma']['isUsed']: 641 | eval_inputs.append(lemma_ids); 642 | if features is not None and features['pos']['isUsed']: 643 | eval_inputs.append(pos_ids); 644 | if features is not None and features['chunk']['isUsed']: 645 | eval_inputs.append(chunk_ids); 646 | if features is not None and features['dic']['isUsed']: 647 | eval_inputs.append(dic_ids); 648 | train_inputs = eval_inputs + [tag_ids] 649 | 650 | # Parse optimization method parameters 651 | if "-" in lr_method: 652 | lr_method_name = lr_method[:lr_method.find('-')] 653 | lr_method_parameters = {} 654 | for x in lr_method[lr_method.find('-') + 1:].split('-'): 655 | split = x.split('_') 656 | assert len(split) == 2 657 | lr_method_parameters[split[0]] = float(split[1]) 658 | else: 659 | lr_method_name = lr_method 660 | lr_method_parameters = {} 661 | 662 | # Compile training function 663 | print 'Compiling...' 
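The lr_method string parsed just above packs the optimizer name and its hyper-parameters into one dash/underscore-separated token; a standalone restatement of that parsing (the example value is hypothetical):

def parse_lr_method(lr_method):
    # "name-key1_value1-key2_value2" -> (name, {key: float(value), ...})
    if '-' not in lr_method:
        return lr_method, {}
    name = lr_method[:lr_method.find('-')]
    params = {}
    for item in lr_method[lr_method.find('-') + 1:].split('-'):
        key, value = item.split('_')
        params[key] = float(value)
    return name, params

print(parse_lr_method('sgd-lr_.005'))   # ('sgd', {'lr': 0.005})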
664 | if training: 665 | #constraints 666 | if useAttend: 667 | self.constraints=attention_layer.constraints; 668 | else: 669 | self.constraints={}; 670 | from keras import optimizers ; 671 | self.optimizer=optimizers.SGD(lr=0.001,momentum=0.9, 672 | decay=0.,nesterov=True,clipvalue=5); 673 | self.optimizer=optimizers.RMSprop(); 674 | #self.optimizer=SGD(lr=lr_method_parameters['lr'],clipvalue=5,gradient_noise=0.01) 675 | updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params,constraints=self.constraints, **lr_method_parameters) 676 | #updates = self.optimizer.get_updates(params,self.constraints,cost); 677 | f_train_outputs=[cost]; 678 | if useAttend: 679 | f_train_outputs.append(self.energy); 680 | 681 | f_train = theano.function( 682 | inputs=train_inputs, 683 | outputs=f_train_outputs, 684 | updates=updates, 685 | on_unused_input='ignore', 686 | givens=({is_train: np.cast['int32'](1)} if dropout else {}) 687 | ) 688 | 689 | f_test = theano.function( 690 | inputs=train_inputs, 691 | outputs=cost, 692 | on_unused_input='ignore', 693 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 694 | ) 695 | self.f_test=f_test; 696 | else: 697 | f_train = None 698 | 699 | # Compile evaluation function 700 | if not crf: 701 | f_eval = theano.function( 702 | inputs=eval_inputs, 703 | outputs=tags_scores, 704 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 705 | ) 706 | else: 707 | if sentencesLevelLoss: 708 | def sentVitebe(i,predictTag,scores,transitions,lenVec): 709 | #{{{ 710 | Len=lenVec[i]; 711 | accLen=lenVec[:i].sum(); 712 | currentTagsScores=scores[accLen:accLen+Len]; 713 | currentPredictIds=forward(currentTagsScores, 714 | transitions,viterbi=True, 715 | return_alpha=False, 716 | return_best_sequence=True) ; 717 | predictTag=T.set_subtensor(predictTag[accLen:accLen+Len],currentPredictIds); 718 | return predictTag; 719 | #}}} 720 | predictTag,update=theano.scan(fn=sentVitebe, 721 | outputs_info=T.zeros((tags_scores.shape[0],),dtype='int32'), 722 | sequences=[T.arange(docLen.shape[0])], 723 | non_sequences=[tags_scores,self.transitions,docLen]); 724 | predictTag=predictTag[-1]; 725 | else: 726 | predictTag=forward(tags_scores, self.transitions, 727 | viterbi=True,return_alpha=False, 728 | return_best_sequence=True) 729 | f_eval = theano.function( 730 | inputs=eval_inputs, 731 | outputs=predictTag, 732 | on_unused_input='ignore', 733 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 734 | ) 735 | #f_AttenVisual=theano.function( 736 | # inputs=eval_inputs, 737 | # outputs=[predictTag,self.energy], 738 | # on_unused_input='ignore', 739 | # givens=({is_train: np.cast['int32'](0)} if dropout else {}) 740 | # ) 741 | #self.f_AttenVisual=f_AttenVisual; 742 | 743 | return f_train, f_eval; 744 | #}}} 745 | 746 | def build(self,parameters): 747 | #{{{ 748 | """ 749 | Build the network. 
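The scan helpers above (sentLSTM, sentLoss and sentVitebe) all walk a document that has been concatenated into one long token sequence, using the docLen vector to recover sentence boundaries; a plain-Python sketch of that slicing (the array contents are made up):

import numpy as np

def split_by_lengths(rows, lengths):
    # rows: (total_tokens, dim) array for a whole document
    # lengths: number of tokens in each sentence, in order
    out, start = [], 0
    for n in lengths:
        out.append(rows[start:start + n])
        start += n
    return out

doc = np.zeros((7, 4))                                     # hypothetical 7-token document
print([s.shape for s in split_by_lengths(doc, [3, 4])])    # [(3, 4), (4, 4)]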
750 | """ 751 | #some parameters 752 | dropout=parameters['dropout'] ; 753 | char_dim=parameters['char_dim']; 754 | char_lstm_dim=parameters['char_lstm_dim']; 755 | char_bidirect=parameters['char_bidirect']; 756 | word_dim=parameters['word_dim']; 757 | word_lstm_dim=parameters['word_lstm_dim']; 758 | word_bidirect=parameters['word_bidirect']; 759 | lr_method=parameters['lr_method']; 760 | pre_emb=parameters['pre_emb']; 761 | crf=parameters['crf']; 762 | cap_dim=parameters['cap_dim']; 763 | training=parameters['training']; 764 | features=parameters['features']; 765 | 766 | # Training parameters 767 | n_words = len(self.id_to_word) 768 | n_chars = len(self.id_to_char) 769 | n_tags = len(self.id_to_tag) 770 | self.output_dim = len(self.id_to_tag); 771 | self.transitions = shared((self.output_dim+ 1, self.output_dim ), 'transitions') 772 | 773 | # Number of capitalization features 774 | if cap_dim: 775 | n_cap = 4 776 | 777 | if features is not None and features['lemma']['isUsed']: 778 | lemma_ids=T.ivector(name='lemma_ids'); 779 | if features is not None and features['pos']['isUsed']: 780 | pos_ids=T.ivector(name='pos_ids'); 781 | if features is not None and features['chunk']['isUsed']: 782 | chunk_ids=T.ivector(name='chunk_ids'); 783 | if features is not None and features['NER']['isUsed']: 784 | dic_ids=T.ivector(name='dic_ids'); 785 | 786 | # Network variables 787 | is_train = T.iscalar('is_train') 788 | word_ids = T.ivector(name='word_ids') 789 | char_for_ids = T.imatrix(name='char_for_ids') 790 | char_rev_ids = T.imatrix(name='char_rev_ids') 791 | char_pos_ids = T.ivector(name='char_pos_ids') 792 | tag_ids = T.ivector(name='tag_ids') 793 | if cap_dim: 794 | cap_ids = T.ivector(name='cap_ids') 795 | 796 | # Sentence length 797 | s_len = (word_ids if word_dim else char_pos_ids).shape[0] 798 | 799 | # Final input (all word features) 800 | input_dim = 0 801 | inputs = [] 802 | 803 | # Word inputs 804 | #{{{ 805 | if word_dim: 806 | input_dim += word_dim 807 | word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') 808 | word_input = word_layer.link(word_ids) 809 | #for attention 810 | inputs.append(word_input) 811 | # Initialize with pretrained embeddings 812 | if pre_emb and training: 813 | new_weights = word_layer.embeddings.get_value() 814 | print 'Loading pretrained embeddings from %s...' % pre_emb 815 | pretrained = {} 816 | emb_invalid = 0 817 | for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): 818 | line = line.rstrip().split() 819 | if len(line) == word_dim + 1: 820 | pretrained[line[0]] = np.array( 821 | [float(x) for x in line[1:]] 822 | ).astype(np.float32) 823 | else: 824 | emb_invalid += 1 825 | if emb_invalid > 0: 826 | print 'WARNING: %i invalid lines' % emb_invalid 827 | c_found = 0 828 | c_lower = 0 829 | c_zeros = 0 830 | # Lookup table initialization 831 | for i in xrange(n_words): 832 | word = self.id_to_word[i] 833 | if word in pretrained: 834 | new_weights[i] = pretrained[word] 835 | c_found += 1 836 | elif word.lower() in pretrained: 837 | new_weights[i] = pretrained[word.lower()] 838 | c_lower += 1 839 | elif re.sub('\d', '0', word.lower()) in pretrained: 840 | new_weights[i] = pretrained[ 841 | re.sub('\d', '0', word.lower()) 842 | ] 843 | c_zeros += 1 844 | word_layer.embeddings.set_value(new_weights) 845 | print 'Loaded %i pretrained embeddings.' % len(pretrained) 846 | print ('%i / %i (%.4f%%) words have been initialized with ' 847 | 'pretrained embeddings.') % ( 848 | c_found + c_lower + c_zeros, n_words, 849 | 100. 
* (c_found + c_lower + c_zeros) / n_words 850 | ) 851 | print ('%i found directly, %i after lowercasing, ' 852 | '%i after lowercasing + zero.') % ( 853 | c_found, c_lower, c_zeros 854 | )#}}} 855 | 856 | # Chars inputs 857 | #{{{ 858 | if char_dim: 859 | input_dim += char_lstm_dim 860 | char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') 861 | 862 | char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, 863 | name='char_lstm_for') 864 | char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, 865 | name='char_lstm_rev') 866 | 867 | char_lstm_for.link(char_layer.link(char_for_ids)) 868 | char_lstm_rev.link(char_layer.link(char_rev_ids)) 869 | 870 | char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ 871 | T.arange(s_len), char_pos_ids 872 | ] 873 | char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ 874 | T.arange(s_len), char_pos_ids 875 | ] 876 | 877 | inputs.append(char_for_output) 878 | if char_bidirect: 879 | inputs.append(char_rev_output) 880 | input_dim += char_lstm_dim 881 | #}}} 882 | 883 | # Capitalization feature 884 | # 885 | if cap_dim: 886 | input_dim += cap_dim 887 | cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') 888 | inputs.append(cap_layer.link(cap_ids)) 889 | 890 | # Prepare final input 891 | if len(inputs) != 1: 892 | inputs = T.concatenate(inputs, axis=1) 893 | 894 | # 895 | # Dropout on final input 896 | # 897 | if dropout: 898 | dropout_layer = DropoutLayer(p=dropout) 899 | input_train = dropout_layer.link(inputs) 900 | input_test = (1 - dropout) * inputs 901 | inputs = T.switch(T.neq(is_train, 0), input_train, input_test) 902 | 903 | # LSTM for words 904 | word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, 905 | name='word_lstm_for') 906 | word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, 907 | name='word_lstm_rev') 908 | word_lstm_for.link(inputs) 909 | word_lstm_rev.link(inputs[::-1, :]) 910 | word_for_output = word_lstm_for.h 911 | word_rev_output = word_lstm_rev.h[::-1, :] 912 | if word_bidirect: 913 | final_output = T.concatenate( 914 | [word_for_output, word_rev_output], 915 | axis=1 916 | ) 917 | tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, 918 | name='tanh_layer', activation='tanh') 919 | final_output = tanh_layer.link(final_output) 920 | else: 921 | final_output = word_for_output 922 | 923 | # Sentence to Named Entity tags - Score 924 | final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', 925 | activation=(None if crf else 'softmax')) 926 | tags_scores = final_layer.link(final_output) 927 | 928 | # No CRF 929 | if not crf: 930 | cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() 931 | # CRF 932 | else: 933 | 934 | #all_paths_scores = forward(observations, self.transitions) 935 | #cost = - (self.modelScore(tag_ids,tags_scores,s_len) - all_paths_scores) 936 | #real_path_score=self.modelScore(tag_ids,tags_scores,tag_ids.shape[0]) ; 937 | #error=real_path_score+self.noiseLoss(tags_scores,tag_ids,0.5); 938 | #cost=-error; 939 | #cost=self.likehoodLoss(tags_scores,tag_ids,observations,2) 940 | 941 | real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() 942 | 943 | # Score from transitions 944 | padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0) 945 | real_path_score += self.transitions[ 946 | padded_tags_ids[T.arange(s_len )], 947 | padded_tags_ids[T.arange(s_len ) + 1] 948 | ].sum() 949 | 950 | all_paths_scores = forward(tags_scores, self.transitions) 951 | cost = - (real_path_score - all_paths_scores) 952 | 953 | # 
Network parameters 954 | params = [] 955 | if word_dim: 956 | self.add_component(word_layer) 957 | params.extend(word_layer.params) 958 | if char_dim: 959 | self.add_component(char_layer) 960 | self.add_component(char_lstm_for) 961 | params.extend(char_layer.params) 962 | params.extend(char_lstm_for.params) 963 | if char_bidirect: 964 | self.add_component(char_lstm_rev) 965 | params.extend(char_lstm_rev.params) 966 | self.add_component(word_lstm_for) 967 | params.extend(word_lstm_for.params) 968 | if word_bidirect: 969 | self.add_component(word_lstm_rev) 970 | params.extend(word_lstm_rev.params) 971 | if cap_dim: 972 | self.add_component(cap_layer) 973 | params.extend(cap_layer.params) 974 | self.add_component(final_layer) 975 | params.extend(final_layer.params) 976 | if crf: 977 | self.add_component(self.transitions) 978 | params.append(self.transitions) 979 | if word_bidirect: 980 | self.add_component(tanh_layer) 981 | params.extend(tanh_layer.params) 982 | 983 | # Prepare train and eval inputs 984 | eval_inputs = [] 985 | if word_dim: 986 | eval_inputs.append(word_ids) 987 | if char_dim: 988 | eval_inputs.append(char_for_ids) 989 | if char_bidirect: 990 | eval_inputs.append(char_rev_ids) 991 | eval_inputs.append(char_pos_ids) 992 | if cap_dim: 993 | eval_inputs.append(cap_ids) 994 | train_inputs = eval_inputs + [tag_ids] 995 | 996 | # Parse optimization method parameters 997 | if "-" in lr_method: 998 | lr_method_name = lr_method[:lr_method.find('-')] 999 | lr_method_parameters = {} 1000 | for x in lr_method[lr_method.find('-') + 1:].split('-'): 1001 | split = x.split('_') 1002 | assert len(split) == 2 1003 | lr_method_parameters[split[0]] = float(split[1]) 1004 | else: 1005 | lr_method_name = lr_method 1006 | lr_method_parameters = {} 1007 | 1008 | # Compile training function 1009 | print 'Compiling...' 1010 | if training: 1011 | import optimizers ; 1012 | self.optimizer=optimizers.RMSprop(lr=0.001); 1013 | updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) 1014 | self.constraints={}; 1015 | #updates = self.optimizer.get_updates(params,self.constraints,cost); 1016 | f_train = theano.function( 1017 | inputs=train_inputs, 1018 | outputs=cost, 1019 | updates=updates, 1020 | givens=({is_train: np.cast['int32'](1)} if dropout else {}) 1021 | ) 1022 | #for debug 1023 | #f_Debug = theano.function( 1024 | # inputs=train_inputs, 1025 | # outputs=cost, 1026 | # updates=self.update, 1027 | # givens=({is_train: np.cast['int32'](1)} if dropout else {}) 1028 | #) 1029 | #debug end 1030 | else: 1031 | f_train = None 1032 | 1033 | # Compile evaluation function 1034 | if not crf: 1035 | f_eval = theano.function( 1036 | inputs=eval_inputs, 1037 | outputs=tags_scores, 1038 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 1039 | ) 1040 | else: 1041 | f_eval = theano.function( 1042 | inputs=eval_inputs, 1043 | outputs=forward(tags_scores, self.transitions, viterbi=True, 1044 | return_alpha=False, return_best_sequence=True), 1045 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 1046 | ) 1047 | 1048 | return f_train, f_eval 1049 | #}}} 1050 | -------------------------------------------------------------------------------- /src/nn.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | from utils import shared 4 | import numpy as np 5 | 6 | class HiddenLayer(object): 7 | #{{{ 8 | """ 9 | Hidden layer with or without bias. 
10 | Input: tensor of dimension (dims*, input_dim) 11 | Output: tensor of dimension (dims*, output_dim) 12 | """ 13 | def __init__(self, input_dim, output_dim, bias=True, activation='sigmoid', 14 | name='hidden_layer'): 15 | self.input_dim = input_dim 16 | self.output_dim = output_dim 17 | self.bias = bias 18 | self.name = name 19 | if activation is None: 20 | self.activation = None 21 | elif activation == 'tanh': 22 | self.activation = T.tanh 23 | elif activation == 'sigmoid': 24 | self.activation = T.nnet.sigmoid 25 | elif activation == 'softmax': 26 | self.activation = T.nnet.softmax 27 | else: 28 | raise Exception("Unknown activation function: " % activation) 29 | 30 | # Initialize weights and bias 31 | self.weights = shared((input_dim, output_dim), name + '_weights') 32 | self.bias = shared((output_dim,), name + '_bias') 33 | 34 | # Define parameters 35 | if self.bias: 36 | self.params = [self.weights, self.bias] 37 | else: 38 | self.params = [self.weights] 39 | 40 | def link(self, input): 41 | """ 42 | The input has to be a tensor with the right 43 | most dimension equal to input_dim. 44 | """ 45 | self.input = input 46 | self.linear_output = T.dot(self.input, self.weights) 47 | if self.bias: 48 | self.linear_output = self.linear_output + self.bias 49 | if self.activation is None: 50 | self.output = self.linear_output 51 | else: 52 | self.output = self.activation(self.linear_output) 53 | return self.output 54 | #}}} 55 | 56 | class EmbeddingLayer(object): 57 | #{{{ 58 | """ 59 | Embedding layer: word embeddings representations 60 | Input: tensor of dimension (dim*) with values in range(0, input_dim) 61 | Output: tensor of dimension (dim*, output_dim) 62 | """ 63 | 64 | def __init__(self, input_dim, output_dim, name='embedding_layer'): 65 | """ 66 | Typically, input_dim is the vocabulary size, 67 | and output_dim the embedding dimension. 68 | """ 69 | self.input_dim = input_dim 70 | self.output_dim = output_dim 71 | self.name = name 72 | 73 | # Randomly generate weights 74 | self.embeddings = shared((input_dim, output_dim), 75 | self.name + '__embeddings') 76 | 77 | # Define parameters 78 | self.params = [self.embeddings] 79 | 80 | def link(self, input): 81 | """ 82 | Return the embeddings of the given indexes. 83 | Input: tensor of shape (dim*) 84 | Output: tensor of shape (dim*, output_dim) 85 | """ 86 | self.input = input 87 | self.output = self.embeddings[self.input] 88 | return self.output 89 | #}}} 90 | 91 | 92 | class DropoutLayer(object): 93 | #{{{ 94 | """ 95 | Dropout layer. Randomly set to 0 values of the input 96 | with probability p. 97 | """ 98 | def __init__(self, p=0.5, name='dropout_layer'): 99 | """ 100 | p has to be between 0 and 1 (1 excluded). 101 | p is the probability of dropping out a unit, so 102 | setting p to 0 is equivalent to have an identity layer. 103 | """ 104 | assert 0. <= p < 1. 105 | self.p = p 106 | self.rng = T.shared_randomstreams.RandomStreams(seed=123456) 107 | self.name = name 108 | 109 | def link(self, input): 110 | """ 111 | Dropout link: we just apply mask to the input. 
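The dropout convention used in DropoutLayer above, combined with the `input_test = (1 - dropout) * inputs` lines in model.py, keeps activations unscaled while training and rescales them by 1 - p at test time so that the expected value matches; a small numpy check (the p value is arbitrary):

import numpy as np

p = 0.5                                    # arbitrary dropout probability
x = np.ones(100000)
mask = np.random.binomial(1, 1 - p, size=x.shape)
print((x * mask).mean())                   # close to 1 - p (training-time masking)
print(((1 - p) * x).mean())                # exactly 1 - p (test-time rescaling)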
112 | """ 113 | if self.p > 0: 114 | mask = self.rng.binomial(n=1, p=1-self.p, size=input.shape, 115 | dtype=theano.config.floatX) 116 | self.output = input * mask 117 | else: 118 | self.output = input 119 | 120 | return self.output 121 | #}}} 122 | 123 | from keras import activations; 124 | from keras import backend as K; 125 | from keras import initializers as initializations; 126 | 127 | class Layer(object): 128 | def __init__(self): 129 | self.build(); 130 | return; 131 | def build(self): 132 | return; 133 | 134 | class Convolution1D(Layer): 135 | #{{{ 136 | def __init__(self,nb_filter,filter_length,input_dim,init='glorot_uniform', 137 | activation=None,border_mode='valid',subsample_length=1, 138 | bias=True, 139 | name='Convolution1D'): 140 | #{{{ 141 | self.nb_filter = nb_filter 142 | self.filter_length = filter_length 143 | self.init = initializations.get(init, dim_ordering='th') 144 | self.activation = activations.get(activation) 145 | self.border_mode = border_mode 146 | self.subsample_length = subsample_length 147 | 148 | self.subsample = (subsample_length, 1) 149 | self.bias=bias; 150 | self.input_dim = input_dim 151 | self.name=name; 152 | 153 | super(Convolution1D,self).__init__(); 154 | #}}} 155 | def build(self): 156 | #{{{ 157 | self.W_shape=(self.filter_length,1,self.input_dim,self.nb_filter); 158 | 159 | self.W=self.init(self.W_shape,name='{}_W'.format(self.name)); 160 | if self.bias: 161 | init=initializations.get('zero'); 162 | self.b=init((self.nb_filter,), 163 | name='{}_b'.format(self.name)); 164 | 165 | self.params=[self.W,self.b]; 166 | #}}} 167 | def call(self,x): 168 | #{{{ 169 | x=K.expand_dims(x,0); 170 | x=K.expand_dims(x,2); 171 | output=K.conv2d(x,self.W,strides=self.subsample, 172 | border_mode=self.border_mode, 173 | dim_ordering='tf'); 174 | output=K.squeeze(output,2); 175 | if self.bias: 176 | output+=K.reshape(self.b,(1,1,self.nb_filter)); 177 | output=self.activation(output); 178 | output=K.squeeze(output,0); 179 | return output; 180 | #}}} 181 | #}}} 182 | 183 | class LSTM(object): 184 | #{{{ 185 | #{{{ 186 | """ 187 | Long short-term memory (LSTM). Can be used with or without batches. 188 | Without batches: 189 | Input: matrix of dimension (sequence_length, input_dim) 190 | Output: vector of dimension (output_dim) 191 | With batches: 192 | Input: tensor3 of dimension (batch_size, sequence_length, input_dim) 193 | Output: matrix of dimension (batch_size, output_dim) 194 | """ 195 | #}}} 196 | def __init__(self, input_dim, output_dim, with_batch=True, 197 | activation='tanh',inner_activation='hard_sigmoid', 198 | name='LSTM'): 199 | #{{{ 200 | """ 201 | Initialize neural network. 
202 | """ 203 | self.input_dim = input_dim 204 | self.output_dim = output_dim; 205 | self.with_batch = with_batch 206 | self.name = name 207 | self.inner_activation=activations.get(inner_activation); 208 | self.activation=activations.get(activation); 209 | self.build(); 210 | #}}} 211 | def build(self): 212 | #{{{ 213 | self.W=shared((self.input_dim,self.output_dim*3),name='{}_W'.format(self.name)); 214 | self.U=shared((self.output_dim,self.output_dim*3),name='{}_U'.format(self.name)); 215 | self.w_ci = shared((self.output_dim, self.output_dim), name='{}_w_ci'.format(self.name) ) 216 | self.w_co = shared((self.output_dim, self.output_dim), name='{}_w_co'.format(self.name) ) 217 | self.b=shared((self.output_dim*3,),name='{}_b'.format(self.name)); 218 | self.c_0 = shared((self.output_dim,), name='{}_c_0'.format(self.name) ) 219 | self.h_0 = shared((self.output_dim,), name='{}_h_0'.format(self.name) ) 220 | self.params=[self.W,self.U, 221 | self.w_ci,self.w_co,self.b, 222 | self.c_0,self.h_0, 223 | ]; 224 | #}}} 225 | 226 | def get_initial_states(self, x): 227 | # build an all-zero tensor of shape (samples, output_dim) 228 | initial_state = K.zeros_like(x) # (samples, timesteps, input_dim) 229 | initial_state = K.sum(initial_state, axis=(1, 2)) # (samples,) 230 | initial_state = K.expand_dims(initial_state) # (samples, 1) 231 | initial_state = K.tile(initial_state, [1, self.output_dim]) # (samples, output_dim) 232 | initial_states = [initial_state for _ in range(len(self.states))] 233 | return initial_states 234 | def step(self,x, h_tm1,c_tm1): 235 | #{{{ 236 | z=T.dot(x,self.W)+T.dot(h_tm1,self.U)+self.b; 237 | if self.with_batch: 238 | z_i=z[:,:self.output_dim]; 239 | z_c=z[:,self.output_dim:2*self.output_dim]; 240 | z_o=z[:,2*self.output_dim:]; 241 | else: 242 | z_i=z[:self.output_dim]; 243 | z_c=z[self.output_dim:2*self.output_dim]; 244 | z_o=z[2*self.output_dim:]; 245 | 246 | i_t = self.inner_activation(z_i + 247 | T.dot(c_tm1, self.w_ci)) 248 | # f_t = T.nnet.sigmoid(T.dot(x_t, self.w_xf) + 249 | # T.dot(h_tm1, self.w_hf) + 250 | # T.dot(c_tm1, self.w_cf) + 251 | # self.b_f) 252 | c_t = (1 - i_t) * c_tm1 + i_t * self.activation(z_c) 253 | o_t = self.inner_activation(z_o + 254 | T.dot(c_t, self.w_co)) 255 | h_t = o_t * self.activation(c_t) 256 | return h_t,c_t 257 | #}}} 258 | def link(self, input): 259 | #{{{ 260 | """ 261 | Propagate the input through the network and return the last hidden 262 | vector. The whole sequence is also accessible via self.h, but 263 | where self.h of shape (sequence_length, batch_size, output_dim) 264 | """ 265 | 266 | # If we use batches, we have to permute the first and second dimension. 267 | if self.with_batch: 268 | self.input = input.dimshuffle(1, 0, 2) 269 | initial_states = [T.alloc(x, self.input.shape[1], self.output_dim) 270 | for x in [self.h_0, self.c_0]] 271 | else: 272 | self.input = input 273 | initial_states = [self.h_0, self.c_0] 274 | 275 | [h,c], _ = theano.scan( 276 | fn=self.step, 277 | sequences=self.input, 278 | outputs_info=initial_states, 279 | ) 280 | self.h = h 281 | self.c=c 282 | self.output = h[-1] 283 | 284 | return self.output 285 | #}}} 286 | #}}} 287 | 288 | class LSTM_normal(object): 289 | #{{{ 290 | #{{{ 291 | """ 292 | Long short-term memory (LSTM). Can be used with or without batches. 
293 | Without batches: 294 | Input: matrix of dimension (sequence_length, input_dim) 295 | Output: vector of dimension (output_dim) 296 | With batches: 297 | Input: tensor3 of dimension (batch_size, sequence_length, input_dim) 298 | Output: matrix of dimension (batch_size, output_dim) 299 | """ 300 | #}}} 301 | def __init__(self, input_dim, output_dim, with_batch=True, 302 | activation='tanh',inner_activation='hard_sigmoid', 303 | name='LSTM_normal'): 304 | #{{{ 305 | """ 306 | Initialize neural network. 307 | """ 308 | self.input_dim = input_dim 309 | self.output_dim = output_dim; 310 | self.with_batch = with_batch 311 | self.name = name 312 | self.inner_activation=activations.get(inner_activation); 313 | self.forget_bias_init = initializations.get('one') 314 | self.activation=activations.get(activation); 315 | self.build(); 316 | #}}} 317 | def build(self): 318 | #{{{ 319 | import numpy as np; 320 | self.W = shared((self.input_dim, 4 * self.output_dim), 321 | name='{}_W'.format(self.name)) 322 | self.U = shared((self.output_dim, 4 * self.output_dim), 323 | name='{}_U'.format(self.name)) 324 | 325 | self.b = K.variable(np.hstack((np.zeros(self.output_dim), 326 | K.get_value(self.forget_bias_init( 327 | (self.output_dim,))), 328 | np.zeros(self.output_dim), 329 | np.zeros(self.output_dim))), 330 | name='{}_b'.format(self.name)) 331 | #self.c_0 = shared((self.output_dim,), name='{}_c_0'.format(self.name) ) 332 | #self.h_0 = shared((self.output_dim,), name='{}_h_0'.format(self.name) ) 333 | self.c_0=np.zeros(self.output_dim).astype(theano.config.floatX); 334 | self.h_0=np.zeros(self.output_dim).astype(theano.config.floatX); 335 | self.params=[self.W,self.U, 336 | self.b, 337 | # self.c_0,self.h_0 338 | ]; 339 | #}}} 340 | def step(self,x, h_tm1,c_tm1): 341 | #{{{ 342 | z = K.dot(x , self.W) + K.dot(h_tm1 , self.U) + self.b 343 | if self.with_batch: 344 | z0 = z[:,:self.output_dim] 345 | z1 = z[:,self.output_dim: 2 * self.output_dim] 346 | z2 = z[:,2 * self.output_dim: 3 * self.output_dim] 347 | z3 = z[:,3 * self.output_dim:] 348 | else: 349 | z0 = z[:self.output_dim] 350 | z1 = z[self.output_dim: 2 * self.output_dim] 351 | z2 = z[2 * self.output_dim: 3 * self.output_dim] 352 | z3 = z[3 * self.output_dim:] 353 | 354 | 355 | i = self.inner_activation(z0) 356 | f = self.inner_activation(z1) 357 | c = f * c_tm1 + i * self.activation(z2) 358 | o = self.inner_activation(z3) 359 | h=o*self.activation(c); 360 | return h,c; 361 | #}}} 362 | 363 | def link(self, input): 364 | #{{{ 365 | """ 366 | Propagate the input through the network and return the last hidden 367 | vector. The whole sequence is also accessible via self.h, but 368 | where self.h of shape (sequence_length, batch_size, output_dim) 369 | """ 370 | 371 | # If we use batches, we have to permute the first and second dimension. 
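The step() above is the standard LSTM cell with one fused projection of width 4 * output_dim, split into input, forget, candidate and output blocks (the forget-gate bias is initialised to one in build()). A minimal NumPy sketch of a single step, using a plain logistic sigmoid in place of hard_sigmoid and placeholder sizes (illustrative only):

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def lstm_step(x, h_prev, c_prev, W, U, b, n):
    # one fused affine transform, then the usual gate arithmetic
    z = x.dot(W) + h_prev.dot(U) + b
    i, f = sigmoid(z[:n]), sigmoid(z[n:2 * n])
    c_hat, o = np.tanh(z[2 * n:3 * n]), sigmoid(z[3 * n:])
    c = f * c_prev + i * c_hat
    h = o * np.tanh(c)
    return h, c

n_in, n_out = 3, 4
rng = np.random.RandomState(0)
W, U = rng.randn(n_in, 4 * n_out), rng.randn(n_out, 4 * n_out)
b = np.zeros(4 * n_out)
h, c = lstm_step(rng.randn(n_in), np.zeros(n_out), np.zeros(n_out), W, U, b, n_out)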
372 | self.input = input 373 | if self.with_batch: 374 | self.input = input.dimshuffle(1, 0, 2) 375 | initial_states = [T.alloc(x, self.input.shape[1], self.output_dim) 376 | for x in [self.h_0, self.c_0]] 377 | else: 378 | self.input = input 379 | initial_states = [self.h_0, self.c_0] 380 | step_function=self.step; 381 | 382 | [h,c], _ = theano.scan( 383 | fn=step_function, 384 | sequences=self.input, 385 | outputs_info=initial_states, 386 | ) 387 | self.h = h 388 | self.output = h[-1] 389 | 390 | return self.output 391 | #}}} 392 | #}}} 393 | 394 | class AttentionLSTM(LSTM): 395 | def build(self): 396 | #{{{ 397 | super(AttentionLSTM,self).build() ; 398 | self.W_A=shared((self.input_dim+self.output_dim,1),name='{}_W_A'.format(self.name)); 399 | self.b_A=shared((1,),name='{}_b_A'.format(self.name)); 400 | self.params+=[self.W_A,self.b_A]; 401 | #}}} 402 | def step(self, h_tm1,c_tm1,x): 403 | #{{{ 404 | assert x.ndim==2; 405 | H=x; 406 | input_length=x.shape[0]; 407 | C=T.repeat(c_tm1.reshape((1,-1)),input_length,axis=0); 408 | _HC=K.concatenate([H,C]); 409 | energy=T.dot(_HC,self.W_A.reshape((-1,1)))+self.b_A; 410 | energy=K.softmax(energy.reshape((1,-1))); 411 | x=(H*energy.reshape((-1,1))).sum(axis=0) 412 | 413 | h_t,c_t=super(AttentionLSTM,self).step_noBatch(x,h_tm1,c_tm1); 414 | return h_t,c_t 415 | #}}} 416 | def link(self, input): 417 | #{{{ 418 | """ 419 | Propagate the input through the network and return the last hidden 420 | vector. The whole sequence is also accessible via self.h, but 421 | where self.h of shape (sequence_length, batch_size, output_dim) 422 | """ 423 | 424 | # If we use batches, we have to permute the first and second dimension. 425 | if self.with_batch: 426 | assert 0,"AttentionLSTM not implement with_batch"; 427 | else: 428 | self.input=input; 429 | initial_states = [self.h_0, self.c_0] 430 | 431 | step_function=self.step; 432 | 433 | [h,c], _ = theano.scan( 434 | fn=step_function, 435 | outputs_info=initial_states, 436 | non_sequences=[self.input], 437 | n_steps=self.input.shape[0] 438 | ) 439 | self.h = h 440 | self.output = self.h[-1] 441 | 442 | return self.output 443 | #}}} 444 | 445 | class AttentionLSTM2(AttentionLSTM): 446 | #{{{ 447 | def __init__(self,attended_dim,wordInput_dim, 448 | combineOput_dim,output_dim, **kwargs): 449 | #{{{ 450 | self.attendedInput_dim=attended_dim; 451 | self.wordInput_dim=wordInput_dim; 452 | self.combineOput_dim=combineOput_dim; 453 | super(AttentionLSTM2, self).__init__(output_dim=output_dim, 454 | input_dim=combineOput_dim, 455 | **kwargs) 456 | #}}} 457 | def build(self): 458 | #{{{ 459 | if self.input_dim is None: 460 | self.input_dim=self.combineOput_dim; 461 | super(AttentionLSTM,self).build() ; 462 | #attention weight 463 | self.W_A=shared((self.attendedInput_dim+self.output_dim,1),name='{}_W_A'.format(self.name)); 464 | self.b_A=shared((1,),name='{}_b_A'.format(self.name)); 465 | 466 | #combine weight 467 | self.W_combine=shared((self.attendedInput_dim+self.wordInput_dim, 468 | self.combineOput_dim), 469 | name='{}_W_combine'.format(self.name)); 470 | self.b_combine=shared((self.combineOput_dim,), 471 | name='{}_b_combine'.format(self.name)); 472 | self.params+=[self.W_A,self.b_A]; 473 | self.params+=[self.W_combine,self.b_combine]; 474 | 475 | #}}} 476 | def step(self, word,h_tm1,c_tm1,x): 477 | #{{{ 478 | H=x; 479 | input_length=x.shape[0]; 480 | C=T.repeat(c_tm1.reshape((1,-1)),input_length,axis=0); 481 | _HC=K.concatenate([H,C]); 482 | energy=T.dot(_HC,self.W_A.reshape((-1,1)))+self.b_A; 483 | 
energy=K.softmax(energy.reshape((1,-1))); 484 | x=(H*energy.reshape((-1,1))).sum(axis=0) 485 | 486 | #combine glimpsed with word; 487 | combine=K.concatenate([x,word]); 488 | combined=K.dot(combine,self.W_combine)+self.b_combine; 489 | #original LSTM step 490 | h_t,c_t=super(AttentionLSTM,self).step_noBatch(combined,h_tm1,c_tm1); 491 | return h_t,c_t 492 | #}}} 493 | def link(self, input,words): 494 | #{{{ 495 | """ 496 | Propagate the input through the network and return the last hidden 497 | vector. The whole sequence is also accessible via self.h, but 498 | where self.h of shape (sequence_length, batch_size, output_dim) 499 | """ 500 | 501 | # If we use batches, we have to permute the first and second dimension. 502 | if self.with_batch: 503 | assert 0,"AttentionLSTM not implement with_batch"; 504 | else: 505 | self.input = input 506 | initial_states = [self.h_0, self.c_0] 507 | 508 | step_function=self.step; 509 | 510 | [h,c], _ = theano.scan( 511 | fn=step_function, 512 | sequences=[words], 513 | outputs_info=initial_states, 514 | non_sequences=[self.input], 515 | ) 516 | self.h = h 517 | self.output = h[-1] 518 | 519 | return self.output 520 | #}}} 521 | 522 | #}}} 523 | 524 | class AttentionLSTM3(LSTM): 525 | #{{{ 526 | def __init__(self,attended_dim,wordInput_dim, 527 | output_dim,mode='concat', **kwargs): 528 | #{{{ 529 | self.attendedInput_dim=attended_dim; 530 | self.wordInput_dim=wordInput_dim; 531 | self.attendedMode=mode; 532 | self.init=initializations.get('glorot_uniform'); 533 | super(AttentionLSTM3, self).__init__(output_dim=output_dim, 534 | input_dim=attended_dim+wordInput_dim, 535 | **kwargs) 536 | #}}} 537 | def build(self): 538 | #{{{ 539 | if self.input_dim is None: 540 | self.input_dim=self.combineOput_dim; 541 | super(AttentionLSTM3,self).build() ; 542 | #attention weight 543 | self.W_A_X=shared((self.attendedInput_dim,self.output_dim), 544 | name='{}_W_A_X'); 545 | #self.b_A_X=shared((self.output_dim,), 546 | # name='{}_b_A_X'); 547 | self.W_A_h=shared((self.output_dim,self.output_dim), 548 | name='{}_W_A_h'); 549 | #self.b_A_h=shared((self.output_dim,), 550 | # name='{}_b_A_h'); 551 | self.W_A=self.init((self.output_dim,),name='{}_W_A'.format(self.name)); 552 | #self.b_A=shared((1,),name='{}_b_A'.format(self.name)); 553 | self.params+=[self.W_A_X, 554 | #self.b_A_X, 555 | self.W_A_h, 556 | #self.b_A_h, 557 | self.W_A, 558 | #self.b_A, 559 | ]; 560 | 561 | 562 | #}}} 563 | def step(self, word,index,energy_tm1,h_tm1,c_tm1,x): 564 | #{{{ 565 | #attention 566 | H=x; 567 | if self.attendedMode is "concat": 568 | M_X=T.dot(x,self.W_A_X)#+self.b_A_X; 569 | M_state=T.dot(self.W_A_h,c_tm1)#+self.b_A_h; 570 | M=T.tanh(M_X+M_state) 571 | _energy=T.dot(M,self.W_A.T)#+self.b_A; 572 | elif self.attendedMode is "dot": 573 | energy=None; 574 | assert 0,"not implement"; 575 | elif self.attendedMode is "general": 576 | M_X=T.dot(x,self.W_A_X)#+self.b_A_X; 577 | M_state=T.dot(self.W_A_h,c_tm1)#+self.b_A_h; 578 | M=T.tanh(M_X*M_state); 579 | _energy=T.dot(M,self.W_A.T)#+self.b_A; 580 | #mask 581 | mask=T.zeros((1,x.shape[0]),dtype=theano.config.floatX); 582 | energy=T.nnet.softmax(_energy[:index+1]); 583 | masked_energy=T.set_subtensor(mask[0,:index+1],energy.flatten()); 584 | glimpsed=(masked_energy.T*H).sum(axis=0) 585 | #combine glimpsed with word; 586 | if self.wordInput_dim==0: 587 | combined=glimpsed; 588 | else: 589 | combine=K.concatenate([glimpsed,word]); 590 | combined=combine; 591 | #original LSTM step 592 | h_t,c_t=super(AttentionLSTM3,self).step(combined,h_tm1,c_tm1); 
593 | return masked_energy.flatten(),h_t,c_t 594 | #}}} 595 | def link(self, input,words): 596 | #{{{ 597 | """ 598 | Propagate the input through the network and return the last hidden 599 | vector. The whole sequence is also accessible via self.h, but 600 | where self.h of shape (sequence_length, batch_size, output_dim) 601 | """ 602 | 603 | # If we use batches, we have to permute the first and second dimension. 604 | if self.with_batch: 605 | assert 0,"AttentionLSTM not implement with_batch"; 606 | else: 607 | self.input = input 608 | initial_states = [self.h_0, self.c_0] 609 | 610 | step_function=self.step; 611 | 612 | [e,h,c], _ = theano.scan( 613 | fn=step_function, 614 | sequences=[words,T.arange(words.shape[0])], 615 | outputs_info=[T.zeros((input.shape[0],), 616 | dtype=theano.config.floatX)]+initial_states, 617 | non_sequences=[self.input], 618 | ) 619 | self.h = h 620 | self.output = h[-1] 621 | self.e=e; 622 | self.c=c; 623 | return self.output 624 | #}}} 625 | 626 | #}}} 627 | 628 | class AttentionLayer(Layer): 629 | def __init__(self,attended_dim,state_dim, 630 | source_dim,scoreFunName='Euclidean', 631 | atten_activation='tanh',name='AttentionLayer'): 632 | #{{{ 633 | self.attended_dim=attended_dim; 634 | self.state_dim=state_dim; 635 | self.source_dim=source_dim; 636 | self.init=initializations.get('glorot_uniform'); 637 | self.name=name; 638 | self.one_init=initializations.get('one'); 639 | self.atten_activation=activations.get(atten_activation); 640 | self.scoreFunName=scoreFunName; 641 | self.eps=1e-5; 642 | #self.source_dim=glimpsed_dim; 643 | super(AttentionLayer,self).__init__(); 644 | #}}} 645 | def euclideanScore(self,attended,state,W): 646 | #{{{ 647 | #Euclidean distance 648 | M=(attended-state)**2; 649 | M=T.dot(M,W); 650 | _energy=M.max()-M; 651 | return _energy; 652 | #}}} 653 | def manhattenScore(self,attended,state,W): 654 | #{{{ 655 | #Manhattan Distance 656 | #eps for avoid gradient to be NaN; 657 | M=T.abs_(T.maximum(attended-state,self.eps)); 658 | M=T.dot(M,W); 659 | _energy=M.max()-M; 660 | return _energy; 661 | #}}} 662 | def bilinearScore(self,attended,state,W): 663 | #{{{ 664 | #Bilinear function 665 | M=(attended*state*W).sum(axis=-1); 666 | _energy=self.atten_activation(M); 667 | return _energy; 668 | #}}} 669 | def forwardNNScore(self,attended,state,W): 670 | #{{{ 671 | #get weights 672 | W_1=W[:(self.attended_dim+self.state_dim)*self.state_dim]; 673 | W_1=W_1.reshape((self.attended_dim+self.state_dim,self.state_dim)); 674 | W_2=W[(self.attended_dim+self.state_dim)*self.state_dim:]; 675 | 676 | #forward neural network 677 | state_=T.repeat(state.reshape((1,-1)),attended.shape[0],axis=0); 678 | input=T.concatenate([attended,state_],axis=-1); 679 | M1=self.atten_activation(T.dot(input,W_1)); 680 | M2=self.atten_activation(T.dot(M1,W_2)); 681 | _energy=M2; 682 | return _energy; 683 | #}}} 684 | def CNNScore(self,attended,state,W): 685 | #{{{ 686 | state_=T.repeat(state.reshape((1,-1)),attended.shape[0],axis=0); 687 | input=T.concatenate([attended,state_],axis=-1); 688 | M1=self.CNN1.call(input); 689 | M2=self.CNN2.call(M1); 690 | _energy=M2.flatten(); 691 | return _energy; 692 | #}}} 693 | def CosineScore(self,attended,state,W): 694 | #{{{ 695 | dotProduct=T.dot(attended,state.T); 696 | Al2Norm=T.sqrt((attended**2).sum(axis=-1)); 697 | Bl2Norm=T.sqrt((state**2).sum(axis=-1)); 698 | M=dotProduct/(Al2Norm*Bl2Norm); 699 | _energy=T.exp(M+2); 700 | return _energy; 701 | #}}} 702 | def vanilaScore(self,attended,state,W): 703 | """ 704 | the origin score 
proprosed by Bahdanau 2015 705 | """ 706 | 707 | def build(self): 708 | #{{{ 709 | self.W_A_X=shared((self.attended_dim,self.attended_dim), 710 | name='{}_W_A_X'.format(self.name)); 711 | self.b_A_X=shared((self.attended_dim,), 712 | name='{}_W_A_b'.format(self.name)); 713 | self.W_A_h=shared((self.attended_dim,self.attended_dim), 714 | name='{}_W_A_h'.format(self.name)); 715 | self.W_A_combine=shared((self.source_dim*2, 716 | self.source_dim), 717 | name='{}_W_A_combine'.format(self.name)); 718 | self.b_A_combine=shared((self.source_dim,), 719 | name='{}_b_A_combine'.format(self.name)) 720 | #self.W_A_combine=shared((self.source_dim, 721 | # self.source_dim), 722 | # name='{}_W_A_combine'.format(self.name)); 723 | #self.b_A_combine=shared((self.source_dim,), 724 | # name='{}_b_A_combine'.format(self.name)) 725 | #use constraint 726 | self.constraints={} 727 | 728 | self.params=[ 729 | self.W_A_X,self.b_A_X, 730 | # self.W_A_h, 731 | self.W_A_combine,self.b_A_combine 732 | ]; 733 | 734 | #for attention weight and score function 735 | if self.scoreFunName == "Euclidean": 736 | #{{{ 737 | self.W_A=shared((self.state_dim,), 738 | name='{}_W_A'.format(self.name)); 739 | self.W_A.set_value(np.ones((self.state_dim,),dtype=theano.config.floatX)); 740 | self.constraints[self.W_A]=self.NonNegConstraint; 741 | self.scoreFun=self.euclideanScore; 742 | self.params.append(self.W_A); 743 | #}}} 744 | elif self.scoreFunName == "Bilinear": 745 | #{{{ 746 | assert self.attended_dim==self.state_dim,"in Bilinear score function,"\ 747 | " attended_dim must be equal to state_dim" 748 | self.W_A=self.init((self.state_dim,), 749 | name="{}_W_A".format(self.name)); 750 | self.scoreFun=self.bilinearScore; 751 | self.params.append(self.W_A); 752 | #}}} 753 | elif self.scoreFunName == "forwardNN": 754 | #{{{ 755 | #this is two layer NN 756 | #first layer (attended_dim+state_dim,state_dim); 757 | #second layer (state_dim,1); 758 | self.W_A=shared(((self.attended_dim+self.state_dim)\ 759 | *self.state_dim+self.state_dim,), 760 | name="{}_W_A".format(self.name)); 761 | self.scoreFun=self.forwardNNScore; 762 | self.params.append(self.W_A); 763 | #}}} 764 | elif self.scoreFunName == "CNN": 765 | #{{{ 766 | #this if one layer CNN and pool layer; 767 | nb_filter=(self.attended_dim+self.state_dim)/2; 768 | filter_length=3; 769 | input_dim=self.attended_dim+self.state_dim; 770 | self.CNN1=Convolution1D(nb_filter=nb_filter, 771 | filter_length=filter_length, 772 | input_dim=input_dim,activation='tanh', 773 | border_mode='same'); 774 | self.CNN2=Convolution1D(nb_filter=1, 775 | filter_length=filter_length, 776 | input_dim=nb_filter,activation='tanh', 777 | border_mode='same'); 778 | self.W_A=self.CNN1.W; 779 | self.scoreFun=self.CNNScore; 780 | self.params.append(self.W_A); 781 | self.params.append(self.CNN2.W); 782 | #}}} 783 | elif self.scoreFunName == "Cosine": 784 | #{{{ 785 | self.scoreFun=self.CosineScore; 786 | self.W_A=None; 787 | #}}} 788 | elif self.scoreFunName == "Manhatten": 789 | #{{{ 790 | self.scoreFun=self.manhattenScore; 791 | self.W_A=self.one_init((self.state_dim,), 792 | name='{}_W_A'.format(self.name)); 793 | self.constraints[self.W_A]=self.NonNegConstraint; 794 | self.params.append(self.W_A); 795 | #}}} 796 | else: 797 | assert 0, "we only have Euclidean, Bilinear, forwardNN"\ 798 | " score function for attention"; 799 | 800 | #}}} 801 | def softmaxReScale(self,energy_,threshould): 802 | #{{{ 803 | #in energy_, the goundthrud should be max 804 | assert energy_.ndim==1; 805 | #convert threshould from 
percentage to energy_; 806 | threshould_=T.log(T.exp(energy_-energy_.max()).sum())+T.log(threshould)+energy_.max() 807 | energy=self.reScale(energy_,threshould_); 808 | return T.nnet.softmax(energy); 809 | #}}} 810 | def reScale(self,energy,threshold,replaceValue=1e-7): 811 | #{{{ 812 | assert energy.ndim==1; 813 | maxValue=energy.max(); 814 | def checkThreshold(value,threshold,replaceValue): 815 | return T.switch(T.lt(value,threshold),replaceValue,value); 816 | result,update=theano.scan(fn=checkThreshold, 817 | outputs_info=None, 818 | sequences=[energy], 819 | non_sequences=[threshold,replaceValue]); 820 | return T.switch(T.lt(maxValue,threshold),energy,result); 821 | #}}} 822 | 823 | def step(self,state,attended,source): 824 | #from theano.gradient import disconnected_grad; 825 | #state=disconnected_grad(state_); 826 | #M_state=T.dot(self.W_A_h,state) ; 827 | 828 | _energy=self.scoreFun(attended,state,self.W_A) 829 | energy=T.nnet.softmax(_energy); 830 | #energy=self.softmaxReScale(_energy,0.02); 831 | #energy=self.reScale(energy.flatten(),0.02).reshape((1,-1)) 832 | #energyIndex=energy.flatten().argmin(axis=-1); 833 | glimpsed=(energy.T*source).sum(axis=0) 834 | #glimpsed=source[energyIndex]; 835 | return energy.flatten(),glimpsed; 836 | 837 | def NonNegConstraint(self,p): 838 | p*=K.cast(p>=0.,K.floatx()); 839 | return p; 840 | 841 | def link(self,attended,state,source): 842 | step_function=self.step; 843 | attended_=T.tanh(T.dot(attended,self.W_A_X))+self.b_A_X; 844 | #attended_=attended; 845 | [energy,glimpsed],_=theano.scan(fn=step_function, 846 | sequences=[attended_], 847 | outputs_info=None, 848 | non_sequences=[attended_,source]); 849 | self.energy=energy; 850 | 851 | #combine 852 | #combine=T.concatenate([glimpsed,attended],axis=-1); 853 | combine=T.concatenate([glimpsed,source],axis=-1); 854 | combined=T.tanh(T.dot(combine,self.W_A_combine))+self.b_A_combine; 855 | #no source 856 | #combined=T.tanh(T.dot(glimpsed,self.W_A_combine))+self.b_A_combine; 857 | return combined; 858 | 859 | def log_sum_exp(x, axis=None): 860 | """ 861 | Sum probabilities in the log-space. 862 | """ 863 | xmax = x.max(axis=axis, keepdims=True) 864 | xmax_ = x.max(axis=axis) 865 | return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis)) 866 | 867 | def forward(observations, transitions, viterbi=False, 868 | return_alpha=False, return_best_sequence=False): 869 | """ 870 | Takes as input: 871 | - observations, sequence of shape (n_steps, n_classes) 872 | - transitions, sequence of shape (n_classes, n_classes) 873 | Probabilities must be given in the log space. 
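AttentionLayer.step() above scores every attended position against the current state, turns the scores into a softmax distribution, and returns the weighted sum of source as the glimpse; link() additionally passes the attended matrix through a tanh projection first and then projects the concatenation of glimpse and source with W_A_combine. A minimal NumPy sketch of the glimpse computation for the default Euclidean score, with toy shapes and hypothetical names (illustrative only):

import numpy as np

def softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def euclidean_attention(attended, state, w, source):
    # score = max-shifted weighted squared distance to the state (closer -> larger weight)
    m = ((attended - state) ** 2).dot(w)          # (seq_len,)
    energy = softmax(m.max() - m)                 # attention weights over positions
    glimpse = (energy[:, None] * source).sum(axis=0)
    return energy, glimpse

seq_len, dim = 6, 5
rng = np.random.RandomState(1)
attended = rng.randn(seq_len, dim)
source = rng.randn(seq_len, dim)
energy, glimpse = euclidean_attention(attended, attended[2], np.ones(dim), source)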
874 | Compute alpha, matrix of size (n_steps, n_classes), such that 875 | alpha[i, j] represents one of these 2 values: 876 | - the probability that the real path at node i ends in j 877 | - the maximum probability of a path finishing in j at node i (Viterbi) 878 | Returns one of these 2 values: 879 | - alpha 880 | - the final probability, which can be: 881 | - the sum of the probabilities of all paths 882 | - the probability of the best path (Viterbi) 883 | """ 884 | #the last row of transitions is the inital state 885 | trans=transitions[:-1]; 886 | assert not return_best_sequence or (viterbi and not return_alpha) 887 | assert viterbi==return_best_sequence 888 | 889 | def recurrence(obs, previous, transitions): 890 | previous = previous.dimshuffle(0, 'x') 891 | obs = obs.dimshuffle('x', 0) 892 | if viterbi: 893 | scores = previous + obs + transitions 894 | out = scores.max(axis=0) 895 | 896 | out2 = scores.argmax(axis=0) 897 | return out, out2 898 | else: 899 | return log_sum_exp(previous + obs + transitions, axis=0) 900 | 901 | initial = transitions[-1]+observations[0] 902 | alpha, _ = theano.scan( 903 | fn=recurrence, 904 | outputs_info=(initial, None) if return_best_sequence else initial, 905 | sequences=[observations[1:]], 906 | non_sequences=[trans] 907 | ) 908 | if viterbi: 909 | alpha0=T.concatenate([[initial],alpha[0]],axis=0); 910 | alpha=[alpha0,alpha[1]]; 911 | #else: 912 | # alpha=T.concatenate([log_sum_exp(initial,axis=0).dimshuffle('x',0), 913 | # alpha],axis=0); 914 | 915 | if return_alpha: 916 | return alpha 917 | elif return_best_sequence: 918 | sequence, _ = theano.scan( 919 | fn=lambda beta_i, previous: beta_i[previous], 920 | outputs_info=T.cast(T.argmax(alpha[0][-1]), 'int32'), 921 | sequences=T.cast(alpha[1][::-1], 'int32') 922 | ) 923 | sequence = T.concatenate([sequence[::-1], [T.argmax(alpha[0][-1])]]) 924 | return sequence 925 | else: 926 | if viterbi: 927 | return alpha[-1].max(axis=0) 928 | else: 929 | return log_sum_exp(alpha[-1], axis=0) 930 | 931 | 932 | def forward_org(observations, transitions, viterbi=False, 933 | return_alpha=False, return_best_sequence=False): 934 | """ 935 | Takes as input: 936 | - observations, sequence of shape (n_steps, n_classes) 937 | - transitions, sequence of shape (n_classes, n_classes) 938 | Probabilities must be given in the log space. 
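Both forward() above and forward_org() below run the same log-space recursion over tag scores: with log-sum-exp it yields the log partition function used by the CRF loss, and with max/argmax it yields the Viterbi-decoded best tag sequence (forward() additionally folds the initial-state scores, stored in the last row of transitions, into the first step). A compact NumPy sketch of the recursion, for reference (hypothetical names, simplified, no initial-state row):

import numpy as np

def log_sum_exp(x, axis=None):
    xmax = x.max(axis=axis, keepdims=True)
    return (xmax + np.log(np.exp(x - xmax).sum(axis=axis, keepdims=True))).squeeze(axis)

def crf_forward(observations, transitions, viterbi=False):
    # observations: (n_steps, n_classes) log scores, transitions: (n_classes, n_classes)
    alpha = observations[0]
    backpointers = []
    for obs in observations[1:]:
        scores = alpha[:, None] + obs[None, :] + transitions   # scores[i, j]: prev i -> cur j
        if viterbi:
            backpointers.append(scores.argmax(axis=0))
            alpha = scores.max(axis=0)
        else:
            alpha = log_sum_exp(scores, axis=0)
    if not viterbi:
        return log_sum_exp(alpha, axis=0)                      # log partition function
    best = [int(alpha.argmax())]                               # backtrack the best sequence
    for bp in reversed(backpointers):
        best.append(int(bp[best[-1]]))
    return best[::-1]

obs = np.log(np.random.RandomState(2).dirichlet(np.ones(3), size=4))  # 4 steps, 3 tags
best_path = crf_forward(obs, np.zeros((3, 3)), viterbi=True)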
939 | Compute alpha, matrix of size (n_steps, n_classes), such that 940 | alpha[i, j] represents one of these 2 values: 941 | - the probability that the real path at node i ends in j 942 | - the maximum probability of a path finishing in j at node i (Viterbi) 943 | Returns one of these 2 values: 944 | - alpha 945 | - the final probability, which can be: 946 | - the sum of the probabilities of all paths 947 | - the probability of the best path (Viterbi) 948 | 949 | """ 950 | 951 | assert not return_best_sequence or (viterbi and not return_alpha) 952 | 953 | def recurrence(obs, previous, transitions): 954 | previous = previous.dimshuffle(0, 'x') 955 | obs = obs.dimshuffle('x', 0) 956 | if viterbi: 957 | scores = previous + obs + transitions 958 | out = scores.max(axis=0) 959 | if return_best_sequence: 960 | out2 = scores.argmax(axis=0) 961 | return out, out2 962 | else: 963 | return out 964 | else: 965 | return log_sum_exp(previous + obs + transitions, axis=0) 966 | 967 | initial = observations[0] 968 | alpha, _ = theano.scan( 969 | fn=recurrence, 970 | outputs_info=(initial, None) if return_best_sequence else initial, 971 | sequences=[observations[1:]], 972 | non_sequences=transitions 973 | ) 974 | 975 | if return_alpha: 976 | return alpha 977 | elif return_best_sequence: 978 | sequence, _ = theano.scan( 979 | fn=lambda beta_i, previous: beta_i[previous], 980 | outputs_info=T.cast(T.argmax(alpha[0][-1]), 'int32'), 981 | sequences=T.cast(alpha[1][::-1], 'int32') 982 | ) 983 | #sequence = T.concatenate([sequence[::-1], [T.argmax(alpha[0][-1])]]) 984 | return alpha 985 | else: 986 | if viterbi: 987 | return alpha[-1].max(axis=0) 988 | else: 989 | return log_sum_exp(alpha[-1], axis=0) 990 | 991 | -------------------------------------------------------------------------------- /src/optimization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | floatX = theano.config.floatX 6 | device = theano.config.device 7 | 8 | 9 | class Optimization: 10 | 11 | def __init__(self, clip=None): 12 | """ 13 | Initialization 14 | """ 15 | self.clip = clip 16 | 17 | def get_gradients(self, cost, params): 18 | """ 19 | Compute the gradients, and clip them if required. 20 | """ 21 | if self.clip is None: 22 | return T.grad(cost, params) 23 | else: 24 | assert self.clip > 0 25 | return T.grad( 26 | theano.gradient.grad_clip(cost, -1 * self.clip, self.clip), 27 | params 28 | ) 29 | 30 | def get_updates(self, method, cost, params,constraints={}, *args, **kwargs): 31 | """ 32 | Compute the updates for different optimizers. 33 | """ 34 | if method == 'sgd': 35 | updates = self.sgd(cost, params,constraints=constraints, **kwargs) 36 | elif method == 'sgdmomentum': 37 | updates = self.sgdmomentum(cost, params **kwargs) 38 | elif method == 'adagrad': 39 | updates = self.adagrad(cost, params, **kwargs) 40 | elif method == 'adadelta': 41 | updates = self.adadelta(cost, params, **kwargs) 42 | elif method == 'adam': 43 | updates = self.adam(cost, params, **kwargs) 44 | elif method == 'rmsprop': 45 | updates = self.rmsprop(cost, params, **kwargs) 46 | else: 47 | raise("Not implemented learning method: %s" % method) 48 | return updates 49 | 50 | def sgd(self, cost, params,constraints={}, lr=0.01): 51 | #{{{ 52 | """ 53 | Stochatic gradient descent. 
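get_gradients() above clips gradients through theano.gradient.grad_clip before the update, and sgd()/sgdmomentum() apply an optional per-parameter constraint (such as NonNegConstraint in nn.py) after each step. A simplified NumPy sketch of one such update, with an element-wise clip standing in for grad_clip and hypothetical names:

import numpy as np

def sgd_step(param, grad, lr=0.01, clip=5.0, constraint=None):
    # clip the gradient, take a plain SGD step, then project with the constraint
    grad = np.clip(grad, -clip, clip)
    new_param = param - lr * grad
    if constraint is not None:
        new_param = constraint(new_param)
    return new_param

non_neg = lambda p: p * (p >= 0)   # same idea as NonNegConstraint in nn.py
w = sgd_step(np.array([0.02, -0.01]), np.array([10.0, -3.0]), constraint=non_neg)
# w is approximately [0.0, 0.02]: the first weight would go negative and is projected back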
54 | """ 55 | updates = [] 56 | 57 | lr = theano.shared(np.float32(lr).astype(floatX)) 58 | gradients = self.get_gradients(cost, params) 59 | 60 | for p, g in zip(params, gradients): 61 | v=-lr*g; 62 | new_p=p+v; 63 | # apply constraints 64 | if p in constraints: 65 | c=constraints[p]; 66 | new_p=c(new_p); 67 | updates.append((p, new_p)) 68 | 69 | return updates 70 | #}}} 71 | def sgdmomentum(self, cost, params,constraints={}, lr=0.01,consider_constant=None, momentum=0.): 72 | """ 73 | Stochatic gradient descent with momentum. Momentum has to be in [0, 1) 74 | """ 75 | # Check that the momentum is a correct value 76 | assert 0 <= momentum < 1 77 | 78 | lr = theano.shared(np.float32(lr).astype(floatX)) 79 | momentum = theano.shared(np.float32(momentum).astype(floatX)) 80 | 81 | gradients = self.get_gradients(cost, params) 82 | velocities = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params] 83 | 84 | updates = [] 85 | for param, gradient, velocity in zip(params, gradients, velocities): 86 | new_velocity = momentum * velocity - lr * gradient 87 | updates.append((velocity, new_velocity)) 88 | new_p=param+new_velocity; 89 | # apply constraints 90 | if param in constraints: 91 | c=constraints[param]; 92 | new_p=c(new_p); 93 | updates.append((param, new_p)) 94 | return updates 95 | 96 | def adagrad(self, cost, params, lr=1.0, epsilon=1e-6,consider_constant=None): 97 | """ 98 | Adagrad. Based on http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf 99 | """ 100 | lr = theano.shared(np.float32(lr).astype(floatX)) 101 | epsilon = theano.shared(np.float32(epsilon).astype(floatX)) 102 | 103 | gradients = self.get_gradients(cost, params,consider_constant) 104 | gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params] 105 | 106 | updates = [] 107 | for param, gradient, gsum in zip(params, gradients, gsums): 108 | new_gsum = gsum + gradient ** 2. 109 | updates.append((gsum, new_gsum)) 110 | updates.append((param, param - lr * gradient / (T.sqrt(gsum + epsilon)))) 111 | return updates 112 | 113 | def adadelta(self, cost, params, rho=0.95, epsilon=1e-6,consider_constant=None): 114 | """ 115 | Adadelta. Based on: 116 | http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf 117 | """ 118 | rho = theano.shared(np.float32(rho).astype(floatX)) 119 | epsilon = theano.shared(np.float32(epsilon).astype(floatX)) 120 | 121 | gradients = self.get_gradients(cost, params,consider_constant) 122 | accu_gradients = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params] 123 | accu_deltas = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params] 124 | 125 | updates = [] 126 | for param, gradient, accu_gradient, accu_delta in zip(params, gradients, accu_gradients, accu_deltas): 127 | new_accu_gradient = rho * accu_gradient + (1. - rho) * gradient ** 2. 128 | delta_x = - T.sqrt((accu_delta + epsilon) / (new_accu_gradient + epsilon)) * gradient 129 | new_accu_delta = rho * accu_delta + (1. - rho) * delta_x ** 2. 130 | updates.append((accu_gradient, new_accu_gradient)) 131 | updates.append((accu_delta, new_accu_delta)) 132 | updates.append((param, param + delta_x)) 133 | return updates 134 | 135 | def adam(self, cost, params, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,consider_constant=None): 136 | """ 137 | Adam. 
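The adam() method below is the standard Adam update with bias-corrected first and second moment estimates. A compact NumPy version of one step, with toy values and a hypothetical helper name (illustrative only):

import numpy as np

def adam_step(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # exponential moving averages of the gradient and its square, then bias correction
    m = beta1 * m + (1.0 - beta1) * grad
    v = beta2 * v + (1.0 - beta2) * grad ** 2
    m_hat = m / (1.0 - beta1 ** t)
    v_hat = v / (1.0 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v

p, m, v = np.zeros(3), np.zeros(3), np.zeros(3)
for t in range(1, 4):                     # three steps against a constant toy gradient
    p, m, v = adam_step(p, np.array([1.0, -1.0, 0.5]), m, v, t)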
Based on http://arxiv.org/pdf/1412.6980v4.pdf 138 | """ 139 | updates = [] 140 | gradients = self.get_gradients(cost, params,consider_constant) 141 | 142 | t = theano.shared(np.float32(1.).astype(floatX)) 143 | 144 | for param, gradient in zip(params, gradients): 145 | value = param.get_value(borrow=True) 146 | m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) 147 | v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) 148 | 149 | m = beta1 * m_prev + (1. - beta1) * gradient 150 | v = beta2 * v_prev + (1. - beta2) * gradient ** 2. 151 | m_hat = m / (1. - beta1 ** t) 152 | v_hat = v / (1. - beta2 ** t) 153 | theta = param - (lr * m_hat) / (T.sqrt(v_hat) + epsilon) 154 | 155 | updates.append((m_prev, m)) 156 | updates.append((v_prev, v)) 157 | updates.append((param, theta)) 158 | 159 | updates.append((t, t + 1.)) 160 | return updates 161 | 162 | def rmsprop(self, cost, params, lr=0.001, rho=0.9, eps=1e-6,consider_constant=None): 163 | """ 164 | RMSProp. 165 | """ 166 | lr = theano.shared(np.float32(lr).astype(floatX)) 167 | 168 | gradients = self.get_gradients(cost, params,consider_constant) 169 | accumulators = [theano.shared(np.zeros_like(p.get_value()).astype(np.float32)) for p in params] 170 | 171 | updates = [] 172 | 173 | for param, gradient, accumulator in zip(params, gradients, accumulators): 174 | new_accumulator = rho * accumulator + (1 - rho) * gradient ** 2 175 | updates.append((accumulator, new_accumulator)) 176 | 177 | new_param = param - lr * gradient / T.sqrt(new_accumulator + eps) 178 | updates.append((param, new_param)) 179 | 180 | return updates 181 | -------------------------------------------------------------------------------- /src/tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import time 5 | import codecs 6 | import optparse 7 | import numpy as np 8 | from loader import prepare_dataset; 9 | from utils import create_input, iobes_iob; 10 | from model import Model 11 | 12 | optparser = optparse.OptionParser() 13 | optparser.add_option( 14 | "-m", "--model", default="../models/chemnerModel/", 15 | help="Model location" 16 | ) 17 | optparser.add_option( 18 | "-i", "--input", default="../data/chemnder_test.txt", 19 | help="Input file location" 20 | ) 21 | optparser.add_option( 22 | "-o", "--output", default="./chemnder_test.tsv", 23 | help="Output file location" 24 | ) 25 | optparser.add_option( 26 | "-d", "--delimiter", default="__", 27 | help="Delimiter to separate words from their tags" 28 | ) 29 | opts = optparser.parse_args()[0] 30 | 31 | # Check parameters validity 32 | assert opts.delimiter 33 | assert os.path.isdir(opts.model) 34 | assert os.path.isfile(opts.input) 35 | 36 | # Load existing model 37 | print "Loading model..." 
38 | model = Model(model_path=opts.model) 39 | 40 | # Load reverse mappings 41 | word_to_id, char_to_id, tag_to_id = [ 42 | {v: k for k, v in x.items()} 43 | for x in [model.id_to_word, model.id_to_char, model.id_to_tag] 44 | ] 45 | parameters = model.parameters 46 | #print model.parameters 47 | # Load the model 48 | _, f_eval = model.build4(parameters) 49 | model.reload() 50 | 51 | #load test sentence 52 | def load_sentences(path): 53 | sentences = [] 54 | for line in codecs.open(path, 'r', 'utf8'): 55 | sentence =[]; 56 | line = line.rstrip() 57 | if line: 58 | word = line.split() 59 | for elem in word: 60 | sentence.append([elem]); 61 | sentences.append(sentence) 62 | return sentences 63 | 64 | test_sentences=load_sentences(opts.input); 65 | test_data=prepare_dataset(test_sentences,None,parameters,parameters['lower'],isTest=True); 66 | f_output = codecs.open(opts.output, 'w', 'utf-8') 67 | start = time.time() 68 | 69 | def xmlformat(sentence,tags): 70 | #{{{ 71 | assert len(sentence)==len(tags); 72 | res=[]; 73 | preTag="drug"; 74 | for i in range(len(tags)): 75 | if tags[i][0]=='B': 76 | if len(preTag): 77 | res.append(""); 78 | preTag=""; 79 | res.append("<"+tags[i][2:]+">"); 80 | preTag=tags[i][2:]; 81 | if tags[i][0]=='I': 82 | if preTag!=tags[i][2:]: 83 | if len(preTag): 84 | res.append(""); 85 | preTag=""; 86 | 87 | if tags[i][0]=='O': 88 | if len(preTag): 89 | res.append(""); 90 | preTag=""; 91 | res.append(sentence[i]); 92 | if len(preTag): 93 | res.append(""); 94 | return res; 95 | #}}} 96 | print 'Tagging...' 97 | for line in test_data: 98 | # Prepare input 99 | input = create_input(line, parameters, False,useAttend=parameters['useAttend']); 100 | words=line['str_words']; 101 | # Decoding 102 | if parameters['crf']: 103 | y_preds = np.array(f_eval(*input)) 104 | else: 105 | y_preds = f_eval(*input).argmax(axis=1) 106 | y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds] 107 | # Output tags in the IOB2 format 108 | if parameters['tag_scheme'] == 'iobes': 109 | y_preds = iobes_iob(y_preds) 110 | # Write tags 111 | assert len(y_preds) == len(words) 112 | 113 | # print words 114 | for i in range(len(words)): 115 | f_output.write(words[i]+'\t'+y_preds[i]+'\n') 116 | f_output.write('\n') 117 | # for elem in xmlformat(words,y_preds): 118 | # f_output.write(elem+" "); 119 | # f_output.write("\n"); 120 | 121 | print '---- lines tagged in %.4fs ----' % ( time.time() - start) 122 | f_output.close() 123 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import numpy as np 5 | SEED=1234; 6 | np.random.seed(SEED); 7 | import optparse 8 | import itertools 9 | import time 10 | import subprocess 11 | from collections import OrderedDict 12 | from utils import create_input 13 | import loader 14 | 15 | from utils import models_path, evaluate, eval_script, eval_temp,create_mapping; 16 | from loader import word_mapping, char_mapping, tag_mapping 17 | from loader import update_tag_scheme, prepare_dataset 18 | from loader import augment_with_pretrained, feature_mapping; 19 | from model import Model 20 | #import random ; 21 | #for bash color 22 | BASH_RED="\033[0;31m"; 23 | BASH_GREEN="\033[0;32m" 24 | BASH_YELLOW="\033[0;33m" 25 | BASH_CYAN="\033[0;36m" 26 | BASH_CLEAR="\033[0m" 27 | 28 | #prepare for model 29 | #{{{ 30 | # Read parameters from command line 31 | #{{{ 32 | optparser = optparse.OptionParser() 33 | 
optparser.add_option( 34 | "-T", "--train", default="training.ner.ssplit.token4.BIO", 35 | help="Train set location" 36 | ) 37 | optparser.add_option( 38 | "-d", "--dev", default="development.ner.ssplit.token4.BIO", 39 | help="Dev set location" 40 | ) 41 | optparser.add_option( 42 | "-t", "--test", default="evaluation.ner.ssplit.token4.BIO", 43 | help="Test set location" 44 | ) 45 | optparser.add_option( 46 | "-s", "--tag_scheme", default="iob", 47 | help="Tagging scheme (IOB or IOBES)" 48 | ) 49 | optparser.add_option( 50 | "-l", "--lower", default="0", 51 | type='int', help="Lowercase words (this will not affect character inputs)" 52 | ) 53 | optparser.add_option( 54 | "-z", "--zeros", default="0", 55 | type='int', help="Replace digits with 0" 56 | ) 57 | optparser.add_option( 58 | "-c", "--char_dim", default="25", 59 | type='int', help="Char embedding dimension" 60 | ) 61 | optparser.add_option( 62 | "-C", "--char_lstm_dim", default="25", 63 | type='int', help="Char LSTM hidden layer size" 64 | ) 65 | optparser.add_option( 66 | "-b", "--char_bidirect", default="1", 67 | type='int', help="Use a bidirectional LSTM for chars" 68 | ) 69 | optparser.add_option( 70 | "-w", "--word_dim", default="50", 71 | type='int', help="Token embedding dimension" 72 | ) 73 | optparser.add_option( 74 | "-W", "--word_lstm_dim", default="100", 75 | type='int', help="Token LSTM hidden layer size" 76 | ) 77 | optparser.add_option( 78 | "-B", "--word_bidirect", default="1", 79 | type='int', help="Use a bidirectional LSTM for words" 80 | ) 81 | optparser.add_option( 82 | "-p", "--pre_emb", default="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50", 83 | help="Location of pretrained embeddings" 84 | ) 85 | optparser.add_option( 86 | "-A", "--all_emb", default="0", 87 | type='int', help="Load all embeddings" 88 | ) 89 | optparser.add_option( 90 | "-a", "--cap_dim", default="1", 91 | type='int', help="Capitalization feature dimension (0 to disable)" 92 | ) 93 | optparser.add_option( 94 | "-f", "--crf", default="1", 95 | type='int', help="Use CRF (0 to disable)" 96 | ) 97 | optparser.add_option( 98 | "-D", "--dropout", default="0.5", 99 | type='float', help="Droupout on the input (0 = no dropout)" 100 | ) 101 | optparser.add_option( 102 | "-L", "--lr_method", default="sgd-lr_.003", 103 | help="Learning method (SGD, Adadelta, Adam..)" 104 | ) 105 | optparser.add_option( 106 | "-r", "--reload", default="0", 107 | type='int', help="Reload the last saved model" 108 | ) 109 | optparser.add_option( 110 | "-S","--String",default="bilstm-crf-chemdner100d", 111 | help="some about this model" 112 | ) 113 | opts = optparser.parse_args()[0] 114 | #}}} 115 | 116 | 117 | #according corpus to set some parameter for loading file 118 | CORPUS="chem"; 119 | tagFilter=None; 120 | if CORPUS == "chem": 121 | #{{{ 122 | opts.train="./chemdner_corpus/chemdner_training.ner.sen.token4.BIO_allfea"; 123 | opts.dev="./chemdner_corpus/chemdner_development.ner.sen.token4.BIO_allfea"; 124 | opts.test="./chemdner_corpus/chemdner_evaluation.ner.sen.token4.BIO_allfea"; 125 | opts.pre_emb="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50"; 126 | tagFilter=None; 127 | devBoundary=55508 128 | #}}} 129 | elif CORPUS == "CDR": 130 | #{{{ 131 | opts.train="./cdr_corpus/cdr_training.ner.sen.token4.BIO_allfea_drug"; 132 | opts.dev="./cdr_corpus/cdr_development.ner.sen.token4.BIO_allfea_drug"; 133 | opts.test="./cdr_corpus/cdr_test.ner.sen.token4.BIO_allfea_drug"; 134 | 
opts.pre_emb="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50"; 135 | tagFilter=['Disease']; 136 | devBoundary=8319; 137 | #}}} 138 | 139 | else: 140 | assert 0,"unknown corpus"; 141 | 142 | #read word_dim from word2vec_model 143 | #{{{ 144 | with open(opts.pre_emb) as file: 145 | first_line = file.readline() 146 | #create vec_table 147 | frequency = int(first_line.split()[0]); 148 | vec_size = int(first_line.split()[1]); 149 | opts.word_dim=vec_size; 150 | opts.word_lstm_dim=vec_size; 151 | #}}} 152 | 153 | # Parse parameters 154 | #{{{ 155 | parameters = OrderedDict() 156 | parameters['tag_scheme'] = opts.tag_scheme 157 | parameters['lower'] = opts.lower == 1 158 | parameters['zeros'] = opts.zeros == 1 159 | parameters['char_dim'] = opts.char_dim 160 | parameters['char_lstm_dim'] = opts.char_lstm_dim 161 | parameters['char_bidirect'] = opts.char_bidirect == 1 162 | parameters['word_dim'] = opts.word_dim 163 | parameters['word_lstm_dim'] = opts.word_lstm_dim 164 | parameters['word_bidirect'] = opts.word_bidirect == 1 165 | parameters['pre_emb'] = opts.pre_emb 166 | parameters['all_emb'] = opts.all_emb == 1 167 | parameters['cap_dim'] = opts.cap_dim 168 | parameters['crf'] = opts.crf == 1 169 | parameters['dropout'] = opts.dropout 170 | parameters['lr_method'] = opts.lr_method 171 | #}}} 172 | 173 | # Check parameters validity 174 | #{{{ 175 | assert os.path.isfile(opts.train) 176 | assert os.path.isfile(opts.dev) 177 | assert os.path.isfile(opts.test) 178 | assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0 179 | assert 0. <= parameters['dropout'] < 1.0 180 | assert parameters['tag_scheme'] in ['iob', 'iobes'] 181 | assert not parameters['all_emb'] or parameters['pre_emb'] 182 | assert not parameters['pre_emb'] or parameters['word_dim'] > 0 183 | assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb']) 184 | #}}} 185 | # Check evaluation script / folders 186 | if not os.path.isfile(eval_script): 187 | raise Exception('CoNLL evaluation script not found at "%s"' % eval_script) 188 | if not os.path.exists(eval_temp): 189 | os.makedirs(eval_temp) 190 | if not os.path.exists(models_path): 191 | os.makedirs(models_path) 192 | #}}} 193 | #prepare for train 194 | #{{{ 195 | # Data parameters 196 | lower = parameters['lower'] 197 | zeros = parameters['zeros'] 198 | tag_scheme = parameters['tag_scheme'] 199 | 200 | # Load sentences 201 | train_sentences = loader.load_sentences(opts.train, lower, zeros) 202 | dev_sentences = loader.load_sentences(opts.dev, lower, zeros) 203 | test_sentences = loader.load_sentences(opts.test, lower, zeros) 204 | #show dev boundary in doc ratio 205 | def showDevBoundary(docDataset,sentDataset,ratio): 206 | count=0; 207 | for elem in docDataset[:int(len(docDataset)*(1-ratio))]: 208 | count+=len(elem); 209 | i=0; 210 | count_=0 211 | for elem in sentDataset: 212 | i+=1; 213 | if count_ < count: 214 | count_+=len(elem); 215 | else: 216 | break; 217 | return i-1; 218 | #check 1 word sentences 219 | def check1word(sentences): 220 | Lens=[]; 221 | for elem in sentences: 222 | Lens.append(len(elem)); 223 | if min(Lens)==1: 224 | assert 0; 225 | #check1word(train_sentences); 226 | #check1word(dev_sentences); 227 | #check1word(test_sentences); 228 | 229 | #merge dev to train 230 | totalSentences=train_sentences+dev_sentences; 231 | #redefine train and dev 232 | #corpus are already random genergated, so no need to shuffly 233 | #random.seed(SEED); 234 | #random.shuffle(totalSentences); 235 | devRatio=0.1; 236 | #doc 10% != sentence 
10% 237 | #devBoundary=int(len(totalSentences)*(1-devRatio)) 238 | train_sentences=totalSentences[:devBoundary]; 239 | dev_sentences=totalSentences[devBoundary:]; 240 | 241 | # Use selected tagging scheme (IOB / IOBES) 242 | update_tag_scheme(train_sentences, tag_scheme,tagFilter) 243 | update_tag_scheme(dev_sentences, tag_scheme,tagFilter) 244 | update_tag_scheme(test_sentences, tag_scheme,tagFilter) 245 | 246 | # Create a dictionary / mapping of words 247 | # If we use pretrained embeddings, we add them to the dictionary. 248 | if parameters['pre_emb']: 249 | dico_words_train = word_mapping(train_sentences, lower)[0] 250 | dico_words, word_to_id, id_to_word = augment_with_pretrained( 251 | dico_words_train.copy(), 252 | parameters['pre_emb'], 253 | list(itertools.chain.from_iterable( 254 | [[w[0] for w in s] for s in dev_sentences + test_sentences]) 255 | ) if not parameters['all_emb'] else None 256 | ) 257 | else: 258 | dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower) 259 | dico_words_train = dico_words 260 | 261 | # Create a dictionary and a mapping for words / POS tags / tags 262 | dico_chars, char_to_id, id_to_char = char_mapping(train_sentences) 263 | dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences) 264 | 265 | #feature mapping 266 | #{{{ 267 | featureMap={#{{{ 268 | 'word':{ 269 | 'index':1, 270 | 'lstm-input':0, 271 | 'attended':0, 272 | }, 273 | 'char':{ 274 | 'index':0, 275 | 'lstm-input':0, 276 | 'attended':0, 277 | }, 278 | 'lemma':{ 'index':1, 279 | 'isUsed':0, 280 | 'num':0, 281 | 'dim':25, 282 | 'lstm-input':0, 283 | 'attended':0, 284 | 'pre_emb':''}, 285 | 'pos':{ 'index':2, 286 | 'isUsed':0, 287 | 'num':0, 288 | 'dim':50, 289 | 'lstm-input':0, 290 | 'attended':0, 291 | 'pre_emb':''}, 292 | 'chunk':{ 'index':3, 293 | 'isUsed':0, 294 | 'num':0, 295 | 'lstm-input':0, 296 | 'attended':0, 297 | 'dim':10}, 298 | 'dic':{ 'index':4, 299 | 'isUsed':1, 300 | 'num':3, 301 | 'lstm-input':1, 302 | 'attended':0, 303 | 'dim':5}, 304 | }#}}} 305 | feature2IdMap={'word':word_to_id, 306 | 'char':char_to_id, 307 | 'tag':tag_to_id}; 308 | 309 | if featureMap['lemma']['isUsed'] : 310 | dico_lemma,lemma_to_id,id_to_lemma=feature_mapping(train_sentences, 311 | featureMap['lemma']['index'],'lemma'); 312 | featureMap['lemma']['num']=len(dico_lemma) 313 | feature2IdMap['lemma']=lemma_to_id; 314 | 315 | if featureMap['pos']['isUsed'] : 316 | dico_pos,pos_to_id,id_to_pos=feature_mapping(train_sentences, 317 | featureMap['pos']['index'],'pos'); 318 | featureMap['pos']['num']=len(dico_pos) 319 | feature2IdMap['pos']=pos_to_id; 320 | if featureMap['chunk']['isUsed']: 321 | dico_chunk,chunk_to_id,id_to_chunk=feature_mapping(train_sentences, 322 | featureMap['chunk']['index'],'chunk'); 323 | featureMap['chunk']['num']=len(dico_chunk) 324 | feature2IdMap['chunk']=chunk_to_id; 325 | if featureMap['dic']['isUsed'] : 326 | dico_dic={'B':0,'I':1,'O':2}; 327 | dic_to_id,id_to_dic=create_mapping(dico_dic); 328 | feature2IdMap['dic']=dic_to_id; 329 | print BASH_YELLOW+str(featureMap)+BASH_CLEAR; 330 | featureMap['feature2IdMap']=feature2IdMap; 331 | parameters['features']=featureMap; 332 | #}}} 333 | 334 | # Index data 335 | train_data = prepare_dataset( 336 | train_sentences,None, parameters, lower 337 | ) 338 | dev_data = prepare_dataset( 339 | dev_sentences,None, parameters, lower 340 | ) 341 | test_data = prepare_dataset( 342 | test_sentences,None, parameters, lower 343 | ) 344 | 345 | print "%i / %i / %i sentences in train / dev / test." 
% ( 346 | len(train_data), len(dev_data), len(test_data)) 347 | 348 | 349 | parameters['useAttend']=False; 350 | parameters['sentencesLevelLoss']=False; 351 | parameters['training']=True; 352 | saveModel=True; 353 | useEarlyStopping=False; 354 | # Initialize model 355 | model = Model(parameters=parameters, 356 | models_path=models_path, 357 | model_path="./models/bilstm-crf-dic-chemdner-50d/",Training=True); 358 | # Save the mappings to disk 359 | print 'Saving the mappings to disk...' 360 | model.save_mappings(id_to_word, id_to_char, id_to_tag) 361 | print BASH_YELLOW+"Model location: "+BASH_CLEAR+ "%s" % model.model_path 362 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 363 | print BASH_YELLOW+'save model:'+BASH_CLEAR,saveModel; 364 | # Build the model 365 | f_train, f_eval = model.build4(parameters) 366 | 367 | # Reload previous model values 368 | if opts.reload: 369 | print 'Reloading previous model...' 370 | model.reload() 371 | #}}} 372 | # 373 | # Train network 374 | # 375 | singletons = set([word_to_id[k] for k, v 376 | in dico_words_train.items() if v == 1]) 377 | freq_eval = int(len(train_data)*0.3) # evaluate on dev every freq_eval steps 378 | count = 0 379 | limitPrint=12; 380 | param = { 381 | 'lr':0.005, 382 | 'verbose':1, 383 | 'decay':True, # decay on the learning rate if improvement stops 384 | 'bs':5, # number of backprop through time steps 385 | 'seed':345, 386 | 'epochs':40, 387 | 'crf':True, 388 | 'shuffle':True}; 389 | folder_out = '../log/BiLSTM-CRF/' 390 | print BASH_YELLOW+"folder_out:"+BASH_CLEAR,folder_out; 391 | best_f1=-np.inf; 392 | 393 | #generate FILE NAME PREFIX 394 | fileNamePrefix=""; 395 | if opts.String != "": 396 | fileNamePrefix=opts.String; 397 | fileNamePrefix=fileNamePrefix.replace(",","_"); 398 | fileNamePrefix=fileNamePrefix.replace(" ","_"); 399 | 400 | #train model 401 | if useEarlyStopping: 402 | #{{{ 403 | from utils import EarlyStopping; 404 | eStop=EarlyStopping(patience=15,mode='min'); 405 | eStop.on_train_begin(); 406 | f_test=model.f_test; 407 | 408 | #start train our model 409 | for epoch in xrange(param['epochs']): 410 | epoch_costs = [] 411 | startTime=time.time(); 412 | 413 | #decide whether early stop 414 | if eStop.stop_training: 415 | break; 416 | 417 | print "Starting epoch %i..." 
% epoch 418 | for i, index in enumerate(np.random.permutation(len(train_data))): 419 | count += 1 420 | input = create_input(train_data[index], parameters, True, singletons,False) 421 | new_cost = f_train(*input) 422 | if np.isnan(new_cost): 423 | print index,"nan" 424 | epoch_costs.append(new_cost) 425 | #validation 426 | if count == len(train_data): 427 | valLoss=[]; 428 | for i in range(len(dev_data)): 429 | devInput=create_input(dev_data[i],parameters,True,None,False); 430 | newDevCost=f_test(*devInput); 431 | valLoss.append(newDevCost); 432 | #res_dev = evaluate(parameters, f_eval, dev_sentences, 433 | # dev_data, id_to_tag, dico_tags, 434 | # folder_out+fileNamePrefix+'.dev.txt', 435 | # useAttend=False) 436 | eStop.on_epoch_end(epoch,np.mean(valLoss)) ; 437 | if eStop.stop_training: 438 | break; 439 | print BASH_YELLOW+"avg error:"+BASH_CLEAR,np.mean(epoch_costs),\ 440 | "avg dev loss:",np.mean(valLoss); 441 | print BASH_YELLOW+"One epch espliced:"+BASH_CLEAR,time.time()-startTime; 442 | 443 | #start evaluate on test 444 | res_test = evaluate(parameters, f_eval, test_sentences, 445 | test_data, id_to_tag, dico_tags, 446 | folder_out+fileNamePrefix+'.test.txt', 447 | useAttend=False) 448 | if saveModel: 449 | print "Saving model to disk..." 450 | model.save() 451 | print BASH_RED+'TEST: epoch'+BASH_CLEAR, epoch, 'F1', res_test['f1'],'p:',res_test['p'],'r:',res_test['r'], ' '*15 452 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 453 | #}}} 454 | else: 455 | #{{{ 456 | #start train our model 457 | for epoch in xrange(param['epochs']): 458 | epoch_costs = [] 459 | startTime=time.time(); 460 | print "Starting epoch %i..." % epoch 461 | for i, index in enumerate(np.random.permutation(len(train_data))): 462 | count += 1 463 | input = create_input(train_data[index], parameters, True, singletons,False) 464 | new_cost = f_train(*input) 465 | if np.isnan(new_cost): 466 | print index,"nan" 467 | epoch_costs.append(new_cost) 468 | if count % freq_eval == 0 and epoch>=limitPrint: 469 | res_dev = evaluate(parameters, f_eval, dev_sentences, 470 | dev_data, id_to_tag, dico_tags, 471 | folder_out+fileNamePrefix+'.dev.txt', 472 | useAttend=False) 473 | #new F1 value on dev 474 | if res_dev['f1'] > best_f1: 475 | best_f1 = res_dev['f1'] 476 | if param['verbose']: 477 | print BASH_CYAN+'NEW DEV BEST: epoch'+BASH_CLEAR, epoch, 'best dev F1', res_dev['f1'],'p:',res_dev['p'],'r:',res_dev['r'], ' '*15 478 | 479 | #new F1 value on dev, so evaluate on test 480 | res_test = evaluate(parameters, f_eval, test_sentences, 481 | test_data, id_to_tag, dico_tags, 482 | folder_out+fileNamePrefix+'.test.txt', 483 | useAttend=False) 484 | if saveModel: 485 | print "Saving model to disk..." 
486 | model.save() 487 | print BASH_RED+'THIS TEST: epoch'+BASH_CLEAR, epoch, 'F1', res_test['f1'],'p:',res_test['p'],'r:',res_test['r'], ' '*15 488 | param['tf1'], param['tp'], param['tr'] = res_test['f1'], res_test['p'], res_test['r'] 489 | param['be'] = epoch 490 | print BASH_YELLOW+"avg error:"+BASH_CLEAR,np.mean(epoch_costs); 491 | print BASH_YELLOW+"One epch espliced:"+BASH_CLEAR,time.time()-startTime; 492 | print BASH_GREEN+'FINAL TEST RESULT: epoch'+BASH_CLEAR, param['be'], 'final test F1', param['tf1'],'best p:',param['tp'],'best r:',param['tr'] 493 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 494 | #}}} 495 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import codecs 4 | import numpy as np 5 | import six 6 | import theano 7 | 8 | 9 | models_path = "./models" 10 | eval_path = "./evaluation" 11 | eval_temp = os.path.join(eval_path, "temp") 12 | eval_script = os.path.join(eval_path, "conlleval") 13 | 14 | class EarlyStopping(object): 15 | #{{{ 16 | '''Stop training when a monitored quantity has stopped improving. 17 | 18 | # Arguments 19 | monitor: quantity to be monitored. 20 | min_delta: minimum change in the monitored quantity 21 | to qualify as an improvement, i.e. an absolute 22 | change of less than min_delta, will count as no 23 | improvement. 24 | patience: number of epochs with no improvement 25 | after which training will be stopped. 26 | ''' 27 | def __init__(self, monitor='val_loss', 28 | min_delta=1e-6, patience=5,mode='min'): 29 | #{{{ 30 | super(EarlyStopping, self).__init__() 31 | 32 | self.monitor = monitor 33 | self.patience = patience 34 | self.min_delta = min_delta 35 | self.wait = 0 36 | self.stopped_epoch = 0 37 | self.stop_training=False; 38 | 39 | if mode =="min": 40 | self.monitor_op = np.less; 41 | elif mode == "max": 42 | self.monitor_op = np.greater; 43 | else: 44 | assert 0,"unknown early stop mode:"; 45 | 46 | self.min_delta *= -1 47 | #}}} 48 | def on_train_begin(self): 49 | self.wait = 0 # Allow instances to be re-used 50 | self.best = np.Inf if self.monitor_op == np.less else -np.Inf 51 | 52 | def on_epoch_end(self, epoch, loss): 53 | #{{{ 54 | current = loss 55 | 56 | if self.monitor_op(current - self.min_delta, self.best): 57 | self.best = current 58 | self.wait = 0 59 | else: 60 | if self.wait >= self.patience: 61 | self.stopped_epoch = epoch 62 | self.stop_training = True 63 | self.wait += 1 64 | #}}} 65 | def on_train_end(self, logs={}): 66 | if self.stopped_epoch > 0 : 67 | print('Epoch %05d: early stopping' % (self.stopped_epoch)) 68 | 69 | #}}} 70 | def get_from_module(identifier, module_params, module_name, 71 | instantiate=False, kwargs=None): 72 | #{{{ 73 | if isinstance(identifier, six.string_types): 74 | res = module_params.get(identifier) 75 | if not res: 76 | raise ValueError('Invalid ' + str(module_name) + ': ' + 77 | str(identifier)) 78 | if instantiate and not kwargs: 79 | return res() 80 | elif instantiate and kwargs: 81 | return res(**kwargs) 82 | else: 83 | return res 84 | elif isinstance(identifier, dict): 85 | name = identifier.pop('name') 86 | res = module_params.get(name) 87 | if res: 88 | return res(**identifier) 89 | else: 90 | raise ValueError('Invalid ' + str(module_name) + ': ' + 91 | str(identifier)) 92 | return identifier 93 | #}}} 94 | 95 | def findNotSame(fNameX,fNameY): 96 | #{{{ 97 | """ 98 | verify two file is same or not 99 | 
""" 100 | space='space'; 101 | def loadFile(fName): 102 | word=[]; 103 | import codecs; 104 | for line in codecs.open(fName,'r','utf8'): 105 | line=line.rstrip(); 106 | if len(line)>0: 107 | word.append(line[0]); 108 | else: 109 | word.append(space); 110 | return word; 111 | word1=loadFile(fNameX); 112 | word2=loadFile(fNameY); 113 | i=0; 114 | j=0; 115 | while i|") 168 | #}}} 169 | 170 | def set_values(name, param, pretrained): 171 | #{{{ 172 | """ 173 | Initialize a network parameter with pretrained values. 174 | We check that sizes are compatible. 175 | """ 176 | param_value = param.get_value() 177 | if pretrained.size != param_value.size: 178 | raise Exception( 179 | "Size mismatch for parameter %s. Expected %i, found %i." 180 | % (name, param_value.size, pretrained.size) 181 | ) 182 | param.set_value(np.reshape( 183 | pretrained, param_value.shape 184 | ).astype(np.float32)) 185 | #}}} 186 | 187 | import initializations; 188 | def shared(shape, name): 189 | #{{{ 190 | """ 191 | Create a shared object of a numpy array. 192 | """ 193 | init=initializations.get('glorot_uniform'); 194 | if len(shape) == 1: 195 | value = np.zeros(shape) # bias are initialized with zeros 196 | return theano.shared(value=value.astype(theano.config.floatX), name=name) 197 | else: 198 | drange = np.sqrt(6. / (np.sum(shape))) 199 | value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape) 200 | return init(shape=shape,name=name); 201 | #}}} 202 | 203 | def create_dico(item_list): 204 | #{{{ 205 | """ 206 | Create a dictionary of items from a list of list of items. 207 | """ 208 | assert type(item_list) is list 209 | dico = {} 210 | for items in item_list: 211 | for item in items: 212 | if item not in dico: 213 | dico[item] = 1 214 | else: 215 | dico[item] += 1 216 | return dico 217 | #}}} 218 | 219 | def create_mapping(dico): 220 | #{{{ 221 | """ 222 | Create a mapping (item to ID / ID to item) from a dictionary. 223 | Items are ordered by decreasing frequency. 224 | """ 225 | sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0])) 226 | id_to_item = {i: v[0] for i, v in enumerate(sorted_items)} 227 | item_to_id = {v: k for k, v in id_to_item.items()} 228 | return item_to_id, id_to_item 229 | #}}} 230 | 231 | def zero_digits(s): 232 | #{{{ 233 | """ 234 | Replace every digit in a string by a zero. 235 | """ 236 | return re.sub('\d', '0', s) 237 | #}}} 238 | 239 | def iob2(tags): 240 | #{{{ 241 | """ 242 | Check that tags have a valid IOB format. 243 | Tags in IOB1 format are converted to IOB2. 
244 | """ 245 | for i, tag in enumerate(tags): 246 | if tag == 'O': 247 | continue 248 | split = tag.split('-') 249 | if split[0] not in ['I', 'B']: 250 | #if len(split) != 2 or split[0] not in ['I', 'B']: 251 | return False 252 | if split[0] == 'B': 253 | continue 254 | elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2 255 | tags[i] = 'B' + tag[1:] 256 | elif tags[i - 1][1:] == tag[1:]: 257 | continue 258 | else: # conversion IOB1 to IOB2 259 | tags[i] = 'B' + tag[1:] 260 | return True 261 | #}}} 262 | 263 | def iob_iobes(tags): 264 | #{{{ 265 | """ 266 | IOB -> IOBES 267 | """ 268 | new_tags = [] 269 | for i, tag in enumerate(tags): 270 | if tag == 'O': 271 | new_tags.append(tag) 272 | elif tag.split('-')[0] == 'B': 273 | if i + 1 != len(tags) and \ 274 | tags[i + 1].split('-')[0] == 'I': 275 | new_tags.append(tag) 276 | else: 277 | new_tags.append(tag.replace('B-', 'S-')) 278 | elif tag.split('-')[0] == 'I': 279 | if i + 1 < len(tags) and \ 280 | tags[i + 1].split('-')[0] == 'I': 281 | new_tags.append(tag) 282 | else: 283 | new_tags.append(tag.replace('I-', 'E-')) 284 | else: 285 | raise Exception('Invalid IOB format!') 286 | return new_tags 287 | #}}} 288 | 289 | def iobes_iob(tags): 290 | #{{{ 291 | """ 292 | IOBES -> IOB 293 | """ 294 | new_tags = [] 295 | for i, tag in enumerate(tags): 296 | if tag.split('-')[0] == 'B': 297 | new_tags.append(tag) 298 | elif tag.split('-')[0] == 'I': 299 | new_tags.append(tag) 300 | elif tag.split('-')[0] == 'S': 301 | new_tags.append(tag.replace('S-', 'B-')) 302 | elif tag.split('-')[0] == 'E': 303 | new_tags.append(tag.replace('E-', 'I-')) 304 | elif tag.split('-')[0] == 'O': 305 | new_tags.append(tag) 306 | else: 307 | raise Exception('Invalid format!') 308 | return new_tags 309 | #}}} 310 | 311 | def insert_singletons(words, singletons, p=0.5): 312 | #{{{ 313 | """ 314 | Replace singletons by the unknown word with a probability p. 315 | """ 316 | new_words = [] 317 | for word in words: 318 | if word in singletons and np.random.uniform() < p: 319 | new_words.append(0) 320 | else: 321 | new_words.append(word) 322 | return new_words 323 | #}}} 324 | 325 | def pad_word_chars(words): 326 | #{{{ 327 | """ 328 | Pad the characters of the words in a sentence. 329 | Input: 330 | - list of lists of ints (list of words, a word being a list of char indexes) 331 | Output: 332 | - padded list of lists of ints 333 | - padded list of lists of ints (where chars are reversed) 334 | - list of ints corresponding to the index of the last character of each word 335 | """ 336 | max_length = max([len(word) for word in words]) 337 | char_for = [] 338 | char_rev = [] 339 | char_pos = [] 340 | for word in words: 341 | padding = [0] * (max_length - len(word)) 342 | char_for.append(word + padding) 343 | char_rev.append(word[::-1] + padding) 344 | char_pos.append(len(word) - 1) 345 | return char_for, char_rev, char_pos 346 | #}}} 347 | 348 | 349 | def create_input(data, parameters, add_label, singletons=None, 350 | useAttend=True): 351 | #{{{ 352 | """ 353 | Take sentence data and return an input for 354 | the training or the evaluation function. 
355 | """ 356 | words = data['words'] 357 | wordsTrue=data['words']; 358 | chars = data['chars'] 359 | if singletons is not None: 360 | words = insert_singletons(words, singletons) 361 | if parameters['cap_dim']: 362 | caps = data['caps'] 363 | char_for, char_rev, char_pos = pad_word_chars(chars) 364 | input = [] 365 | if parameters['word_dim']: 366 | input.append(words) 367 | if parameters['char_dim']: 368 | input.append(char_for) 369 | if parameters['char_bidirect']: 370 | input.append(char_rev) 371 | input.append(char_pos) 372 | if parameters['cap_dim']: 373 | input.append(caps) 374 | if useAttend: 375 | input.append(wordsTrue); 376 | if parameters.has_key('sentencesLevelLoss') \ 377 | and parameters['sentencesLevelLoss']: 378 | input.append(data['lens']) ; 379 | 380 | #add features 381 | if parameters.has_key('features'): 382 | features=parameters['features']; 383 | else: 384 | features=None; 385 | if features is not None and features['lemma']['isUsed']: 386 | input.append(data['lemma']); 387 | if features is not None and features['pos']['isUsed']: 388 | input.append(data['pos']); 389 | if features is not None and features['chunk']['isUsed']: 390 | input.append(data['chunk']); 391 | if features is not None and features['dic']['isUsed']: 392 | input.append(data['dic']); 393 | 394 | if add_label: 395 | input.append(data['tags']) 396 | return input 397 | #}}} 398 | 399 | from os.path import isfile 400 | from os import chmod 401 | import stat 402 | import subprocess 403 | PREFIX = './evaluation/' 404 | def get_perf(filename): 405 | ''' run conlleval.pl perl script to obtain 406 | precision/recall and F1 score ''' 407 | _conlleval = PREFIX + 'conlleval' 408 | if not isfile(_conlleval): 409 | #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl') 410 | os.system('wget https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl') 411 | chmod('conlleval.pl', stat.S_IRWXU) # give the execute permissions 412 | 413 | out = [] 414 | proc = subprocess.Popen(["perl", _conlleval], stdin=subprocess.PIPE, stdout=subprocess.PIPE) 415 | stdout, _ = proc.communicate(open(filename).read()) 416 | for line in stdout.split('\n'): 417 | if 'accuracy' in line: 418 | out = line.split() 419 | break 420 | 421 | # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00'] 422 | precision = float(out[3][:-2]) 423 | recall = float(out[5][:-2]) 424 | f1score = float(out[7]) 425 | 426 | return {'p':precision, 'r':recall, 'f1':f1score} 427 | 428 | def evaluate(parameters, f_eval, raw_sentences, parsed_sentences, 429 | id_to_tag, dictionary_tags,filename, 430 | useAttend=True): 431 | #{{{ 432 | """ 433 | Evaluate current model using CoNLL script. 
434 | """ 435 | n_tags = len(id_to_tag) 436 | predictions = [] 437 | count = np.zeros((n_tags, n_tags), dtype=np.int32) 438 | 439 | for raw_sentence, data in zip(raw_sentences, parsed_sentences): 440 | input = create_input(data, parameters, False,useAttend=useAttend) 441 | if parameters['crf']: 442 | y_preds = np.array(f_eval(*input)) 443 | else: 444 | y_preds = f_eval(*input).argmax(axis=1) 445 | y_reals = np.array(data['tags']).astype(np.int32) 446 | assert len(y_preds) == len(y_reals) 447 | p_tags = [id_to_tag[y_pred] for y_pred in y_preds] 448 | r_tags = [id_to_tag[y_real] for y_real in y_reals] 449 | if parameters['tag_scheme'] == 'iobes': 450 | p_tags = iobes_iob(p_tags) 451 | r_tags = iobes_iob(r_tags) 452 | for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)): 453 | new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]]) 454 | predictions.append(new_line) 455 | count[y_real, y_pred] += 1 456 | predictions.append("") 457 | #write to file 458 | with codecs.open(filename, 'w', 'utf8') as f: 459 | f.write("\n".join(predictions)) 460 | return get_perf(filename) 461 | #}}} 462 | --------------------------------------------------------------------------------
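The snippet below is a minimal, self-contained usage sketch (not part of the repository) of the EarlyStopping helper defined in src/utils.py. It assumes the Python 2 / Theano environment the repository targets, with src/ as the working directory so that `from utils import EarlyStopping` resolves, and it feeds the monitor a toy dev-loss sequence instead of real f_test outputs.

import numpy as np
from utils import EarlyStopping

# Toy dev-loss curve: improves for three epochs, then stalls.
toy_dev_losses = [0.9, 0.7, 0.6, 0.61, 0.62, 0.60, 0.63, 0.64, 0.65, 0.66]

eStop = EarlyStopping(min_delta=1e-6, patience=5, mode='min')
eStop.on_train_begin()
for epoch, loss in enumerate(toy_dev_losses):
    # In train.py the second argument is np.mean(valLoss) computed with f_test.
    eStop.on_epoch_end(epoch, loss)
    if eStop.stop_training:
        break
eStop.on_train_end()   # prints "Epoch 00008: early stopping" for this sequence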
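A second small sketch (also not part of the repository) illustrates the tag-scheme helpers from src/utils.py: iob2() upgrades IOB1 tags to IOB2 in place, iob_iobes() expands them to IOBES, and iobes_iob() maps them back before conlleval-style scoring. The 'Chemical' entity type is only an example label.

from utils import iob2, iob_iobes, iobes_iob

tags = ['I-Chemical', 'I-Chemical', 'O', 'I-Chemical']   # IOB1-style input
assert iob2(tags)               # converts the list in place to IOB2
print tags                      # ['B-Chemical', 'I-Chemical', 'O', 'B-Chemical']
iobes = iob_iobes(tags)
print iobes                     # ['B-Chemical', 'E-Chemical', 'O', 'S-Chemical']
print iobes_iob(iobes)          # ['B-Chemical', 'I-Chemical', 'O', 'B-Chemical']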
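Finally, a brief sketch (again not from the repository) of the vocabulary helpers in src/utils.py: zero_digits() collapses digits so numeric variants share one surface form, create_dico() counts items, and create_mapping() assigns IDs by decreasing frequency. The example tokens are made up.

from utils import create_dico, create_mapping, zero_digits

sentences = [['Aspirin', 'inhibits', 'COX-1'], ['COX-1', 'and', 'COX-2']]
sentences = [[zero_digits(w.lower()) for w in s] for s in sentences]
dico = create_dico(sentences)            # e.g. {'cox-0': 3, 'aspirin': 1, ...}
word_to_id, id_to_word = create_mapping(dico)
print word_to_id['cox-0']                # 0 -- the most frequent item gets the lowest ID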