├── .gitignore ├── LICENSE ├── README.md ├── data ├── cdr_corpus.rar └── chemdner_corpus.rar ├── models ├── chem_Att-BiLSTM-CRF_word_char.rar ├── chemdner_word2vec.rar └── chemner_BiLSTM-CRF_word_char.rar └── src ├── AttenTrain.py ├── Atten_tagger.py ├── activations.py ├── backend ├── __init__.py ├── common.py └── theano_backend.py ├── evaluation └── conlleval ├── initializations.py ├── loader.py ├── model.py ├── nn.py ├── optimization.py ├── tagger.py ├── train.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Att-ChemdNER 2 | *** 3 | This repo contains the source code and dataset for the following paper: 4 | - [Ling Luo, Zhihao Yang, Pei Yang, Yin Zhang, Lei Wang, Hongfei Lin, Jian Wang. An Attention-based BiLSTM-CRF Approach to Document-level Chemical Named Entity Recognition. Bioinformatics, 2017.](https://academic.oup.com/bioinformatics/advance-article-abstract/doi/10.1093/bioinformatics/btx761/4657076?redirectedFrom=fulltext) 5 | ## Dependency packages 6 | 7 | Att-ChemdNER uses the following dependencies: 8 | 9 | - [Python 2.7](https://www.python.org/) 10 | - [Theano 0.9.0](http://www.deeplearning.net/software/theano/) 11 | - [numpy 1.12.1](http://www.numpy.org/) 12 | 13 | 14 | ## Content 15 | - data 16 | - CHEMDNER corpus 17 | - CDR corpus 18 | - models 19 | - The basic BiLSTM-CRF model 20 | - The Att-BiLSTM-CRF model 21 | - The 50-dimensional word embedding 22 | - src 23 | - backend: Theano backend functions 24 | - evaluation: evaluate the results of the NER task 25 | - activations.py: activation functions 26 | - initializations.py: weight initialization functions 27 | - loader.py: load the data set 28 | - model.py: build the model 29 | - nn.py: the layers of the network architecture 30 | - optimization.py: optimization methods 31 | - utils.py: utility functions 32 | - train.py: train a basic BiLSTM-CRF model 33 | - AttenTrain.py: train an Att-BiLSTM-CRF model 34 | - tagger.py: tag the documents using the BiLSTM-CRF model 35 | - Atten_tagger.py: tag the documents using the Att-BiLSTM-CRF model 36 | 37 | ## Train a basic BiLSTM-CRF model 38 | To train a basic BiLSTM-CRF model, you need to provide the files of the training set, development set, and testing set, together with the word embedding model, and run the train.py script: 39 | 40 | ``` 41 | python train.py --train trainfile --dev devfile --test testfile --pre_emb word_embedding.model 42 | ``` 43 | ## Train an Att-BiLSTM-CRF model 44 | To train our Att-BiLSTM-CRF model, you need to provide the files of the training set, development set, and testing set, together with the word embedding model, and run the AttenTrain.py script: 45 | 46 | ``` 47 | python AttenTrain.py --train trainfile --dev devfile --test testfile --pre_emb word_embedding.model 48 | ``` 49 | ## Tag the documents using the BiLSTM-CRF model 50 | To recognize the chemical entities in the documents using the pretrained BiLSTM-CRF model, you need to provide the pretrained model, the input file, and the output file: 51 | 52 | ``` 53 | python tagger.py --model BiLSTM-CRF.model --input inputfile --output outputfile 54 | ``` 55 | The input file should contain one tokenized document per line. 56 | 57 | ## Tag the documents using the Att-BiLSTM-CRF model 58 | To recognize the chemical entities in the documents using the pretrained Att-BiLSTM-CRF model, you need to provide the pretrained model, the input file, and the output file: 59 | 60 | ``` 61 | python Atten_tagger.py --model Att-BiLSTM-CRF.model --input inputfile --output outputfile 62 | ``` 63 | 64 | The input file should contain one tokenized document per line.
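For illustration, a minimal sketch of the two file formats is given below. The document text and the Chemical label are only examples (the actual entity labels depend on the tag set of the pretrained model). A hypothetical inputfile holds one whitespace-tokenized document per line:

```
Aspirin inhibits prostaglandin synthesis .
```

The tagger then writes the outputfile with one token and its predicted IOB tag per line, separated by a tab, and a blank line after each document:

```
Aspirin	B-Chemical
inhibits	O
prostaglandin	B-Chemical
synthesis	O
.	O
```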
65 | 66 | 67 | *** 68 | 69 | -------------------------------------------------------------------------------- /data/cdr_corpus.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/data/cdr_corpus.rar -------------------------------------------------------------------------------- /data/chemdner_corpus.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/data/chemdner_corpus.rar -------------------------------------------------------------------------------- /models/chem_Att-BiLSTM-CRF_word_char.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/models/chem_Att-BiLSTM-CRF_word_char.rar -------------------------------------------------------------------------------- /models/chemdner_word2vec.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/models/chemdner_word2vec.rar -------------------------------------------------------------------------------- /models/chemner_BiLSTM-CRF_word_char.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lingluodlut/Att-ChemdNER/81db44f5cbd5bbbb1d1dee72a528280425de7bc9/models/chemner_BiLSTM-CRF_word_char.rar -------------------------------------------------------------------------------- /src/AttenTrain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import numpy as np 5 | SEED=1234; 6 | np.random.seed(1234); 7 | import optparse 8 | import itertools 9 | import time 10 | import subprocess 11 | from collections import OrderedDict 12 | from utils import create_input 13 | import loader 14 | 15 | from utils import models_path, evaluate, eval_script, eval_temp,create_mapping; 16 | from loader import word_mapping, char_mapping, tag_mapping 17 | from loader import update_tag_scheme, prepare_dataset 18 | from loader import augment_with_pretrained,feature_mapping; 19 | from model import Model 20 | from utils import generateDocSentLen; 21 | #import random ; 22 | #for bash color 23 | BASH_RED="\033[0;31m"; 24 | BASH_GREEN="\033[0;32m" 25 | BASH_YELLOW="\033[0;33m" 26 | BASH_CYAN="\033[0;36m" 27 | BASH_CLEAR="\033[0m" 28 | 29 | #prepare for model 30 | #{{{ 31 | # Read parameters from command line 32 | #{{{ 33 | optparser = optparse.OptionParser() 34 | optparser.add_option( 35 | "-T", "--train", default="training.ner.doc.token4.BIO", 36 | help="Train set location" 37 | ) 38 | optparser.add_option( 39 | "-d", "--dev", default="development.ner.doc.token4.BIO", 40 | help="Dev set location" 41 | ) 42 | optparser.add_option( 43 | "-t", "--test", default="evaluation.ner.doc.token4.BIO", 44 | help="Test set location" 45 | ) 46 | optparser.add_option( 47 | "-s", "--tag_scheme", default="iob", 48 | help="Tagging scheme (IOB or IOBES)" 49 | ) 50 | optparser.add_option( 51 | "-l", "--lower", default="0", 52 | type='int', help="Lowercase words (this will not affect character inputs)" 53 | ) 54 | optparser.add_option( 55 | "-z", "--zeros", default="0", 56 | type='int', help="Replace digits with 0" 57 | ) 58 | 
optparser.add_option( 59 | "-c", "--char_dim", default="25", 60 | type='int', help="Char embedding dimension" 61 | ) 62 | optparser.add_option( 63 | "-C", "--char_lstm_dim", default="25", 64 | type='int', help="Char LSTM hidden layer size" 65 | ) 66 | optparser.add_option( 67 | "-b", "--char_bidirect", default="1", 68 | type='int', help="Use a bidirectional LSTM for chars" 69 | ) 70 | optparser.add_option( 71 | "-w", "--word_dim", default="50", 72 | type='int', help="Token embedding dimension" 73 | ) 74 | optparser.add_option( 75 | "-W", "--word_lstm_dim", default="100", 76 | type='int', help="Token LSTM hidden layer size" 77 | ) 78 | optparser.add_option( 79 | "-B", "--word_bidirect", default="1", 80 | type='int', help="Use a bidirectional LSTM for words" 81 | ) 82 | optparser.add_option( 83 | "-p", "--pre_emb", default="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50", 84 | help="Location of pretrained embeddings" 85 | ) 86 | optparser.add_option( 87 | "-A", "--all_emb", default="0", 88 | type='int', help="Load all embeddings" 89 | ) 90 | optparser.add_option( 91 | "-a", "--cap_dim", default="1", 92 | type='int', help="Capitalization feature dimension (0 to disable)" 93 | ) 94 | optparser.add_option( 95 | "-f", "--crf", default="1", 96 | type='int', help="Use CRF (0 to disable)" 97 | ) 98 | optparser.add_option( 99 | "-D", "--dropout", default="0.5", 100 | type='float', help="Droupout on the input (0 = no dropout)" 101 | ) 102 | optparser.add_option( 103 | "-L", "--lr_method", default="sgd-lr_.001", 104 | help="Learning method (SGD, Adadelta, Adam..)" 105 | ) 106 | optparser.add_option( 107 | "-r", "--reload", default="0", 108 | type='int', help="Reload the last saved model" 109 | ) 110 | optparser.add_option( 111 | "-S","--String",default="", 112 | help="some about this model" 113 | ) 114 | opts = optparser.parse_args()[0] 115 | #}}} 116 | 117 | 118 | #according corpus to set some parameter for loading file 119 | CORPUS="chem"; 120 | tagFilter=None; 121 | attenScoreFunTotal=['Euclidean','forwardNN','Cosine','Manhatten']; 122 | attenScoreFun=attenScoreFunTotal[0] 123 | if CORPUS == "chem": 124 | #{{{ 125 | opts.train="./chemdner_corpus/chemdner_training.ner.doc.token4.BIO_allfea"; 126 | opts.dev="./chemdner_corpus/chemdner_development.ner.doc.token4.BIO_allfea"; 127 | opts.test="./chemdner_corpus/chemdner_evaluation.ner.doc.token4.BIO_allfea"; 128 | opts.pre_emb="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50"; 129 | ssplitTrainFName="./chemdner_corpus/training.ner.ssplit.token4.BIO"; 130 | ssplitDevFName="./chemdner_corpus/development.ner.ssplit.token4.BIO"; 131 | ssplitTestFName="./chemdner_corpus/evaluation.ner.ssplit.token4.BIO"; 132 | tagFilter=None; 133 | #}}} 134 | elif CORPUS == "CDR": 135 | #{{{ 136 | opts.train="./cdr_corpus/cdr_training.ner.doc.token4.BIO_allfea_drug"; 137 | opts.dev="./chemdner_corpus/cdr_development.ner.doc.token4.BIO_allfea_drug"; 138 | opts.test="./chemdner_corpus/cdr_test.ner.doc.token4.BIO_allfea_drug"; 139 | opts.pre_emb="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50"; 140 | ssplitTrainFName="./chemdner_corpus/cdr_training.ner.sen.token4.BIO_allfea_drug"; 141 | ssplitDevFName="./chemdner_corpus/cdr_development.ner.sen.token4.BIO_allfea_drug"; 142 | ssplitTestFName="./chemdner_corpus/cdr_dtest.ner.sen.token4.BIO_allfea_drug"; 143 | tagFilter=['Disease']; 144 | #}}} 145 | 146 | else: 147 | assert 0,"unknown corpus"; 148 | 149 | #read word_dim from word2vec_model 150 | #{{{ 151 | with open(opts.pre_emb) as 
file: 152 | first_line = file.readline() 153 | #create vec_table 154 | frequency = int(first_line.split()[0]); 155 | vec_size = int(first_line.split()[1]); 156 | opts.word_dim=vec_size; 157 | opts.word_lstm_dim=vec_size; 158 | #}}} 159 | 160 | # Parse parameters 161 | #{{{ 162 | parameters = OrderedDict() 163 | parameters['tag_scheme'] = opts.tag_scheme 164 | parameters['lower'] = opts.lower == 1 165 | parameters['zeros'] = opts.zeros == 1 166 | parameters['char_dim'] = opts.char_dim 167 | parameters['char_lstm_dim'] = opts.char_lstm_dim 168 | parameters['char_bidirect'] = opts.char_bidirect == 1 169 | parameters['word_dim'] = opts.word_dim 170 | parameters['word_lstm_dim'] = opts.word_lstm_dim 171 | parameters['word_bidirect'] = opts.word_bidirect == 1 172 | parameters['pre_emb'] = opts.pre_emb 173 | parameters['all_emb'] = opts.all_emb == 1 174 | parameters['cap_dim'] = opts.cap_dim 175 | parameters['crf'] = opts.crf == 1 176 | parameters['dropout'] = opts.dropout 177 | parameters['lr_method'] = opts.lr_method 178 | #}}} 179 | 180 | # Check parameters validity 181 | #{{{ 182 | assert os.path.isfile(opts.train) 183 | assert os.path.isfile(opts.dev) 184 | assert os.path.isfile(opts.test) 185 | assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0 186 | assert 0. <= parameters['dropout'] < 1.0 187 | assert parameters['tag_scheme'] in ['iob', 'iobes'] 188 | assert not parameters['all_emb'] or parameters['pre_emb'] 189 | assert not parameters['pre_emb'] or parameters['word_dim'] > 0 190 | assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb']) 191 | #}}} 192 | # Check evaluation script / folders 193 | if not os.path.isfile(eval_script): 194 | raise Exception('CoNLL evaluation script not found at "%s"' % eval_script) 195 | if not os.path.exists(eval_temp): 196 | os.makedirs(eval_temp) 197 | if not os.path.exists(models_path): 198 | os.makedirs(models_path) 199 | #}}} 200 | #prepare for train 201 | #{{{ 202 | 203 | # Data parameters 204 | lower = parameters['lower'] 205 | zeros = parameters['zeros'] 206 | tag_scheme = parameters['tag_scheme'] 207 | 208 | # Load sentences 209 | train_sentences = loader.load_sentences(opts.train, lower, zeros) 210 | dev_sentences = loader.load_sentences(opts.dev, lower, zeros) 211 | test_sentences = loader.load_sentences(opts.test, lower, zeros) 212 | 213 | #check 1 word sentences 214 | def check1word(sentences): 215 | Lens=[]; 216 | for elem in sentences: 217 | Lens.append(len(elem)); 218 | if min(Lens)==1: 219 | assert 0; 220 | #check1word(train_sentences); 221 | #check1word(dev_sentences); 222 | #check1word(test_sentences); 223 | 224 | #get doc Len for calcuate loss at sentences level 225 | train_Lens=generateDocSentLen(opts.train,ssplitTrainFName); 226 | dev_Lens=generateDocSentLen(opts.dev,ssplitDevFName); 227 | test_Lens=generateDocSentLen(opts.test,ssplitTestFName); 228 | 229 | #merge dev to train 230 | totalSentences=train_sentences+dev_sentences; 231 | totalLens=train_Lens+dev_Lens; 232 | #redefine train and dev 233 | #corpus are already random genergated, so no need to shuffly 234 | #random.seed(SEED); 235 | #random.shuffle(totalSentences); 236 | #random.seed(SEED); 237 | #random.shuffle(totalLens); 238 | devRatio=0.1; 239 | devBoundary=int(len(totalSentences)*(1-devRatio)) 240 | train_sentences=totalSentences[:devBoundary]; 241 | train_Lens=totalLens[:devBoundary]; 242 | dev_sentences=totalSentences[devBoundary:]; 243 | dev_Lens=totalLens[devBoundary:]; 244 | 245 | # Use selected tagging scheme (IOB / IOBES) 246 | 
update_tag_scheme(train_sentences, tag_scheme,tagFilter); 247 | update_tag_scheme(dev_sentences, tag_scheme,tagFilter); 248 | update_tag_scheme(test_sentences, tag_scheme,tagFilter); 249 | 250 | # Create a dictionary / mapping of words 251 | # If we use pretrained embeddings, we add them to the dictionary. 252 | if parameters['pre_emb']: 253 | dico_words_train = word_mapping(train_sentences, lower)[0] 254 | dico_words, word_to_id, id_to_word = augment_with_pretrained( 255 | dico_words_train.copy(), 256 | parameters['pre_emb'], 257 | list(itertools.chain.from_iterable( 258 | [[w[0] for w in s] for s in dev_sentences + test_sentences]) 259 | ) if not parameters['all_emb'] else None 260 | ) 261 | else: 262 | dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower) 263 | dico_words_train = dico_words 264 | 265 | # Create a dictionary and a mapping for words / POS tags / tags 266 | dico_chars, char_to_id, id_to_char = char_mapping(train_sentences) 267 | dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences) 268 | 269 | #feature mapping 270 | #{{{ 271 | featureMap={#{{{ 272 | 'word':{ 273 | 'index':1, 274 | 'isUsed':1, 275 | 'lstm-input':1, 276 | 'attended':1, 277 | }, 278 | 'char':{ 279 | 'index':0, 280 | 'isUsed':0, 281 | 'lstm-input':1, 282 | 'attended':1, 283 | }, 284 | 'lemma':{ 'index':1, 285 | 'isUsed':0, 286 | 'num':0, 287 | 'dim':25, 288 | 'lstm-input':0, 289 | 'attended':0, 290 | 'pre_emb':''}, 291 | 'pos':{ 'index':2, 292 | 'isUsed':0, 293 | 'num':0, 294 | 'dim':50, 295 | 'lstm-input':0, 296 | 'attended':0, 297 | 'pre_emb':''}, 298 | 'chunk':{ 'index':3, 299 | 'isUsed':0, 300 | 'num':0, 301 | 'lstm-input':0, 302 | 'attended':0, 303 | 'dim':10}, 304 | 'dic':{ 'index':4, 305 | 'isUsed':0, 306 | 'num':3, 307 | 'lstm-input':0, 308 | 'attended':0, 309 | 'dim':5}, 310 | }#}}} 311 | def featureMapCheck(featureMap): 312 | for item in featureMap: 313 | assert (not featureMap[item]['isUsed']) or \ 314 | (featureMap[item]['lstm-input'] or featureMap[item]['attended']) 315 | feature2IdMap={'word':word_to_id, 316 | 'char':char_to_id, 317 | 'tag':tag_to_id}; 318 | featureMapCheck(featureMap); 319 | if featureMap['lemma']['isUsed'] : 320 | dico_lemma,lemma_to_id,id_to_lemma=feature_mapping(train_sentences, 321 | featureMap['lemma']['index'],'lemma'); 322 | featureMap['lemma']['num']=len(dico_lemma) 323 | feature2IdMap['lemma']=lemma_to_id; 324 | 325 | if featureMap['pos']['isUsed'] : 326 | dico_pos,pos_to_id,id_to_pos=feature_mapping(train_sentences, 327 | featureMap['pos']['index'],'pos'); 328 | featureMap['pos']['num']=len(dico_pos) 329 | feature2IdMap['pos']=pos_to_id; 330 | if featureMap['chunk']['isUsed']: 331 | dico_chunk,chunk_to_id,id_to_chunk=feature_mapping(train_sentences, 332 | featureMap['chunk']['index'],'chunk'); 333 | featureMap['chunk']['num']=len(dico_chunk) 334 | feature2IdMap['chunk']=chunk_to_id; 335 | 336 | if featureMap['dic']['isUsed']: 337 | dico_NER={'B':0,'I':1,'O':2}; 338 | NER_to_id,id_to_NER=create_mapping(dico_NER); 339 | feature2IdMap['dic']=NER_to_id; 340 | print BASH_YELLOW+str(featureMap)+BASH_CLEAR; 341 | featureMap['feature2IdMap']=feature2IdMap; 342 | parameters['features']=featureMap; 343 | #}}} 344 | 345 | 346 | 347 | # Build the model 348 | parameters['loading']=False; 349 | parameters['loading_path']="./models/bilstm-crf-chemdner50d/"; 350 | parameters['sentencesLevelLoss']=False; 351 | saveModel=False; 352 | parameters['training']=True; 353 | parameters['attenScoreFun']=attenScoreFun; 354 | parameters['useAttend']=True; 355 | 
useEarlyStopping=False; 356 | # Initialize model 357 | model = Model(parameters=parameters, models_path=models_path,model_path="./models/attention_test/",Training=True) 358 | # Save the mappings to disk 359 | print 'Saving the mappings to disk...' 360 | model.save_mappings(id_to_word, id_to_char, id_to_tag) 361 | print BASH_YELLOW+"Model location: "+BASH_CLEAR+ "%s" % model.model_path 362 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 363 | if parameters['loading']: 364 | print BASH_YELLOW+"loading:"+BASH_CLEAR,parameters['loading_path']; 365 | print BASH_YELLOW+'save model:'+BASH_CLEAR,saveModel; 366 | print BASH_YELLOW+"sentences Level Loss:"+BASH_CLEAR,parameters['sentencesLevelLoss']; 367 | 368 | # Index data 369 | train_data = prepare_dataset( 370 | train_sentences,train_Lens, parameters, lower 371 | ) 372 | dev_data = prepare_dataset( 373 | dev_sentences,dev_Lens,parameters, lower 374 | ) 375 | test_data = prepare_dataset( 376 | test_sentences,test_Lens, parameters, lower 377 | ) 378 | 379 | print "%i / %i / %i sentences in train / dev / test." % ( 380 | len(train_data), len(dev_data), len(test_data)) 381 | 382 | #load pre-train word_embending 383 | f_train, f_eval = model.build4(parameters) 384 | 385 | 386 | # Reload previous model values 387 | if opts.reload: 388 | print 'Reloading previous model...' 389 | model.reload() 390 | #}}} 391 | # 392 | # Train network 393 | # 394 | singletons = set([word_to_id[k] for k, v 395 | in dico_words_train.items() if v == 1]) 396 | freq_eval = int(len(train_data)*0.3) # evaluate on dev every freq_eval steps 397 | count = 0 398 | limitPrint=0; 399 | param = { 400 | #'lr':0.005, 401 | 'lr':0.001, 402 | 'verbose':1, 403 | 'decay':True, # decay on the learning rate if improvement stops 404 | 'bs':5, # number of backprop through time steps 405 | 'seed':345, 406 | 'epochs':30, 407 | 'crf':True, 408 | 'shuffle':True}; 409 | folder_out = '../log/Attention/' 410 | print BASH_YELLOW+"folder_out:"+BASH_CLEAR,folder_out; 411 | best_f1=-np.inf; 412 | 413 | def attenVisualFun(words,energy,index): 414 | #{{{ 415 | print "energy should:",energy[index][index],words[index]; 416 | print "filter energy:"; 417 | energyInd=energy[index].argsort()[::-1][:10]; 418 | attenVisual=[]; 419 | for i in energyInd: 420 | attenVisual.append([words[i],energy[index][i]]); 421 | print attenVisual; 422 | 423 | #print energyInd; 424 | #for i in range(len(words)): 425 | # attenVisual.append([words[i],energy[0][i]]); 426 | #print attenVisual; 427 | 428 | return ; 429 | #}}} 430 | 431 | #generate FILE NAME PREFIX 432 | fileNamePrefix=""; 433 | if opts.String != "": 434 | fileNamePrefix=opts.String; 435 | fileNamePrefix.replace(",","_"); 436 | fileNamePrefix.replace(" ","_"); 437 | #train model 438 | if useEarlyStopping: 439 | #{{{ 440 | from utils import EarlyStopping; 441 | eStop=EarlyStopping(mode='max'); 442 | eStop.on_train_begin(); 443 | 444 | #start train our model 445 | for epoch in xrange(param['epochs']): 446 | epoch_costs = [] 447 | startTime=time.time(); 448 | 449 | #decide whether early stop 450 | if eStop.stop_training: 451 | break; 452 | 453 | print "Starting epoch %i..." 
% epoch 454 | for i, index in enumerate(np.random.permutation(len(train_data))): 455 | count += 1 456 | input = create_input(train_data[index], parameters, True, singletons) 457 | new_cost = f_train(*input) 458 | if np.isnan(new_cost): 459 | print index,"nan" 460 | epoch_costs.append(new_cost) 461 | #validation 462 | res_dev = evaluate(parameters, f_eval, dev_sentences, 463 | dev_data, id_to_tag, dico_tags, 464 | folder_out+fileNamePrefix+'.dev.txt') 465 | eStop.on_epoch_end(epoch,res_dev['f1']) ; 466 | print BASH_YELLOW+"avg error:"+BASH_CLEAR,np.mean(epoch_costs),\ 467 | " dev F1:",res_dev['f1']; 468 | print BASH_YELLOW+"One epch espliced:"+BASH_CLEAR,time.time()-startTime; 469 | 470 | #start evaluate on test 471 | res_test = evaluate(parameters, f_eval, test_sentences, 472 | test_data, id_to_tag, dico_tags, 473 | folder_out+fileNamePrefix+'.test.txt') 474 | if saveModel: 475 | print "Saving model to disk..." 476 | model.save() 477 | print BASH_RED+'TEST: epoch'+BASH_CLEAR, epoch, 'F1', res_test['f1'],'p:',res_test['p'],'r:',res_test['r'], ' '*15 478 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 479 | #}}} 480 | else: 481 | for epoch in xrange(param['epochs']): 482 | epoch_costs = [] 483 | startTime=time.time(); 484 | print "Starting epoch %i..." % epoch 485 | for i, index in enumerate(np.random.permutation(len(train_data))): 486 | count += 1 487 | input = create_input(train_data[index], parameters, True, singletons) 488 | new_cost,energy = f_train(*input) 489 | #print attention energy for test 490 | if epoch>=limitPrint and count %freq_eval==0: 491 | attenVisualFun(train_data[index]['str_words'], 492 | energy, 493 | np.random.randint(0,len(train_data[index]))); 494 | if np.isnan(new_cost): 495 | print "NaN,index:",index; 496 | epoch_costs.append(new_cost) 497 | if count % freq_eval == 0 and epoch>=limitPrint: 498 | res_dev = evaluate(parameters, f_eval, dev_sentences, 499 | dev_data, id_to_tag, dico_tags, 500 | folder_out+fileNamePrefix+'.dev.txt') 501 | #new F1 value on dev 502 | if res_dev['f1'] > best_f1: 503 | best_f1 = res_dev['f1'] 504 | if param['verbose']: 505 | print BASH_CYAN+'NEW DEV BEST: epoch'+BASH_CLEAR, epoch, 'best dev F1', res_dev['f1'],'p:',res_dev['p'],'r:',res_dev['r'], ' '*15 506 | 507 | #new F1 value on dev, so evaluate on test 508 | res_test = evaluate(parameters, f_eval, test_sentences, 509 | test_data, id_to_tag, dico_tags, 510 | folder_out+fileNamePrefix+'.test.txt') 511 | if saveModel: 512 | print "Saving model to disk..." 
513 | model.save() 514 | print BASH_RED+'THIS TEST: epoch'+BASH_CLEAR, epoch, 'F1', res_test['f1'],'p:',res_test['p'],'r:',res_test['r'], ' '*15 515 | param['tf1'], param['tp'], param['tr'] = res_test['f1'], res_test['p'], res_test['r'] 516 | param['be'] = epoch 517 | print BASH_YELLOW+"avg error:"+BASH_CLEAR,np.mean(epoch_costs); 518 | print BASH_YELLOW+"One epch espliced:"+BASH_CLEAR,time.time()-startTime; 519 | print BASH_GREEN+'FINAL TEST RESULT: epoch'+BASH_CLEAR, param['be'], 'final test F1', param['tf1'],'best p:',param['tp'],'best r:',param['tr'] 520 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 521 | 522 | 523 | -------------------------------------------------------------------------------- /src/Atten_tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import time 5 | import codecs 6 | import optparse 7 | import numpy as np 8 | from loader import prepare_dataset; 9 | from utils import create_input, iobes_iob; 10 | from model import Model 11 | 12 | optparser = optparse.OptionParser() 13 | optparser.add_option( 14 | "-m", "--model", default="../models/chemAtten_word_char/", 15 | help="Model location" 16 | ) 17 | optparser.add_option( 18 | "-i", "--input", default="../data/chemnder_test.txt", 19 | help="Input file location" 20 | ) 21 | optparser.add_option( 22 | "-o", "--output", default="./chemdner_test.tsv", 23 | help="Output file location" 24 | ) 25 | optparser.add_option( 26 | "-d", "--delimiter", default="__", 27 | help="Delimiter to separate words from their tags" 28 | ) 29 | opts = optparser.parse_args()[0] 30 | 31 | # Check parameters validity 32 | assert opts.delimiter 33 | assert os.path.isdir(opts.model) 34 | assert os.path.isfile(opts.input) 35 | 36 | # Load existing model 37 | print "Loading model..." 
38 | model = Model(model_path=opts.model) 39 | 40 | # Load reverse mappings 41 | word_to_id, char_to_id, tag_to_id = [ 42 | {v: k for k, v in x.items()} 43 | for x in [model.id_to_word, model.id_to_char, model.id_to_tag] 44 | ] 45 | parameters = model.parameters 46 | 47 | # Load the model 48 | _, f_eval = model.build4(parameters) 49 | model.reload() 50 | 51 | #load test sentence 52 | def load_sentences(path): 53 | sentences = [] 54 | for line in codecs.open(path, 'r', 'utf8'): 55 | sentence =[]; 56 | line = line.rstrip() 57 | if line: 58 | word = line.split() 59 | for elem in word: 60 | sentence.append([elem]); 61 | sentences.append(sentence) 62 | return sentences 63 | 64 | opts.train="../data/chemdner_training.ner.doc.token4.BIO_allfea"; 65 | opts.dev="../data/chemdner_development.ner.doc.token4.BIO_allfea"; 66 | opts.test="../data/chemdner_evaluation.ner.doc.token4.BIO_allfea"; 67 | ssplitTrainFName="../data/training.ner.ssplit.token4.BIO"; 68 | ssplitDevFName="../data/development.ner.ssplit.token4.BIO"; 69 | ssplitTestFName="../data/evaluation.ner.ssplit.token4.BIO"; 70 | from utils import generateDocSentLen; 71 | #get doc Len for calcuate loss at sentences level 72 | train_Lens=generateDocSentLen(opts.train,ssplitTrainFName); 73 | dev_Lens=generateDocSentLen(opts.dev,ssplitDevFName); 74 | test_Lens=generateDocSentLen(opts.test,ssplitTestFName); 75 | 76 | test_sentences=load_sentences(opts.input); 77 | test_data=prepare_dataset(test_sentences,test_Lens,parameters,parameters['lower'],isTest=True); 78 | f_output = codecs.open(opts.output, 'w', 'utf-8') 79 | start = time.time() 80 | 81 | def xmlformat(sentence,tags): 82 | #{{{ 83 | assert len(sentence)==len(tags); 84 | res=[]; 85 | preTag=""; 86 | for i in range(len(tags)): 87 | if tags[i][0]=='B': 88 | if len(preTag): 89 | res.append(""); 90 | preTag=""; 91 | res.append("<"+tags[i][2:]+">"); 92 | preTag=tags[i][2:]; 93 | if tags[i][0]=='I': 94 | if preTag!=tags[i][2:]: 95 | if len(preTag): 96 | res.append(""); 97 | preTag=""; 98 | 99 | if tags[i][0]=='O': 100 | if len(preTag): 101 | res.append(""); 102 | preTag=""; 103 | res.append(sentence[i]); 104 | if len(preTag): 105 | res.append(""); 106 | return res; 107 | #}}} 108 | print 'Tagging...' 
109 | for line in test_data: 110 | # Prepare input 111 | input = create_input(line, parameters, False,useAttend=parameters['useAttend']); 112 | words=line['str_words']; 113 | # Decoding 114 | if parameters['crf']: 115 | y_preds = np.array(f_eval(*input)) 116 | else: 117 | y_preds = f_eval(*input).argmax(axis=1) 118 | y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds] 119 | # Output tags in the IOB2 format 120 | if parameters['tag_scheme'] == 'iobes': 121 | y_preds = iobes_iob(y_preds) 122 | # Write tags 123 | assert len(y_preds) == len(words) 124 | for i in range(len(words)): 125 | f_output.write(words[i]+'\t'+y_preds[i]+'\n') 126 | f_output.write('\n') 127 | # for elem in xmlformat(words,y_preds): 128 | # f_output.write(elem+" "); 129 | # f_output.write("\n"); 130 | 131 | print '---- lines tagged in %.4fs ----' % ( time.time() - start) 132 | f_output.close() 133 | -------------------------------------------------------------------------------- /src/activations.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import backend as K 3 | from utils import get_from_module 4 | 5 | 6 | def softmax(x): 7 | ndim = K.ndim(x) 8 | if ndim == 2: 9 | return K.softmax(x) 10 | elif ndim == 3: 11 | e = K.exp(x - K.max(x, axis=-1, keepdims=True)) 12 | s = K.sum(e, axis=-1, keepdims=True) 13 | return e / s 14 | else: 15 | raise ValueError('Cannot apply softmax to a tensor ' 16 | 'that is not 2D or 3D. ' 17 | 'Here, ndim=' + str(ndim)) 18 | 19 | 20 | def elu(x, alpha=1.0): 21 | return K.elu(x, alpha) 22 | 23 | 24 | def softplus(x): 25 | return K.softplus(x) 26 | 27 | 28 | def softsign(x): 29 | return K.softsign(x) 30 | 31 | 32 | def relu(x, alpha=0., max_value=None): 33 | return K.relu(x, alpha=alpha, max_value=max_value) 34 | 35 | 36 | def tanh(x): 37 | return K.tanh(x) 38 | 39 | 40 | def sigmoid(x): 41 | return K.sigmoid(x) 42 | 43 | 44 | def hard_sigmoid(x): 45 | return K.hard_sigmoid(x) 46 | 47 | 48 | def linear(x): 49 | return x 50 | 51 | 52 | def get(identifier): 53 | if identifier is None: 54 | return linear 55 | return get_from_module(identifier, globals(), 'activation function') 56 | -------------------------------------------------------------------------------- /src/backend/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import os 4 | import json 5 | import sys 6 | from .common import epsilon 7 | from .common import floatx 8 | from .common import set_epsilon 9 | from .common import set_floatx 10 | from .common import get_uid 11 | from .common import cast_to_floatx 12 | from .common import image_dim_ordering 13 | from .common import set_image_dim_ordering 14 | from .common import is_keras_tensor 15 | from .common import legacy_weight_ordering 16 | from .common import set_legacy_weight_ordering 17 | 18 | _keras_base_dir = os.path.expanduser('~') 19 | if not os.access(_keras_base_dir, os.W_OK): 20 | _keras_base_dir = '/tmp' 21 | 22 | _keras_dir = os.path.join(_keras_base_dir, '.keras') 23 | if not os.path.exists(_keras_dir): 24 | os.makedirs(_keras_dir) 25 | 26 | # Default backend: TensorFlow. 
27 | _BACKEND = 'tensorflow' 28 | 29 | _config_path = os.path.expanduser(os.path.join(_keras_dir, 'keras.json')) 30 | if os.path.exists(_config_path): 31 | _config = json.load(open(_config_path)) 32 | _floatx = _config.get('floatx', floatx()) 33 | assert _floatx in {'float16', 'float32', 'float64'} 34 | _epsilon = _config.get('epsilon', epsilon()) 35 | assert isinstance(_epsilon, float) 36 | _backend = _config.get('backend', _BACKEND) 37 | assert _backend in {'theano', 'tensorflow'} 38 | _image_dim_ordering = _config.get('image_dim_ordering', 39 | image_dim_ordering()) 40 | assert _image_dim_ordering in {'tf', 'th'} 41 | 42 | set_floatx(_floatx) 43 | set_epsilon(_epsilon) 44 | set_image_dim_ordering(_image_dim_ordering) 45 | _BACKEND = _backend 46 | 47 | # save config file 48 | if not os.path.exists(_config_path): 49 | _config = {'floatx': floatx(), 50 | 'epsilon': epsilon(), 51 | 'backend': _BACKEND, 52 | 'image_dim_ordering': image_dim_ordering()} 53 | with open(_config_path, 'w') as f: 54 | f.write(json.dumps(_config, indent=4)) 55 | 56 | if 'KERAS_BACKEND' in os.environ: 57 | _backend = os.environ['KERAS_BACKEND'] 58 | assert _backend in {'theano', 'tensorflow'} 59 | _BACKEND = _backend 60 | 61 | # import backend 62 | if _BACKEND == 'theano': 63 | sys.stderr.write('Using Theano backend.\n') 64 | from .theano_backend import * 65 | elif _BACKEND == 'tensorflow': 66 | sys.stderr.write('Using TensorFlow backend.\n') 67 | from .tensorflow_backend import * 68 | else: 69 | raise ValueError('Unknown backend: ' + str(_BACKEND)) 70 | 71 | 72 | def backend(): 73 | '''Publicly accessible method 74 | for determining the current backend. 75 | ''' 76 | return _BACKEND 77 | -------------------------------------------------------------------------------- /src/backend/common.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from collections import defaultdict 4 | 5 | # the type of float to use throughout the session. 6 | _FLOATX = 'float32' 7 | _EPSILON = 10e-8 8 | _UID_PREFIXES = defaultdict(int) 9 | _IMAGE_DIM_ORDERING = 'tf' 10 | _LEGACY_WEIGHT_ORDERING = False 11 | 12 | 13 | def epsilon(): 14 | '''Returns the value of the fuzz 15 | factor used in numeric expressions. 16 | 17 | # Returns 18 | A float. 19 | 20 | # Example 21 | ```python 22 | >>> keras.backend.epsilon() 23 | 1e-08 24 | ``` 25 | ''' 26 | return _EPSILON 27 | 28 | 29 | def set_epsilon(e): 30 | '''Sets the value of the fuzz 31 | factor used in numeric expressions. 32 | 33 | # Arguments 34 | e: float. New value of epsilon. 35 | 36 | # Example 37 | ```python 38 | >>> from keras import backend as K 39 | >>> K.epsilon() 40 | 1e-08 41 | >>> K.set_epsilon(1e-05) 42 | >>> K.epsilon() 43 | 1e-05 44 | ``` 45 | ''' 46 | global _EPSILON 47 | _EPSILON = e 48 | 49 | 50 | def floatx(): 51 | '''Returns the default float type, as a string 52 | (e.g. 'float16', 'float32', 'float64'). 53 | 54 | # Returns 55 | String, the current default float type. 56 | 57 | # Example 58 | ```python 59 | >>> keras.backend.floatx() 60 | 'float32' 61 | ``` 62 | ''' 63 | return _FLOATX 64 | 65 | 66 | def set_floatx(floatx): 67 | '''Sets the default float type. 68 | 69 | # Arguments 70 | String: 'float16', 'float32', or 'float64'. 
71 | 72 | # Example 73 | ```python 74 | >>> from keras import backend as K 75 | >>> K.floatx() 76 | 'float32' 77 | >>> K.set_floatx('float16') 78 | >>> K.floatx() 79 | 'float16' 80 | ``` 81 | ''' 82 | global _FLOATX 83 | if floatx not in {'float16', 'float32', 'float64'}: 84 | raise ValueError('Unknown floatx type: ' + str(floatx)) 85 | _FLOATX = str(floatx) 86 | 87 | 88 | def cast_to_floatx(x): 89 | '''Cast a Numpy array to the default Keras float type. 90 | 91 | # Arguments 92 | x: Numpy array. 93 | 94 | # Returns 95 | The same Numpy array, cast to its new type. 96 | 97 | # Example 98 | ```python 99 | >>> from keras import backend as K 100 | >>> K.floatx() 101 | 'float32' 102 | >>> arr = numpy.array([1.0, 2.0], dtype='float64') 103 | >>> arr.dtype 104 | dtype('float64') 105 | >>> new_arr = K.cast_to_floatx(arr) 106 | >>> new_arr 107 | array([ 1., 2.], dtype=float32) 108 | >>> new_arr.dtype 109 | dtype('float32') 110 | ``` 111 | ''' 112 | return np.asarray(x, dtype=_FLOATX) 113 | 114 | 115 | def image_dim_ordering(): 116 | '''Returns the default image dimension ordering 117 | convention ('th' or 'tf'). 118 | 119 | # Returns 120 | A string, either `'th'` or `'tf'` 121 | 122 | # Example 123 | ```python 124 | >>> keras.backend.image_dim_ordering() 125 | 'th' 126 | ``` 127 | ''' 128 | return _IMAGE_DIM_ORDERING 129 | 130 | 131 | def set_image_dim_ordering(dim_ordering): 132 | '''Sets the value of the image dimension 133 | ordering convention ('th' or 'tf'). 134 | 135 | # Arguments 136 | dim_ordering: string. `'th'` or `'tf'`. 137 | 138 | # Example 139 | ```python 140 | >>> from keras import backend as K 141 | >>> K.image_dim_ordering() 142 | 'th' 143 | >>> K.set_image_dim_ordering('tf') 144 | >>> K.image_dim_ordering() 145 | 'tf' 146 | ``` 147 | ''' 148 | global _IMAGE_DIM_ORDERING 149 | if dim_ordering not in {'tf', 'th'}: 150 | raise ValueError('Unknown dim_ordering:', dim_ordering) 151 | _IMAGE_DIM_ORDERING = str(dim_ordering) 152 | 153 | 154 | def get_uid(prefix=''): 155 | '''Provides a unique UID given a string prefix. 156 | 157 | # Arguments 158 | prefix: string. 159 | 160 | # Returns 161 | An integer. 162 | 163 | # Example 164 | ``` 165 | >>> keras.backend.get_uid('dense') 166 | >>> 1 167 | >>> keras.backend.get_uid('dense') 168 | >>> 2 169 | ``` 170 | 171 | ''' 172 | _UID_PREFIXES[prefix] += 1 173 | return _UID_PREFIXES[prefix] 174 | 175 | 176 | def reset_uids(): 177 | global _UID_PREFIXES 178 | _UID_PREFIXES = defaultdict(int) 179 | 180 | 181 | def is_keras_tensor(x): 182 | '''Returns whether `x` is a Keras tensor. 183 | 184 | # Arguments 185 | x: a potential tensor. 186 | 187 | # Returns 188 | A boolean: whether the argument is a Keras tensor. 189 | 190 | # Examples 191 | ```python 192 | >>> from keras import backend as K 193 | >>> np_var = numpy.array([1, 2]) 194 | >>> K.is_keras_tensor(np_var) 195 | False 196 | >>> keras_var = K.variable(np_var) 197 | >>> K.is_keras_tensor(keras_var) # A variable is not a Tensor. 198 | False 199 | >>> keras_placeholder = K.placeholder(shape=(2, 4, 5)) 200 | >>> K.is_keras_tensor(keras_placeholder) # A placeholder is a Tensor. 
201 | True 202 | ``` 203 | ''' 204 | if hasattr(x, '_keras_shape'): 205 | return True 206 | else: 207 | return False 208 | 209 | 210 | def set_legacy_weight_ordering(value): 211 | global _LEGACY_WEIGHT_ORDERING 212 | assert value in {True, False} 213 | _LEGACY_WEIGHT_ORDERING = value 214 | 215 | 216 | def legacy_weight_ordering(): 217 | return _LEGACY_WEIGHT_ORDERING 218 | -------------------------------------------------------------------------------- /src/evaluation/conlleval: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | # conlleval: evaluate result of processing CoNLL-2000 shared task 3 | # usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file 4 | # README: http://cnts.uia.ac.be/conll2000/chunking/output.html 5 | # options: l: generate LaTeX output for tables like in 6 | # http://cnts.uia.ac.be/conll2003/ner/example.tex 7 | # r: accept raw result tags (without B- and I- prefix; 8 | # assumes one word per chunk) 9 | # d: alternative delimiter tag (default is single space) 10 | # o: alternative outside tag (default is O) 11 | # note: the file should contain lines with items separated 12 | # by $delimiter characters (default space). The final 13 | # two items should contain the correct tag and the 14 | # guessed tag in that order. Sentences should be 15 | # separated from each other by empty lines or lines 16 | # with $boundary fields (default -X-). 17 | # url: http://lcg-www.uia.ac.be/conll2000/chunking/ 18 | # started: 1998-09-25 19 | # version: 2004-01-26 20 | # author: Erik Tjong Kim Sang 21 | 22 | use strict; 23 | 24 | my $false = 0; 25 | my $true = 42; 26 | 27 | my $boundary = "-X-"; # sentence boundary 28 | my $correct; # current corpus chunk tag (I,O,B) 29 | my $correctChunk = 0; # number of correctly identified chunks 30 | my $correctTags = 0; # number of correct chunk tags 31 | my $correctType; # type of current corpus chunk tag (NP,VP,etc.) 
32 | my $delimiter = " "; # field delimiter 33 | my $FB1 = 0.0; # FB1 score (Van Rijsbergen 1979) 34 | my $firstItem; # first feature (for sentence boundary checks) 35 | my $foundCorrect = 0; # number of chunks in corpus 36 | my $foundGuessed = 0; # number of identified chunks 37 | my $guessed; # current guessed chunk tag 38 | my $guessedType; # type of current guessed chunk tag 39 | my $i; # miscellaneous counter 40 | my $inCorrect = $false; # currently processed chunk is correct until now 41 | my $lastCorrect = "O"; # previous chunk tag in corpus 42 | my $latex = 0; # generate LaTeX formatted output 43 | my $lastCorrectType = ""; # type of previously identified chunk tag 44 | my $lastGuessed = "O"; # previously identified chunk tag 45 | my $lastGuessedType = ""; # type of previous chunk tag in corpus 46 | my $lastType; # temporary storage for detecting duplicates 47 | my $line; # line 48 | my $nbrOfFeatures = -1; # number of features per line 49 | my $precision = 0.0; # precision score 50 | my $oTag = "O"; # outside tag, default O 51 | my $raw = 0; # raw input: add B to every token 52 | my $recall = 0.0; # recall score 53 | my $tokenCounter = 0; # token counter (ignores sentence breaks) 54 | 55 | my %correctChunk = (); # number of correctly identified chunks per type 56 | my %foundCorrect = (); # number of chunks in corpus per type 57 | my %foundGuessed = (); # number of identified chunks per type 58 | 59 | my @features; # features on line 60 | my @sortedTypes; # sorted list of chunk type names 61 | 62 | # sanity check 63 | while (@ARGV and $ARGV[0] =~ /^-/) { 64 | if ($ARGV[0] eq "-l") { $latex = 1; shift(@ARGV); } 65 | elsif ($ARGV[0] eq "-r") { $raw = 1; shift(@ARGV); } 66 | elsif ($ARGV[0] eq "-d") { 67 | shift(@ARGV); 68 | if (not defined $ARGV[0]) { 69 | die "conlleval: -d requires delimiter character"; 70 | } 71 | $delimiter = shift(@ARGV); 72 | } elsif ($ARGV[0] eq "-o") { 73 | shift(@ARGV); 74 | if (not defined $ARGV[0]) { 75 | die "conlleval: -o requires delimiter character"; 76 | } 77 | $oTag = shift(@ARGV); 78 | } else { die "conlleval: unknown argument $ARGV[0]\n"; } 79 | } 80 | if (@ARGV) { die "conlleval: unexpected command line argument\n"; } 81 | # process input 82 | while () { 83 | chomp($line = $_); 84 | @features = split(/$delimiter/,$line); 85 | if ($nbrOfFeatures < 0) { $nbrOfFeatures = $#features; } 86 | elsif ($nbrOfFeatures != $#features and @features != 0) { 87 | printf STDERR "unexpected number of features: %d (%d)\n", 88 | $#features+1,$nbrOfFeatures+1; 89 | exit(1); 90 | } 91 | if (@features == 0 or 92 | $features[0] eq $boundary) { @features = ($boundary,"O","O"); } 93 | if (@features < 2) { 94 | die "conlleval: unexpected number of features in line $line\n"; 95 | } 96 | if ($raw) { 97 | if ($features[$#features] eq $oTag) { $features[$#features] = "O"; } 98 | if ($features[$#features-1] eq $oTag) { $features[$#features-1] = "O"; } 99 | if ($features[$#features] ne "O") { 100 | $features[$#features] = "B-$features[$#features]"; 101 | } 102 | if ($features[$#features-1] ne "O") { 103 | $features[$#features-1] = "B-$features[$#features-1]"; 104 | } 105 | } 106 | # 20040126 ET code which allows hyphens in the types 107 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 108 | $guessed = $1; 109 | $guessedType = $2; 110 | } else { 111 | $guessed = $features[$#features]; 112 | $guessedType = ""; 113 | } 114 | pop(@features); 115 | if ($features[$#features] =~ /^([^-]*)-(.*)$/) { 116 | $correct = $1; 117 | $correctType = $2; 118 | } else { 119 | $correct = 
$features[$#features]; 120 | $correctType = ""; 121 | } 122 | pop(@features); 123 | # ($guessed,$guessedType) = split(/-/,pop(@features)); 124 | # ($correct,$correctType) = split(/-/,pop(@features)); 125 | $guessedType = $guessedType ? $guessedType : ""; 126 | $correctType = $correctType ? $correctType : ""; 127 | $firstItem = shift(@features); 128 | 129 | # 1999-06-26 sentence breaks should always be counted as out of chunk 130 | if ( $firstItem eq $boundary ) { $guessed = "O"; } 131 | 132 | if ($inCorrect) { 133 | if ( &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 134 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 135 | $lastGuessedType eq $lastCorrectType) { 136 | $inCorrect=$false; 137 | $correctChunk++; 138 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 139 | $correctChunk{$lastCorrectType}+1 : 1; 140 | } elsif ( 141 | &endOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) != 142 | &endOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) or 143 | $guessedType ne $correctType ) { 144 | $inCorrect=$false; 145 | } 146 | } 147 | 148 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) and 149 | &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) and 150 | $guessedType eq $correctType) { $inCorrect = $true; } 151 | 152 | if ( &startOfChunk($lastCorrect,$correct,$lastCorrectType,$correctType) ) { 153 | $foundCorrect++; 154 | $foundCorrect{$correctType} = $foundCorrect{$correctType} ? 155 | $foundCorrect{$correctType}+1 : 1; 156 | } 157 | if ( &startOfChunk($lastGuessed,$guessed,$lastGuessedType,$guessedType) ) { 158 | $foundGuessed++; 159 | $foundGuessed{$guessedType} = $foundGuessed{$guessedType} ? 160 | $foundGuessed{$guessedType}+1 : 1; 161 | } 162 | if ( $firstItem ne $boundary ) { 163 | if ( $correct eq $guessed and $guessedType eq $correctType ) { 164 | $correctTags++; 165 | } 166 | $tokenCounter++; 167 | } 168 | 169 | $lastGuessed = $guessed; 170 | $lastCorrect = $correct; 171 | $lastGuessedType = $guessedType; 172 | $lastCorrectType = $correctType; 173 | } 174 | if ($inCorrect) { 175 | $correctChunk++; 176 | $correctChunk{$lastCorrectType} = $correctChunk{$lastCorrectType} ? 177 | $correctChunk{$lastCorrectType}+1 : 1; 178 | } 179 | 180 | if (not $latex) { 181 | # compute overall precision, recall and FB1 (default values are 0.0) 182 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 183 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 184 | $FB1 = 2*$precision*$recall/($precision+$recall) 185 | if ($precision+$recall > 0); 186 | 187 | # print overall performance 188 | printf "processed $tokenCounter tokens with $foundCorrect phrases; "; 189 | printf "found: $foundGuessed phrases; correct: $correctChunk.\n"; 190 | if ($tokenCounter>0) { 191 | printf "accuracy: %6.2f%%; ",100*$correctTags/$tokenCounter; 192 | printf "precision: %6.2f%%; ",$precision; 193 | printf "recall: %6.2f%%; ",$recall; 194 | printf "FB1: %6.2f\n",$FB1; 195 | } 196 | } 197 | 198 | # sort chunk type names 199 | undef($lastType); 200 | @sortedTypes = (); 201 | foreach $i (sort (keys %foundCorrect,keys %foundGuessed)) { 202 | if (not($lastType) or $lastType ne $i) { 203 | push(@sortedTypes,($i)); 204 | } 205 | $lastType = $i; 206 | } 207 | # print performance per chunk type 208 | if (not $latex) { 209 | for $i (@sortedTypes) { 210 | $correctChunk{$i} = $correctChunk{$i} ? 
$correctChunk{$i} : 0; 211 | if (not($foundGuessed{$i})) { $foundGuessed{$i} = 0; $precision = 0.0; } 212 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 213 | if (not($foundCorrect{$i})) { $recall = 0.0; } 214 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 215 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 216 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 217 | printf "%17s: ",$i; 218 | printf "precision: %6.2f%%; ",$precision; 219 | printf "recall: %6.2f%%; ",$recall; 220 | printf "FB1: %6.2f %d\n",$FB1,$foundGuessed{$i}; 221 | } 222 | } else { 223 | print " & Precision & Recall & F\$_{\\beta=1} \\\\\\hline"; 224 | for $i (@sortedTypes) { 225 | $correctChunk{$i} = $correctChunk{$i} ? $correctChunk{$i} : 0; 226 | if (not($foundGuessed{$i})) { $precision = 0.0; } 227 | else { $precision = 100*$correctChunk{$i}/$foundGuessed{$i}; } 228 | if (not($foundCorrect{$i})) { $recall = 0.0; } 229 | else { $recall = 100*$correctChunk{$i}/$foundCorrect{$i}; } 230 | if ($precision+$recall == 0.0) { $FB1 = 0.0; } 231 | else { $FB1 = 2*$precision*$recall/($precision+$recall); } 232 | printf "\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\", 233 | $i,$precision,$recall,$FB1; 234 | } 235 | print "\\hline\n"; 236 | $precision = 0.0; 237 | $recall = 0; 238 | $FB1 = 0.0; 239 | $precision = 100*$correctChunk/$foundGuessed if ($foundGuessed > 0); 240 | $recall = 100*$correctChunk/$foundCorrect if ($foundCorrect > 0); 241 | $FB1 = 2*$precision*$recall/($precision+$recall) 242 | if ($precision+$recall > 0); 243 | printf "Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline\n", 244 | $precision,$recall,$FB1; 245 | } 246 | 247 | exit 0; 248 | 249 | # endOfChunk: checks if a chunk ended between the previous and current word 250 | # arguments: previous and current chunk tags, previous and current types 251 | # note: this code is capable of handling other chunk representations 252 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 253 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 254 | 255 | sub endOfChunk { 256 | my $prevTag = shift(@_); 257 | my $tag = shift(@_); 258 | my $prevType = shift(@_); 259 | my $type = shift(@_); 260 | my $chunkEnd = $false; 261 | 262 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkEnd = $true; } 263 | if ( $prevTag eq "B" and $tag eq "O" ) { $chunkEnd = $true; } 264 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkEnd = $true; } 265 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 266 | 267 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkEnd = $true; } 268 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkEnd = $true; } 269 | if ( $prevTag eq "E" and $tag eq "O" ) { $chunkEnd = $true; } 270 | if ( $prevTag eq "I" and $tag eq "O" ) { $chunkEnd = $true; } 271 | 272 | if ($prevTag ne "O" and $prevTag ne "." 
and $prevType ne $type) { 273 | $chunkEnd = $true; 274 | } 275 | 276 | # corrected 1998-12-22: these chunks are assumed to have length 1 277 | if ( $prevTag eq "]" ) { $chunkEnd = $true; } 278 | if ( $prevTag eq "[" ) { $chunkEnd = $true; } 279 | 280 | return($chunkEnd); 281 | } 282 | 283 | # startOfChunk: checks if a chunk started between the previous and current word 284 | # arguments: previous and current chunk tags, previous and current types 285 | # note: this code is capable of handling other chunk representations 286 | # than the default CoNLL-2000 ones, see EACL'99 paper of Tjong 287 | # Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006 288 | 289 | sub startOfChunk { 290 | my $prevTag = shift(@_); 291 | my $tag = shift(@_); 292 | my $prevType = shift(@_); 293 | my $type = shift(@_); 294 | my $chunkStart = $false; 295 | 296 | if ( $prevTag eq "B" and $tag eq "B" ) { $chunkStart = $true; } 297 | if ( $prevTag eq "I" and $tag eq "B" ) { $chunkStart = $true; } 298 | if ( $prevTag eq "O" and $tag eq "B" ) { $chunkStart = $true; } 299 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 300 | 301 | if ( $prevTag eq "E" and $tag eq "E" ) { $chunkStart = $true; } 302 | if ( $prevTag eq "E" and $tag eq "I" ) { $chunkStart = $true; } 303 | if ( $prevTag eq "O" and $tag eq "E" ) { $chunkStart = $true; } 304 | if ( $prevTag eq "O" and $tag eq "I" ) { $chunkStart = $true; } 305 | 306 | if ($tag ne "O" and $tag ne "." and $prevType ne $type) { 307 | $chunkStart = $true; 308 | } 309 | 310 | # corrected 1998-12-22: these chunks are assumed to have length 1 311 | if ( $tag eq "[" ) { $chunkStart = $true; } 312 | if ( $tag eq "]" ) { $chunkStart = $true; } 313 | 314 | return($chunkStart); 315 | } 316 | -------------------------------------------------------------------------------- /src/initializations.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import numpy as np 3 | import backend as K 4 | from utils import get_from_module 5 | 6 | def get_fans(shape, dim_ordering='th'): 7 | if len(shape) == 2: 8 | fan_in = shape[0] 9 | fan_out = shape[1] 10 | elif len(shape) == 4 or len(shape) == 5: 11 | # assuming convolution kernels (2D or 3D). 12 | # TH kernel shape: (depth, input_depth, ...) 13 | # TF kernel shape: (..., input_depth, depth) 14 | if dim_ordering == 'th': 15 | receptive_field_size = np.prod(shape[2:]) 16 | fan_in = shape[1] * receptive_field_size 17 | fan_out = shape[0] * receptive_field_size 18 | elif dim_ordering == 'tf': 19 | receptive_field_size = np.prod(shape[:2]) 20 | fan_in = shape[-2] * receptive_field_size 21 | fan_out = shape[-1] * receptive_field_size 22 | else: 23 | raise ValueError('Invalid dim_ordering: ' + dim_ordering) 24 | else: 25 | # no specific assumptions 26 | fan_in = np.sqrt(np.prod(shape)) 27 | fan_out = np.sqrt(np.prod(shape)) 28 | return fan_in, fan_out 29 | 30 | 31 | def uniform(shape, scale=0.05, name=None): 32 | return K.random_uniform_variable(shape, -scale, scale, name=name) 33 | 34 | 35 | def normal(shape, scale=0.05, name=None): 36 | return K.random_normal_variable(shape, 0.0, scale, name=name) 37 | 38 | 39 | def lecun_uniform(shape, name=None, dim_ordering='th'): 40 | ''' Reference: LeCun 98, Efficient Backprop 41 | http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf 42 | ''' 43 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 44 | scale = np.sqrt(3. 
/ fan_in) 45 | return uniform(shape, scale, name=name) 46 | 47 | 48 | def glorot_normal(shape, name=None, dim_ordering='th'): 49 | ''' Reference: Glorot & Bengio, AISTATS 2010 50 | ''' 51 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 52 | s = np.sqrt(2. / (fan_in + fan_out)) 53 | return normal(shape, s, name=name) 54 | 55 | 56 | def glorot_uniform(shape, name=None, dim_ordering='th'): 57 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 58 | s = np.sqrt(6. / (fan_in + fan_out)) 59 | return uniform(shape, s, name=name) 60 | 61 | 62 | def he_normal(shape, name=None, dim_ordering='th'): 63 | ''' Reference: He et al., http://arxiv.org/abs/1502.01852 64 | ''' 65 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 66 | s = np.sqrt(2. / fan_in) 67 | return normal(shape, s, name=name) 68 | 69 | 70 | def he_uniform(shape, name=None, dim_ordering='th'): 71 | fan_in, fan_out = get_fans(shape, dim_ordering=dim_ordering) 72 | s = np.sqrt(6. / fan_in) 73 | return uniform(shape, s, name=name) 74 | 75 | 76 | def orthogonal(shape, scale=1.1, name=None): 77 | ''' From Lasagne. Reference: Saxe et al., http://arxiv.org/abs/1312.6120 78 | ''' 79 | flat_shape = (shape[0], np.prod(shape[1:])) 80 | a = np.random.normal(0.0, 1.0, flat_shape) 81 | u, _, v = np.linalg.svd(a, full_matrices=False) 82 | # pick the one with the correct shape 83 | q = u if u.shape == flat_shape else v 84 | q = q.reshape(shape) 85 | return K.variable(scale * q[:shape[0], :shape[1]], name=name) 86 | 87 | 88 | def identity(shape, scale=1, name=None): 89 | if len(shape) != 2 or shape[0] != shape[1]: 90 | raise ValueError('Identity matrix initialization can only be used ' 91 | 'for 2D square matrices.') 92 | else: 93 | return K.variable(scale * np.identity(shape[0]), name=name) 94 | 95 | 96 | def zero(shape, name=None): 97 | return K.zeros(shape, name=name) 98 | 99 | 100 | def one(shape, name=None): 101 | return K.ones(shape, name=name) 102 | 103 | 104 | def get(identifier, **kwargs): 105 | return get_from_module(identifier, globals(), 106 | 'initialization', kwargs=kwargs) 107 | -------------------------------------------------------------------------------- /src/loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import codecs 4 | from utils import create_dico, create_mapping, zero_digits 5 | from utils import iob2, iob_iobes 6 | unknown_word=''; 7 | 8 | def load_sentences(path, lower, zeros): 9 | #{{{ 10 | """ 11 | Load sentences. A line must contain at least a word and its tag. 12 | Sentences are separated by empty lines. 13 | """ 14 | sentences = [] 15 | sentence = [] 16 | for line in codecs.open(path, 'r', 'utf8'): 17 | line = zero_digits(line.rstrip()) if zeros else line.rstrip() 18 | if not line: 19 | if len(sentence) > 0: 20 | if 'DOCSTART' not in sentence[0][0]: 21 | sentences.append(sentence) 22 | sentence = [] 23 | else: 24 | word = line.split() 25 | assert len(word) >= 2 26 | sentence.append(word) 27 | if len(sentence) > 0: 28 | if 'DOCSTART' not in sentence[0][0]: 29 | sentences.append(sentence) 30 | return sentences 31 | #}}} 32 | 33 | def update_tag_scheme(sentences, tag_scheme,removeTag=None): 34 | #{{{ 35 | """ 36 | Check and update sentences tagging scheme to IOB2. 37 | Only IOB1 and IOB2 schemes are accepted. 
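As a quick numeric illustration of the fan-based scales defined in get_fans(), glorot_uniform() and he_uniform() above, here is a standalone numpy sketch (not code from this repository; the layer shape is a made-up example):

import numpy as np

# For a 2-D weight matrix, get_fans() returns the two dimensions directly,
# so the Glorot-uniform limit is sqrt(6 / (fan_in + fan_out)) and the
# He-uniform limit is sqrt(6 / fan_in).
fan_in, fan_out = 100, 50                            # hypothetical layer shape
glorot_limit = np.sqrt(6.0 / (fan_in + fan_out))     # = 0.2
he_limit = np.sqrt(6.0 / fan_in)                     # ~ 0.245
W = np.random.uniform(-glorot_limit, glorot_limit, size=(fan_in, fan_out))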
38 | """ 39 | for i, s in enumerate(sentences): 40 | tags = [w[-1] for w in s] 41 | # Check that tags are given in the IOB format 42 | if not iob2(tags): 43 | s_str = '\n'.join(' '.join(w) for w in s) 44 | raise Exception('Sentences should be given in IOB format! ' + 45 | 'Please check sentence %i:\n%s' % (i, s_str)) 46 | if tag_scheme == 'iob': 47 | # If format was IOB1, we convert to IOB2 48 | for word, new_tag in zip(s, tags): 49 | if removeTag is not None: 50 | if new_tag[2:] in removeTag: 51 | word[-1]='O'; 52 | else: 53 | word[-1]=new_tag; 54 | elif tag_scheme == 'iobes': 55 | new_tags = iob_iobes(tags) 56 | for word, new_tag in zip(s, new_tags): 57 | word[-1] = new_tag 58 | else: 59 | raise Exception('Unknown tagging scheme!') 60 | #}}} 61 | 62 | def word_mapping(sentences, lower): 63 | #{{{ 64 | """ 65 | Create a dictionary and a mapping of words, sorted by frequency. 66 | """ 67 | words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences] 68 | dico = create_dico(words) 69 | dico[''] = 10000000 70 | word_to_id, id_to_word = create_mapping(dico) 71 | print "Found %i unique words (%i in total)" % ( 72 | len(dico), sum(len(x) for x in words) 73 | ) 74 | return dico, word_to_id, id_to_word 75 | #}}} 76 | 77 | def feature_mapping(sentences,index,featureName="",isPos=False): 78 | #{{{ 79 | """ 80 | Create a dictionary and mapping of characters, sorted by frequency. 81 | """ 82 | if isPos: 83 | features = [[w[0].lower()+"_"+w[index] for w in s] for s in sentences] 84 | else: 85 | features = [[w[index] for w in s] for s in sentences] 86 | dico = create_dico(features) 87 | dico[unknown_word]=10000000 88 | feature_to_id, id_to_feature = create_mapping(dico) 89 | print "Found %i unique %s features" % (len(dico),featureName) 90 | return dico, feature_to_id, id_to_feature 91 | #}}} 92 | 93 | def char_mapping(sentences): 94 | #{{{ 95 | """ 96 | Create a dictionary and mapping of characters, sorted by frequency. 97 | """ 98 | chars = ["".join([w[0] for w in s]) for s in sentences] 99 | dico = create_dico(chars) 100 | char_to_id, id_to_char = create_mapping(dico) 101 | print "Found %i unique characters" % len(dico) 102 | return dico, char_to_id, id_to_char 103 | #}}} 104 | 105 | def tag_mapping(sentences): 106 | #{{{ 107 | """ 108 | Create a dictionary and a mapping of tags, sorted by frequency. 109 | """ 110 | tags = [[word[-1] for word in s] for s in sentences] 111 | dico = create_dico(tags) 112 | tag_to_id, id_to_tag = create_mapping(dico) 113 | print "Found %i unique named entity tags" % len(dico) 114 | return dico, tag_to_id, id_to_tag 115 | #}}} 116 | 117 | def cap_feature(s): 118 | #{{{ 119 | """ 120 | Capitalization feature: 121 | 0 = low caps 122 | 1 = all caps 123 | 2 = first letter caps 124 | 3 = one capital (not first letter) 125 | """ 126 | if s.lower() == s: 127 | return 0 128 | elif s.upper() == s: 129 | return 1 130 | elif s[0].upper() == s[0]: 131 | return 2 132 | else: 133 | return 3 134 | #}}} 135 | 136 | def prepare_sentence(str_words, word_to_id, char_to_id, lower=False): 137 | #{{{ 138 | """ 139 | Prepare a sentence for evaluation. 
140 | """ 141 | def f(x,flag=lower): return x.lower() if flag else x 142 | words = [word_to_id[f(w) if f(w) in word_to_id else ''] 143 | for w in str_words] 144 | charLower=False; 145 | if charLower: 146 | chars = [[char_to_id[c] for c in w.lower() if c in char_to_id] 147 | for w in str_words] 148 | else: 149 | chars = [[char_to_id[c] for c in w if c in char_to_id] 150 | for w in str_words] 151 | caps = [cap_feature(w) for w in str_words] 152 | return { 153 | 'str_words': str_words, 154 | 'words': words, 155 | 'chars': chars, 156 | 'caps': caps 157 | } 158 | #}}} 159 | 160 | def prepare_dataset(sentences,docLen,parameters, 161 | lower=False,isTest=False): 162 | #{{{ 163 | """ 164 | Prepare the dataset. Return a list of lists of dictionaries containing: 165 | - word indexes 166 | - word char indexes 167 | - tag indexes 168 | """ 169 | def f(x): return x.lower() if lower else x 170 | #get mapping 171 | #{{{ 172 | features=parameters['features']; 173 | feature2IdMap=features['feature2IdMap']; 174 | word_to_id=feature2IdMap['word']; 175 | char_to_id=feature2IdMap['char']; 176 | tag_to_id=feature2IdMap['tag']; 177 | if features['lemma']['isUsed']: 178 | lemma_to_id=feature2IdMap['lemma']; 179 | if features['pos']['isUsed']: 180 | pos_to_id=feature2IdMap['pos']; 181 | if features['chunk']['isUsed']: 182 | chunk_to_id=feature2IdMap['chunk']; 183 | if features['dic']['isUsed']: 184 | dic_to_id=feature2IdMap['dic']; 185 | #}}} 186 | data = [] 187 | if docLen is not None and len(sentences) != len(docLen): 188 | print "len(doc) != len(docLen)"; 189 | assert 0; 190 | i=0; 191 | for s in sentences: 192 | str_words = [w[0] for w in s] 193 | elem=prepare_sentence(str_words,word_to_id,char_to_id,lower); 194 | words = elem['words'] 195 | # Skip characters that are not in the training set 196 | chars = elem['chars'] 197 | caps = elem['caps']; 198 | if not isTest: 199 | tags = [tag_to_id[w[-1]] for w in s] 200 | 201 | e={ 202 | 'str_words': str_words, 203 | 'words': words, 204 | 'chars': chars, 205 | 'caps': caps, 206 | } 207 | 208 | #add features 209 | #{{{ 210 | if features['lemma']['isUsed']: 211 | lemma=[lemma_to_id[w[1]] 212 | if w[1] in lemma_to_id 213 | else lemma_to_id[unknown_word] for w in s]; 214 | e['lemma']=lemma; 215 | if features['pos']['isUsed']: 216 | pos=[pos_to_id[w[2]] 217 | if w[2] in pos_to_id 218 | else pos_to_id[unknown_word] for w in s]; 219 | e['pos']=pos; 220 | if features['chunk']['isUsed']: 221 | chunk=[chunk_to_id[w[3]] 222 | if w[3] in chunk_to_id 223 | else chunk_to_id[unknown_word] for w in s]; 224 | e['chunk']=chunk; 225 | if features['dic']['isUsed']: 226 | ner=[dic_to_id[w[4]] for w in s]; 227 | e['dic']=ner; 228 | #}}} 229 | 230 | #append doc len to data 231 | if parameters.has_key('sentencesLevelLoss') \ 232 | and parameters['sentencesLevelLoss']: 233 | lens=docLen[i]; 234 | i+=1; 235 | e['lens']=lens; 236 | 237 | if not isTest: 238 | e['tags']=tags; 239 | 240 | 241 | 242 | data.append(e); 243 | return data 244 | #}}} 245 | 246 | def augment_with_pretrained(dictionary, ext_emb_path, words): 247 | #{{{ 248 | """ 249 | Augment the dictionary with words that have a pretrained embedding. 250 | If `words` is None, we add every word that has a pretrained embedding 251 | to the dictionary, otherwise, we only add the words that are given by 252 | `words` (typically the words in the development and test sets.) 253 | """ 254 | print 'Loading pretrained embeddings from %s...' 
% ext_emb_path 255 | assert os.path.isfile(ext_emb_path) 256 | 257 | # Load pretrained embeddings from file 258 | pretrained = set([ 259 | line.rstrip().split()[0].strip() 260 | for line in codecs.open(ext_emb_path, 'r', 'utf-8') 261 | if len(ext_emb_path) > 0 262 | ]) 263 | 264 | # We either add every word in the pretrained file, 265 | # or only words given in the `words` list to which 266 | # we can assign a pretrained embedding 267 | if words is None: 268 | for word in pretrained: 269 | if word not in dictionary: 270 | dictionary[word] = 0 271 | else: 272 | for word in words: 273 | if any(x in pretrained for x in [ 274 | word, 275 | word.lower(), 276 | re.sub('\d', '0', word.lower()) 277 | ]) and word not in dictionary: 278 | dictionary[word] = 0 279 | 280 | word_to_id, id_to_word = create_mapping(dictionary) 281 | return dictionary, word_to_id, id_to_word 282 | #}}} 283 | -------------------------------------------------------------------------------- /src/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import numpy as np 4 | import scipy.io 5 | import theano 6 | import theano.tensor as T 7 | import codecs 8 | import cPickle 9 | 10 | from utils import shared, set_values, get_name 11 | from nn import HiddenLayer, EmbeddingLayer, DropoutLayer, forward 12 | from nn import LSTM; 13 | #from nn import LSTM_normal as LSTM; 14 | from nn import AttentionLayer; 15 | from optimization import Optimization 16 | 17 | def loadPreEmbFeatures(fName,feature_to_id,weights,lower=False): 18 | #{{{ 19 | def f(x): return x.lower() if lower else x 20 | #to lower 21 | feature_to_id_=feature_to_id; 22 | if lower: 23 | feature_to_id_lower={}; 24 | for elem in feature_to_id.items(): 25 | feature_to_id_lower[elem[0].lower()]=elem[1]; 26 | feature_to_id_=feature_to_id_lower; 27 | feature_dim=weights.shape[1]; 28 | 29 | invalid_count=0; 30 | valid_count=0; 31 | for line in codecs.open(fName,'r','utf-8'): 32 | line=line.rstrip().split(); 33 | if len(line) == feature_dim+1 and line[0] in feature_to_id_: 34 | weights[feature_to_id_[line[0]]]=np.array( 35 | [float(x) for x in line[1:]] 36 | ).astype(theano.config.floatX) 37 | valid_count+=1; 38 | else: 39 | invalid_count+=1; 40 | print "when loading %s ,%d Invalid line,%d valid line" %(fName,invalid_count,valid_count); 41 | #}}} 42 | 43 | class Model(object): 44 | """ 45 | Network architecture. 46 | """ 47 | def __init__(self, parameters=None, models_path=None, 48 | model_path=None,Training=False): 49 | #{{{ 50 | """ 51 | Initialize the model. We either provide the parameters and a path where 52 | we store the models, or the location of a trained model. 
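augment_with_pretrained() and loadPreEmbFeatures() above both read a plain-text embedding file with one token per line followed by its vector components; a minimal hypothetical example with a 5-dimensional embedding (tokens and values are made up, and the real dimension is whatever word_dim or the feature dimension is set to):

benzene  0.12 -0.03  0.44  0.91 -0.27
ethanol -0.05  0.31  0.08 -0.66  0.10

loadPreEmbFeatures() counts any line whose token count differs from the vector dimension plus one as invalid and skips it.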
53 | """ 54 | if Training: 55 | #{{{ 56 | assert parameters and models_path 57 | # Create a name based on the parameters 58 | self.parameters = parameters 59 | self.name = get_name(parameters) 60 | # Model location 61 | if model_path is None: 62 | model_path = os.path.join(models_path, self.name) 63 | self.model_path = model_path 64 | self.parameters_path = os.path.join(model_path, 'parameters.pkl') 65 | self.mappings_path = os.path.join(model_path, 'mappings.pkl') 66 | # Create directory for the model if it does not exist 67 | if not os.path.exists(self.model_path): 68 | os.makedirs(self.model_path) 69 | # Save the parameters to disk 70 | with open(self.parameters_path, 'wb') as f: 71 | cPickle.dump(parameters, f) 72 | #}}} 73 | else: 74 | #{{{ 75 | # Model location 76 | self.model_path = model_path 77 | self.parameters_path = os.path.join(model_path, 'parameters.pkl') 78 | self.mappings_path = os.path.join(model_path, 'mappings.pkl') 79 | # Create directory for the model if it does not exist 80 | if not os.path.exists(self.model_path): 81 | os.makedirs(self.model_path) 82 | # Save the parameters to disk 83 | with open(self.parameters_path, 'rb') as f: 84 | self.parameters=cPickle.load(f); 85 | self.reload_mappings(); 86 | self.components = {} 87 | #}}} 88 | #}}} 89 | 90 | def save_mappings(self, id_to_word, id_to_char, id_to_tag): 91 | #{{{ 92 | """ 93 | We need to save the mappings if we want to use the model later. 94 | """ 95 | self.id_to_word = id_to_word 96 | self.id_to_char = id_to_char 97 | self.id_to_tag = id_to_tag 98 | with open(self.mappings_path, 'wb') as f: 99 | mappings = { 100 | 'id_to_word': self.id_to_word, 101 | 'id_to_char': self.id_to_char, 102 | 'id_to_tag': self.id_to_tag, 103 | } 104 | cPickle.dump(mappings, f) 105 | #}}} 106 | 107 | def reload_mappings(self): 108 | #{{{ 109 | """ 110 | Load mappings from disk. 111 | """ 112 | with open(self.mappings_path, 'rb') as f: 113 | mappings = cPickle.load(f) 114 | self.id_to_word = mappings['id_to_word'] 115 | self.id_to_char = mappings['id_to_char'] 116 | self.id_to_tag = mappings['id_to_tag'] 117 | #}}} 118 | 119 | def add_component(self, param): 120 | """ 121 | Add a new parameter to the network. 122 | """ 123 | if param.name in self.components: 124 | raise Exception('The network already has a parameter "%s"!' 125 | % param.name) 126 | self.components[param.name] = param 127 | 128 | def modelScore(self,tag_ids,scores,s_len): 129 | #{{{ 130 | """ 131 | ATTENTATION THIS FUNCTION IS SYMBOL PROGRAMMING 132 | this function is to return the score of our model at a fixed sentence label 133 | @param: 134 | scores: the scores matrix ,the output of our model 135 | tag: a numpy array, which represent one sentence label 136 | sent_lens: a scalar number, the length of sentence. 137 | because our sentence label will be expand to max sentence length, 138 | so we will use this to get the original sentence label. 
139 | @return: 140 | a scalar number ,the score; 141 | """ 142 | #{{{ 143 | n_tags=self.output_dim; 144 | transitions=self.transitions; 145 | #score from tags_scores 146 | real_path_score = scores[T.arange(s_len), tag_ids].sum() 147 | 148 | # Score from transitions 149 | b_id = theano.shared(value=np.array([n_tags], dtype=np.int32)) 150 | e_id = theano.shared(value=np.array([n_tags + 1], dtype=np.int32)) 151 | padded_tags_ids = T.concatenate([b_id, tag_ids, e_id], axis=0) 152 | real_path_score += transitions[ 153 | padded_tags_ids[T.arange(s_len + 1)], 154 | padded_tags_ids[T.arange(s_len + 1) + 1] 155 | ].sum() 156 | #to prevent T.exp(real_path_score) to be inf 157 | #return real_path_score; 158 | return real_path_score/s_len; 159 | #}}} 160 | #}}} 161 | 162 | def save(self): 163 | #{{{ 164 | """ 165 | Write components values to disk. 166 | """ 167 | for name, param in self.components.items(): 168 | param_path = os.path.join(self.model_path, "%s.mat" % name) 169 | if hasattr(param, 'params'): 170 | param_values = {p.name: p.get_value() for p in param.params} 171 | else: 172 | param_values = {name: param.get_value()} 173 | scipy.io.savemat(param_path, param_values) 174 | #}}} 175 | 176 | def reload(self,features=None): 177 | #{{{ 178 | """ 179 | Load components values from disk. 180 | """ 181 | featureLayerNameMap=['pos_layer','lemma_layer', 182 | 'chunk_layer','dic_layer']; 183 | for name, param in self.components.items(): 184 | #when feature is use to attended and not lstm-input, 185 | #we will not reload the param 186 | if features is not None and name in featureLayerNameMap: 187 | featuresName=name[:name.find('_')]; 188 | if features[featuresName]['attended']==1 and \ 189 | features[featuresName]['lstm-input']==0: 190 | continue; 191 | param_path = os.path.join(self.model_path, "%s.mat" % name) 192 | param_values = scipy.io.loadmat(param_path) 193 | if hasattr(param, 'params'): 194 | for p in param.params: 195 | set_values(p.name, p, param_values[p.name]) 196 | else: 197 | set_values(name, param, param_values[name]) 198 | #}}} 199 | 200 | def build4(self,parameters): 201 | #{{{ 202 | """ 203 | Build the network. 
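For intuition, the score that modelScore() above assigns to a fixed tag sequence can be restated in plain numpy. This sketch mirrors the emission and transition terms, including the extra begin state n_tags and end state n_tags + 1 and the division by sentence length; it assumes a transition matrix large enough to index those two extra states:

import numpy as np

def path_score(scores, tag_ids, transitions, n_tags):
    # emission term: score of the chosen tag at every position
    s_len = len(tag_ids)
    score = scores[np.arange(s_len), tag_ids].sum()
    # transition term over the sequence padded with the begin/end states
    padded = np.concatenate([[n_tags], tag_ids, [n_tags + 1]])
    score += transitions[padded[:-1], padded[1:]].sum()
    # normalised by length, as in modelScore(), to keep exp(score) finite
    return score / float(s_len)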
204 | """ 205 | #some parameters 206 | dropout=parameters['dropout'] ; 207 | char_dim=parameters['char_dim']; 208 | char_lstm_dim=parameters['char_lstm_dim']; 209 | char_bidirect=parameters['char_bidirect']; 210 | word_dim=parameters['word_dim']; 211 | word_lstm_dim=parameters['word_lstm_dim']; 212 | word_bidirect=parameters['word_bidirect']; 213 | lr_method=parameters['lr_method']; 214 | pre_emb=parameters['pre_emb']; 215 | crf=parameters['crf']; 216 | cap_dim=parameters['cap_dim']; 217 | training=parameters['training']; 218 | features=parameters['features']; 219 | useAttend=parameters['useAttend']; 220 | if useAttend: 221 | reloadParam=parameters['loading']; 222 | else: 223 | reloadParam=None; 224 | if reloadParam is not None: 225 | reloadPath=parameters['loading_path']; 226 | sentencesLevelLoss=parameters['sentencesLevelLoss']; 227 | 228 | # Training parameters 229 | n_words = len(self.id_to_word) 230 | n_chars = len(self.id_to_char) 231 | n_tags = len(self.id_to_tag) 232 | self.output_dim = len(self.id_to_tag); 233 | self.transitions = shared((self.output_dim+ 1, self.output_dim ), 'transitions') 234 | 235 | # Number of capitalization features 236 | if cap_dim: 237 | n_cap = 4 238 | 239 | # Network variables 240 | is_train = T.iscalar('is_train') 241 | word_ids = T.ivector(name='word_ids') 242 | wordTrue_ids=T.ivector(name='wordTrue_ids'); 243 | char_for_ids = T.imatrix(name='char_for_ids') 244 | char_rev_ids = T.imatrix(name='char_rev_ids') 245 | char_pos_ids = T.ivector(name='char_pos_ids') 246 | docLen=T.ivector(name='docLen'); 247 | tag_ids = T.ivector(name='tag_ids') 248 | if cap_dim: 249 | cap_ids = T.ivector(name='cap_ids') 250 | 251 | #some features 252 | if features is not None and features['lemma']['isUsed']: 253 | lemma_ids=T.ivector(name='lemma_ids'); 254 | if features is not None and features['pos']['isUsed']: 255 | pos_ids=T.ivector(name='pos_ids'); 256 | if features is not None and features['chunk']['isUsed']: 257 | chunk_ids=T.ivector(name='chunk_ids'); 258 | if features is not None and features['dic']['isUsed']: 259 | dic_ids=T.ivector(name='dic_ids'); 260 | 261 | # Sentence length 262 | s_len = (word_ids if word_dim else char_pos_ids).shape[0] 263 | 264 | # Final input (all word features) 265 | input_dim = 0 266 | inputs = [] 267 | 268 | # Word inputs 269 | #{{{ 270 | if word_dim: 271 | input_dim += word_dim 272 | word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') 273 | word_input = word_layer.link(word_ids) 274 | wordTrue_input=word_layer.link(wordTrue_ids); 275 | inputs.append(word_input) 276 | # Initialize with pretrained embeddings 277 | if pre_emb and training: 278 | new_weights = word_layer.embeddings.get_value() 279 | print 'Loading pretrained embeddings from %s...' 
% pre_emb 280 | pretrained = {} 281 | emb_invalid = 0 282 | for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): 283 | line = line.rstrip().split() 284 | if len(line) == word_dim + 1: 285 | pretrained[line[0]] = np.array( 286 | [float(x) for x in line[1:]] 287 | ).astype(np.float32) 288 | else: 289 | emb_invalid += 1 290 | if emb_invalid > 0: 291 | print 'WARNING: %i invalid lines' % emb_invalid 292 | c_found = 0 293 | c_lower = 0 294 | c_zeros = 0 295 | # Lookup table initialization 296 | for i in xrange(n_words): 297 | word = self.id_to_word[i] 298 | if word in pretrained: 299 | new_weights[i] = pretrained[word] 300 | c_found += 1 301 | elif word.lower() in pretrained: 302 | new_weights[i] = pretrained[word.lower()] 303 | c_lower += 1 304 | elif re.sub('\d', '0', word.lower()) in pretrained: 305 | new_weights[i] = pretrained[ 306 | re.sub('\d', '0', word.lower()) 307 | ] 308 | c_zeros += 1 309 | word_layer.embeddings.set_value(new_weights) 310 | print 'Loaded %i pretrained embeddings.' % len(pretrained) 311 | print ('%i / %i (%.4f%%) words have been initialized with ' 312 | 'pretrained embeddings.') % ( 313 | c_found + c_lower + c_zeros, n_words, 314 | 100. * (c_found + c_lower + c_zeros) / n_words 315 | ) 316 | print ('%i found directly, %i after lowercasing, ' 317 | '%i after lowercasing + zero.') % ( 318 | c_found, c_lower, c_zeros 319 | )#}}} 320 | 321 | # Chars inputs 322 | #{{{ 323 | if char_dim: 324 | input_dim += char_lstm_dim 325 | char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') 326 | 327 | char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, 328 | name='char_lstm_for') 329 | char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, 330 | name='char_lstm_rev') 331 | 332 | char_lstm_for.link(char_layer.link(char_for_ids)) 333 | char_lstm_rev.link(char_layer.link(char_rev_ids)) 334 | 335 | char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ 336 | T.arange(s_len), char_pos_ids 337 | ] 338 | char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ 339 | T.arange(s_len), char_pos_ids 340 | ] 341 | char_output=T.concatenate([char_for_output,char_rev_output],axis=-1); 342 | inputs.append(char_for_output) 343 | if char_bidirect: 344 | inputs.append(char_rev_output) 345 | input_dim += char_lstm_dim 346 | #}}} 347 | 348 | # Capitalization feature 349 | # 350 | if cap_dim: 351 | input_dim += cap_dim 352 | cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') 353 | inputs.append(cap_layer.link(cap_ids)) 354 | 355 | #add feature 356 | #{{{ 357 | if features is not None and features['lemma']['isUsed']: 358 | lemma_layer=EmbeddingLayer(features['lemma']['num'], 359 | features['lemma']['dim'], 360 | name='lemma_layer'); 361 | if features['lemma']['pre_emb'] is not "": 362 | new_weights=lemma_layer.embeddings.get_value(); 363 | loadPreEmbFeatures(features['lemma']['pre_emb'], 364 | features['feature_to_id_map']['lemma'], 365 | new_weights, 366 | lower=True); 367 | lemma_layer.embeddings.set_value(new_weights); 368 | lemma_output=lemma_layer.link(lemma_ids); 369 | if features['lemma']['lstm-input']: 370 | input_dim+=features['lemma']['dim']; 371 | inputs.append(lemma_output); 372 | if features is not None and features['pos']['isUsed']: 373 | pos_layer=EmbeddingLayer(features['pos']['num'], 374 | features['pos']['dim'], 375 | name='pos_layer'); 376 | if features['pos']['pre_emb'] is not "": 377 | new_weights=pos_layer.embeddings.get_value(); 378 | loadPreEmbFeatures(features['pos']['pre_emb'], 379 | features['feature_to_id_map']['pos'], 380 | 
new_weights); 381 | pos_layer.embeddings.set_value(new_weights); 382 | pos_output=pos_layer.link(pos_ids); 383 | if features['pos']['lstm-input']: 384 | input_dim+=features['pos']['dim']; 385 | inputs.append(pos_output); 386 | if features is not None and features['chunk']['isUsed']: 387 | chunk_layer=EmbeddingLayer(features['chunk']['num'], 388 | features['chunk']['dim'], 389 | name='chunk_layer'); 390 | chunk_output=chunk_layer.link(chunk_ids); 391 | if features['chunk']['lstm-input']: 392 | input_dim+=features['chunk']['dim']; 393 | inputs.append(chunk_output) 394 | if features is not None and features['dic']['isUsed']: 395 | dic_layer=EmbeddingLayer(features['dic']['num'], 396 | features['dic']['dim'], 397 | name='dic_layer'); 398 | dic_output=dic_layer.link(dic_ids); 399 | if features['dic']['lstm-input']: 400 | input_dim+=features['dic']['dim']; 401 | inputs.append(dic_output); 402 | #}}} 403 | 404 | # Prepare final input 405 | if len(inputs) != 1: 406 | inputs = T.concatenate(inputs, axis=1) 407 | 408 | # 409 | # Dropout on final input 410 | # 411 | if dropout: 412 | dropout_layer = DropoutLayer(p=dropout) 413 | input_train = dropout_layer.link(inputs) 414 | input_test = (1 - dropout) * inputs 415 | inputs = T.switch(T.neq(is_train, 0), input_train,input_test); 416 | 417 | # LSTM for words 418 | word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, 419 | name='word_lstm_for') 420 | word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, 421 | name='word_lstm_rev') 422 | if sentencesLevelLoss: 423 | def sentLSTM(i,output,input,lenVec): 424 | #{{{ 425 | Len=lenVec[i]; 426 | accLen=lenVec[:i].sum(); 427 | currentInput=input[accLen:accLen+Len]; 428 | word_lstm_for.link(currentInput); 429 | word_lstm_rev.link(currentInput[::-1,:]); 430 | wordForOutput=word_lstm_for.h; 431 | wordRevOutput=word_lstm_rev.h[::-1,:]; 432 | finalOutput=T.concatenate( 433 | [wordForOutput,wordRevOutput],axis=-1 434 | ) 435 | output=T.set_subtensor(output[accLen:accLen+Len], 436 | finalOutput); 437 | return output; 438 | #}}} 439 | result,update=theano.scan(fn=sentLSTM, 440 | outputs_info=T.zeros((inputs.shape[0],word_lstm_dim*2),dtype='float32'), 441 | sequences=[T.arange(docLen.shape[0])], 442 | non_sequences=[inputs,docLen]); 443 | 444 | word_lstm_for.link(inputs) 445 | word_lstm_rev.link(inputs[::-1, :]) 446 | word_for_output = word_lstm_for.h 447 | word_for_c=word_lstm_for.c; 448 | word_rev_output = word_lstm_rev.h[::-1, :] 449 | word_rev_c=word_lstm_rev.c[::-1,:]; 450 | 451 | final_c=T.concatenate( 452 | [word_for_c,word_rev_c], 453 | axis=-1 454 | ) 455 | final_output=result[-1] 456 | else : 457 | word_lstm_for.link(inputs) 458 | word_lstm_rev.link(inputs[::-1, :]) 459 | word_for_output = word_lstm_for.h 460 | word_for_c=word_lstm_for.c; 461 | word_rev_output = word_lstm_rev.h[::-1, :] 462 | word_rev_c=word_lstm_rev.c[::-1,:]; 463 | final_output = T.concatenate( 464 | [word_for_output, word_rev_output], 465 | axis=-1 466 | ) 467 | final_c=T.concatenate( 468 | [word_for_c,word_rev_c], 469 | axis=-1 470 | ) 471 | 472 | if useAttend: 473 | #attention layer 474 | attended=[]; 475 | attendedDim=0; 476 | if features is not None and features['word']['attended']: 477 | attended.append(wordTrue_input); 478 | attendedDim+=word_dim; 479 | if features is not None and features['char']['attended']: 480 | attended.append(char_output); 481 | attendedDim+=char_lstm_dim*2; 482 | if features is not None and features['lemma']['attended']: 483 | attended.append(lemma_output); 484 | 
attendedDim+=features['lemma']['dim']; 485 | if features is not None and features['pos']['attended']: 486 | attended.append(pos_output); 487 | attendedDim+=features['pos']['dim']; 488 | if features is not None and features['chunk']['attended']: 489 | attended.append(chunk_output); 490 | attendedDim+=features['chunk']['dim']; 491 | if features is not None and features['dic']['attended']: 492 | attended.append(dic_output); 493 | attendedDim+=features['dic']['dim']; 494 | 495 | attention_layer=AttentionLayer(attended_dim=attendedDim, 496 | state_dim=attendedDim, 497 | #attention_layer=AttentionLayer(attended_dim=word_lstm_dim*2, 498 | # state_dim=word_lstm_dim*2, 499 | source_dim=word_lstm_dim*2, 500 | scoreFunName=parameters['attenScoreFun'], 501 | name='attention_layer'); 502 | 503 | if len(attended)>1: 504 | attendedInput=T.concatenate(attended,axis=-1); 505 | else: 506 | attendedInput=attended[0]; 507 | 508 | final_output=attention_layer.link(attendedInput,attendedInput,final_output); 509 | #using lstm_state to compute attention 510 | #final_output=attention_layer.link(final_output,final_c,final_output); 511 | self.energy=attention_layer.energy; 512 | else: 513 | final_output=final_output; 514 | 515 | tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, 516 | name='tanh_layer', activation='tanh') 517 | final_output = tanh_layer.link(final_output) 518 | 519 | # Sentence to Named Entity tags - Score 520 | final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', 521 | activation=(None if crf else 'softmax')) 522 | tags_scores = final_layer.link(final_output) 523 | 524 | # No CRF 525 | if not crf: 526 | cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() 527 | # CRF 528 | else: 529 | if sentencesLevelLoss: 530 | #calcuate loss according to sentence instead of docLen 531 | def sentLoss(i,scores,trueIds,transitions,lenVec): 532 | #{{{ 533 | Len=lenVec[i]; 534 | accLen=lenVec[:i].sum(); 535 | currentTagsScores=scores[accLen:accLen+Len]; 536 | currentIds=trueIds[accLen:accLen+Len]; 537 | real_path_score = currentTagsScores[T.arange(Len), 538 | currentIds].sum() 539 | # Score from transitions 540 | padded_tags_ids = T.concatenate([[n_tags],currentIds], axis=0) 541 | real_path_score += transitions[ 542 | padded_tags_ids[T.arange(Len )], 543 | padded_tags_ids[T.arange(Len ) + 1] 544 | ].sum() 545 | 546 | all_paths_scores = forward(currentTagsScores,transitions) 547 | cost = - (real_path_score - all_paths_scores) 548 | return cost; 549 | #}}} 550 | result,update=theano.scan(fn=sentLoss, 551 | outputs_info=None, 552 | sequences=[T.arange(docLen.shape[0])], 553 | non_sequences=[tags_scores,tag_ids,self.transitions,docLen]) 554 | cost=result.sum(); 555 | else: 556 | real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() 557 | 558 | # Score from transitions 559 | padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0) 560 | real_path_score += self.transitions[ 561 | padded_tags_ids[T.arange(s_len )], 562 | padded_tags_ids[T.arange(s_len ) + 1] 563 | ].sum() 564 | 565 | all_paths_scores = forward(tags_scores, self.transitions) 566 | cost = - (real_path_score - all_paths_scores) 567 | 568 | # Network parameters 569 | params = [] 570 | if word_dim: 571 | self.add_component(word_layer) 572 | params.extend(word_layer.params) 573 | if char_dim: 574 | self.add_component(char_layer) 575 | self.add_component(char_lstm_for) 576 | params.extend(char_layer.params) 577 | params.extend(char_lstm_for.params) 578 | if char_bidirect: 579 | self.add_component(char_lstm_rev) 
580 | params.extend(char_lstm_rev.params) 581 | self.add_component(word_lstm_for) 582 | params.extend(word_lstm_for.params) 583 | if word_bidirect: 584 | self.add_component(word_lstm_rev) 585 | params.extend(word_lstm_rev.params) 586 | if cap_dim: 587 | self.add_component(cap_layer) 588 | params.extend(cap_layer.params) 589 | self.add_component(final_layer) 590 | params.extend(final_layer.params) 591 | if crf: 592 | self.add_component(self.transitions) 593 | params.append(self.transitions) 594 | if word_bidirect: 595 | self.add_component(tanh_layer) 596 | params.extend(tanh_layer.params) 597 | #add feature layer 598 | if features is not None and features['lemma']['isUsed']: 599 | self.add_component(lemma_layer); 600 | params.extend(lemma_layer.params); 601 | if features is not None and features['pos']['isUsed']: 602 | self.add_component(pos_layer); 603 | params.extend(pos_layer.params); 604 | if features is not None and features['chunk']['isUsed']: 605 | self.add_component(chunk_layer); 606 | params.extend(chunk_layer.params); 607 | if features is not None and features['dic']['isUsed']: 608 | self.add_component(dic_layer); 609 | params.extend(dic_layer.params); 610 | 611 | if useAttend and reloadParam: 612 | #reload pre-train params 613 | model_path=self.model_path; 614 | self.model_path=reloadPath; 615 | print "loading:",self.model_path; 616 | self.reload(features); 617 | self.model_path=model_path; 618 | 619 | if useAttend: 620 | #add attention_layer 621 | self.add_component(attention_layer); 622 | params.extend(attention_layer.params); 623 | 624 | # Prepare train and eval inputs 625 | eval_inputs = [] 626 | if word_dim: 627 | eval_inputs.append(word_ids) 628 | if char_dim: 629 | eval_inputs.append(char_for_ids) 630 | if char_bidirect: 631 | eval_inputs.append(char_rev_ids) 632 | eval_inputs.append(char_pos_ids) 633 | if cap_dim: 634 | eval_inputs.append(cap_ids) 635 | if useAttend: 636 | eval_inputs.append(wordTrue_ids); 637 | if sentencesLevelLoss: 638 | eval_inputs.append(docLen); 639 | #add feature input 640 | if features is not None and features['lemma']['isUsed']: 641 | eval_inputs.append(lemma_ids); 642 | if features is not None and features['pos']['isUsed']: 643 | eval_inputs.append(pos_ids); 644 | if features is not None and features['chunk']['isUsed']: 645 | eval_inputs.append(chunk_ids); 646 | if features is not None and features['dic']['isUsed']: 647 | eval_inputs.append(dic_ids); 648 | train_inputs = eval_inputs + [tag_ids] 649 | 650 | # Parse optimization method parameters 651 | if "-" in lr_method: 652 | lr_method_name = lr_method[:lr_method.find('-')] 653 | lr_method_parameters = {} 654 | for x in lr_method[lr_method.find('-') + 1:].split('-'): 655 | split = x.split('_') 656 | assert len(split) == 2 657 | lr_method_parameters[split[0]] = float(split[1]) 658 | else: 659 | lr_method_name = lr_method 660 | lr_method_parameters = {} 661 | 662 | # Compile training function 663 | print 'Compiling...' 
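The lr_method string parsed just above packs the optimizer name and its hyper-parameters into one dash/underscore-separated token; a standalone restatement of that parsing (the example value is hypothetical):

def parse_lr_method(lr_method):
    # "name-key1_value1-key2_value2" -> (name, {key: float(value), ...})
    if '-' not in lr_method:
        return lr_method, {}
    name = lr_method[:lr_method.find('-')]
    params = {}
    for item in lr_method[lr_method.find('-') + 1:].split('-'):
        key, value = item.split('_')
        params[key] = float(value)
    return name, params

print(parse_lr_method('sgd-lr_.005'))   # ('sgd', {'lr': 0.005})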
664 | if training: 665 | #constraints 666 | if useAttend: 667 | self.constraints=attention_layer.constraints; 668 | else: 669 | self.constraints={}; 670 | from keras import optimizers ; 671 | self.optimizer=optimizers.SGD(lr=0.001,momentum=0.9, 672 | decay=0.,nesterov=True,clipvalue=5); 673 | self.optimizer=optimizers.RMSprop(); 674 | #self.optimizer=SGD(lr=lr_method_parameters['lr'],clipvalue=5,gradient_noise=0.01) 675 | updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params,constraints=self.constraints, **lr_method_parameters) 676 | #updates = self.optimizer.get_updates(params,self.constraints,cost); 677 | f_train_outputs=[cost]; 678 | if useAttend: 679 | f_train_outputs.append(self.energy); 680 | 681 | f_train = theano.function( 682 | inputs=train_inputs, 683 | outputs=f_train_outputs, 684 | updates=updates, 685 | on_unused_input='ignore', 686 | givens=({is_train: np.cast['int32'](1)} if dropout else {}) 687 | ) 688 | 689 | f_test = theano.function( 690 | inputs=train_inputs, 691 | outputs=cost, 692 | on_unused_input='ignore', 693 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 694 | ) 695 | self.f_test=f_test; 696 | else: 697 | f_train = None 698 | 699 | # Compile evaluation function 700 | if not crf: 701 | f_eval = theano.function( 702 | inputs=eval_inputs, 703 | outputs=tags_scores, 704 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 705 | ) 706 | else: 707 | if sentencesLevelLoss: 708 | def sentVitebe(i,predictTag,scores,transitions,lenVec): 709 | #{{{ 710 | Len=lenVec[i]; 711 | accLen=lenVec[:i].sum(); 712 | currentTagsScores=scores[accLen:accLen+Len]; 713 | currentPredictIds=forward(currentTagsScores, 714 | transitions,viterbi=True, 715 | return_alpha=False, 716 | return_best_sequence=True) ; 717 | predictTag=T.set_subtensor(predictTag[accLen:accLen+Len],currentPredictIds); 718 | return predictTag; 719 | #}}} 720 | predictTag,update=theano.scan(fn=sentVitebe, 721 | outputs_info=T.zeros((tags_scores.shape[0],),dtype='int32'), 722 | sequences=[T.arange(docLen.shape[0])], 723 | non_sequences=[tags_scores,self.transitions,docLen]); 724 | predictTag=predictTag[-1]; 725 | else: 726 | predictTag=forward(tags_scores, self.transitions, 727 | viterbi=True,return_alpha=False, 728 | return_best_sequence=True) 729 | f_eval = theano.function( 730 | inputs=eval_inputs, 731 | outputs=predictTag, 732 | on_unused_input='ignore', 733 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 734 | ) 735 | #f_AttenVisual=theano.function( 736 | # inputs=eval_inputs, 737 | # outputs=[predictTag,self.energy], 738 | # on_unused_input='ignore', 739 | # givens=({is_train: np.cast['int32'](0)} if dropout else {}) 740 | # ) 741 | #self.f_AttenVisual=f_AttenVisual; 742 | 743 | return f_train, f_eval; 744 | #}}} 745 | 746 | def build(self,parameters): 747 | #{{{ 748 | """ 749 | Build the network. 
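The scan helpers above (sentLSTM, sentLoss and sentVitebe) all walk a document that has been concatenated into one long token sequence, using the docLen vector to recover sentence boundaries; a plain-Python sketch of that slicing (the array contents are made up):

import numpy as np

def split_by_lengths(rows, lengths):
    # rows: (total_tokens, dim) array for a whole document
    # lengths: number of tokens in each sentence, in order
    out, start = [], 0
    for n in lengths:
        out.append(rows[start:start + n])
        start += n
    return out

doc = np.zeros((7, 4))                                     # hypothetical 7-token document
print([s.shape for s in split_by_lengths(doc, [3, 4])])    # [(3, 4), (4, 4)]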
750 | """ 751 | #some parameters 752 | dropout=parameters['dropout'] ; 753 | char_dim=parameters['char_dim']; 754 | char_lstm_dim=parameters['char_lstm_dim']; 755 | char_bidirect=parameters['char_bidirect']; 756 | word_dim=parameters['word_dim']; 757 | word_lstm_dim=parameters['word_lstm_dim']; 758 | word_bidirect=parameters['word_bidirect']; 759 | lr_method=parameters['lr_method']; 760 | pre_emb=parameters['pre_emb']; 761 | crf=parameters['crf']; 762 | cap_dim=parameters['cap_dim']; 763 | training=parameters['training']; 764 | features=parameters['features']; 765 | 766 | # Training parameters 767 | n_words = len(self.id_to_word) 768 | n_chars = len(self.id_to_char) 769 | n_tags = len(self.id_to_tag) 770 | self.output_dim = len(self.id_to_tag); 771 | self.transitions = shared((self.output_dim+ 1, self.output_dim ), 'transitions') 772 | 773 | # Number of capitalization features 774 | if cap_dim: 775 | n_cap = 4 776 | 777 | if features is not None and features['lemma']['isUsed']: 778 | lemma_ids=T.ivector(name='lemma_ids'); 779 | if features is not None and features['pos']['isUsed']: 780 | pos_ids=T.ivector(name='pos_ids'); 781 | if features is not None and features['chunk']['isUsed']: 782 | chunk_ids=T.ivector(name='chunk_ids'); 783 | if features is not None and features['NER']['isUsed']: 784 | dic_ids=T.ivector(name='dic_ids'); 785 | 786 | # Network variables 787 | is_train = T.iscalar('is_train') 788 | word_ids = T.ivector(name='word_ids') 789 | char_for_ids = T.imatrix(name='char_for_ids') 790 | char_rev_ids = T.imatrix(name='char_rev_ids') 791 | char_pos_ids = T.ivector(name='char_pos_ids') 792 | tag_ids = T.ivector(name='tag_ids') 793 | if cap_dim: 794 | cap_ids = T.ivector(name='cap_ids') 795 | 796 | # Sentence length 797 | s_len = (word_ids if word_dim else char_pos_ids).shape[0] 798 | 799 | # Final input (all word features) 800 | input_dim = 0 801 | inputs = [] 802 | 803 | # Word inputs 804 | #{{{ 805 | if word_dim: 806 | input_dim += word_dim 807 | word_layer = EmbeddingLayer(n_words, word_dim, name='word_layer') 808 | word_input = word_layer.link(word_ids) 809 | #for attention 810 | inputs.append(word_input) 811 | # Initialize with pretrained embeddings 812 | if pre_emb and training: 813 | new_weights = word_layer.embeddings.get_value() 814 | print 'Loading pretrained embeddings from %s...' % pre_emb 815 | pretrained = {} 816 | emb_invalid = 0 817 | for i, line in enumerate(codecs.open(pre_emb, 'r', 'utf-8')): 818 | line = line.rstrip().split() 819 | if len(line) == word_dim + 1: 820 | pretrained[line[0]] = np.array( 821 | [float(x) for x in line[1:]] 822 | ).astype(np.float32) 823 | else: 824 | emb_invalid += 1 825 | if emb_invalid > 0: 826 | print 'WARNING: %i invalid lines' % emb_invalid 827 | c_found = 0 828 | c_lower = 0 829 | c_zeros = 0 830 | # Lookup table initialization 831 | for i in xrange(n_words): 832 | word = self.id_to_word[i] 833 | if word in pretrained: 834 | new_weights[i] = pretrained[word] 835 | c_found += 1 836 | elif word.lower() in pretrained: 837 | new_weights[i] = pretrained[word.lower()] 838 | c_lower += 1 839 | elif re.sub('\d', '0', word.lower()) in pretrained: 840 | new_weights[i] = pretrained[ 841 | re.sub('\d', '0', word.lower()) 842 | ] 843 | c_zeros += 1 844 | word_layer.embeddings.set_value(new_weights) 845 | print 'Loaded %i pretrained embeddings.' % len(pretrained) 846 | print ('%i / %i (%.4f%%) words have been initialized with ' 847 | 'pretrained embeddings.') % ( 848 | c_found + c_lower + c_zeros, n_words, 849 | 100. 
* (c_found + c_lower + c_zeros) / n_words 850 | ) 851 | print ('%i found directly, %i after lowercasing, ' 852 | '%i after lowercasing + zero.') % ( 853 | c_found, c_lower, c_zeros 854 | )#}}} 855 | 856 | # Chars inputs 857 | #{{{ 858 | if char_dim: 859 | input_dim += char_lstm_dim 860 | char_layer = EmbeddingLayer(n_chars, char_dim, name='char_layer') 861 | 862 | char_lstm_for = LSTM(char_dim, char_lstm_dim, with_batch=True, 863 | name='char_lstm_for') 864 | char_lstm_rev = LSTM(char_dim, char_lstm_dim, with_batch=True, 865 | name='char_lstm_rev') 866 | 867 | char_lstm_for.link(char_layer.link(char_for_ids)) 868 | char_lstm_rev.link(char_layer.link(char_rev_ids)) 869 | 870 | char_for_output = char_lstm_for.h.dimshuffle((1, 0, 2))[ 871 | T.arange(s_len), char_pos_ids 872 | ] 873 | char_rev_output = char_lstm_rev.h.dimshuffle((1, 0, 2))[ 874 | T.arange(s_len), char_pos_ids 875 | ] 876 | 877 | inputs.append(char_for_output) 878 | if char_bidirect: 879 | inputs.append(char_rev_output) 880 | input_dim += char_lstm_dim 881 | #}}} 882 | 883 | # Capitalization feature 884 | # 885 | if cap_dim: 886 | input_dim += cap_dim 887 | cap_layer = EmbeddingLayer(n_cap, cap_dim, name='cap_layer') 888 | inputs.append(cap_layer.link(cap_ids)) 889 | 890 | # Prepare final input 891 | if len(inputs) != 1: 892 | inputs = T.concatenate(inputs, axis=1) 893 | 894 | # 895 | # Dropout on final input 896 | # 897 | if dropout: 898 | dropout_layer = DropoutLayer(p=dropout) 899 | input_train = dropout_layer.link(inputs) 900 | input_test = (1 - dropout) * inputs 901 | inputs = T.switch(T.neq(is_train, 0), input_train, input_test) 902 | 903 | # LSTM for words 904 | word_lstm_for = LSTM(input_dim, word_lstm_dim, with_batch=False, 905 | name='word_lstm_for') 906 | word_lstm_rev = LSTM(input_dim, word_lstm_dim, with_batch=False, 907 | name='word_lstm_rev') 908 | word_lstm_for.link(inputs) 909 | word_lstm_rev.link(inputs[::-1, :]) 910 | word_for_output = word_lstm_for.h 911 | word_rev_output = word_lstm_rev.h[::-1, :] 912 | if word_bidirect: 913 | final_output = T.concatenate( 914 | [word_for_output, word_rev_output], 915 | axis=1 916 | ) 917 | tanh_layer = HiddenLayer(2 * word_lstm_dim, word_lstm_dim, 918 | name='tanh_layer', activation='tanh') 919 | final_output = tanh_layer.link(final_output) 920 | else: 921 | final_output = word_for_output 922 | 923 | # Sentence to Named Entity tags - Score 924 | final_layer = HiddenLayer(word_lstm_dim, n_tags, name='final_layer', 925 | activation=(None if crf else 'softmax')) 926 | tags_scores = final_layer.link(final_output) 927 | 928 | # No CRF 929 | if not crf: 930 | cost = T.nnet.categorical_crossentropy(tags_scores, tag_ids).mean() 931 | # CRF 932 | else: 933 | 934 | #all_paths_scores = forward(observations, self.transitions) 935 | #cost = - (self.modelScore(tag_ids,tags_scores,s_len) - all_paths_scores) 936 | #real_path_score=self.modelScore(tag_ids,tags_scores,tag_ids.shape[0]) ; 937 | #error=real_path_score+self.noiseLoss(tags_scores,tag_ids,0.5); 938 | #cost=-error; 939 | #cost=self.likehoodLoss(tags_scores,tag_ids,observations,2) 940 | 941 | real_path_score = tags_scores[T.arange(s_len), tag_ids].sum() 942 | 943 | # Score from transitions 944 | padded_tags_ids = T.concatenate([[n_tags], tag_ids], axis=0) 945 | real_path_score += self.transitions[ 946 | padded_tags_ids[T.arange(s_len )], 947 | padded_tags_ids[T.arange(s_len ) + 1] 948 | ].sum() 949 | 950 | all_paths_scores = forward(tags_scores, self.transitions) 951 | cost = - (real_path_score - all_paths_scores) 952 | 953 | # 
Network parameters 954 | params = [] 955 | if word_dim: 956 | self.add_component(word_layer) 957 | params.extend(word_layer.params) 958 | if char_dim: 959 | self.add_component(char_layer) 960 | self.add_component(char_lstm_for) 961 | params.extend(char_layer.params) 962 | params.extend(char_lstm_for.params) 963 | if char_bidirect: 964 | self.add_component(char_lstm_rev) 965 | params.extend(char_lstm_rev.params) 966 | self.add_component(word_lstm_for) 967 | params.extend(word_lstm_for.params) 968 | if word_bidirect: 969 | self.add_component(word_lstm_rev) 970 | params.extend(word_lstm_rev.params) 971 | if cap_dim: 972 | self.add_component(cap_layer) 973 | params.extend(cap_layer.params) 974 | self.add_component(final_layer) 975 | params.extend(final_layer.params) 976 | if crf: 977 | self.add_component(self.transitions) 978 | params.append(self.transitions) 979 | if word_bidirect: 980 | self.add_component(tanh_layer) 981 | params.extend(tanh_layer.params) 982 | 983 | # Prepare train and eval inputs 984 | eval_inputs = [] 985 | if word_dim: 986 | eval_inputs.append(word_ids) 987 | if char_dim: 988 | eval_inputs.append(char_for_ids) 989 | if char_bidirect: 990 | eval_inputs.append(char_rev_ids) 991 | eval_inputs.append(char_pos_ids) 992 | if cap_dim: 993 | eval_inputs.append(cap_ids) 994 | train_inputs = eval_inputs + [tag_ids] 995 | 996 | # Parse optimization method parameters 997 | if "-" in lr_method: 998 | lr_method_name = lr_method[:lr_method.find('-')] 999 | lr_method_parameters = {} 1000 | for x in lr_method[lr_method.find('-') + 1:].split('-'): 1001 | split = x.split('_') 1002 | assert len(split) == 2 1003 | lr_method_parameters[split[0]] = float(split[1]) 1004 | else: 1005 | lr_method_name = lr_method 1006 | lr_method_parameters = {} 1007 | 1008 | # Compile training function 1009 | print 'Compiling...' 1010 | if training: 1011 | import optimizers ; 1012 | self.optimizer=optimizers.RMSprop(lr=0.001); 1013 | updates = Optimization(clip=5.0).get_updates(lr_method_name, cost, params, **lr_method_parameters) 1014 | self.constraints={}; 1015 | #updates = self.optimizer.get_updates(params,self.constraints,cost); 1016 | f_train = theano.function( 1017 | inputs=train_inputs, 1018 | outputs=cost, 1019 | updates=updates, 1020 | givens=({is_train: np.cast['int32'](1)} if dropout else {}) 1021 | ) 1022 | #for debug 1023 | #f_Debug = theano.function( 1024 | # inputs=train_inputs, 1025 | # outputs=cost, 1026 | # updates=self.update, 1027 | # givens=({is_train: np.cast['int32'](1)} if dropout else {}) 1028 | #) 1029 | #debug end 1030 | else: 1031 | f_train = None 1032 | 1033 | # Compile evaluation function 1034 | if not crf: 1035 | f_eval = theano.function( 1036 | inputs=eval_inputs, 1037 | outputs=tags_scores, 1038 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 1039 | ) 1040 | else: 1041 | f_eval = theano.function( 1042 | inputs=eval_inputs, 1043 | outputs=forward(tags_scores, self.transitions, viterbi=True, 1044 | return_alpha=False, return_best_sequence=True), 1045 | givens=({is_train: np.cast['int32'](0)} if dropout else {}) 1046 | ) 1047 | 1048 | return f_train, f_eval 1049 | #}}} 1050 | -------------------------------------------------------------------------------- /src/nn.py: -------------------------------------------------------------------------------- 1 | import theano 2 | import theano.tensor as T 3 | from utils import shared 4 | import numpy as np 5 | 6 | class HiddenLayer(object): 7 | #{{{ 8 | """ 9 | Hidden layer with or without bias. 
10 | Input: tensor of dimension (dims*, input_dim) 11 | Output: tensor of dimension (dims*, output_dim) 12 | """ 13 | def __init__(self, input_dim, output_dim, bias=True, activation='sigmoid', 14 | name='hidden_layer'): 15 | self.input_dim = input_dim 16 | self.output_dim = output_dim 17 | self.bias = bias 18 | self.name = name 19 | if activation is None: 20 | self.activation = None 21 | elif activation == 'tanh': 22 | self.activation = T.tanh 23 | elif activation == 'sigmoid': 24 | self.activation = T.nnet.sigmoid 25 | elif activation == 'softmax': 26 | self.activation = T.nnet.softmax 27 | else: 28 | raise Exception("Unknown activation function: " % activation) 29 | 30 | # Initialize weights and bias 31 | self.weights = shared((input_dim, output_dim), name + '_weights') 32 | self.bias = shared((output_dim,), name + '_bias') 33 | 34 | # Define parameters 35 | if self.bias: 36 | self.params = [self.weights, self.bias] 37 | else: 38 | self.params = [self.weights] 39 | 40 | def link(self, input): 41 | """ 42 | The input has to be a tensor with the right 43 | most dimension equal to input_dim. 44 | """ 45 | self.input = input 46 | self.linear_output = T.dot(self.input, self.weights) 47 | if self.bias: 48 | self.linear_output = self.linear_output + self.bias 49 | if self.activation is None: 50 | self.output = self.linear_output 51 | else: 52 | self.output = self.activation(self.linear_output) 53 | return self.output 54 | #}}} 55 | 56 | class EmbeddingLayer(object): 57 | #{{{ 58 | """ 59 | Embedding layer: word embeddings representations 60 | Input: tensor of dimension (dim*) with values in range(0, input_dim) 61 | Output: tensor of dimension (dim*, output_dim) 62 | """ 63 | 64 | def __init__(self, input_dim, output_dim, name='embedding_layer'): 65 | """ 66 | Typically, input_dim is the vocabulary size, 67 | and output_dim the embedding dimension. 68 | """ 69 | self.input_dim = input_dim 70 | self.output_dim = output_dim 71 | self.name = name 72 | 73 | # Randomly generate weights 74 | self.embeddings = shared((input_dim, output_dim), 75 | self.name + '__embeddings') 76 | 77 | # Define parameters 78 | self.params = [self.embeddings] 79 | 80 | def link(self, input): 81 | """ 82 | Return the embeddings of the given indexes. 83 | Input: tensor of shape (dim*) 84 | Output: tensor of shape (dim*, output_dim) 85 | """ 86 | self.input = input 87 | self.output = self.embeddings[self.input] 88 | return self.output 89 | #}}} 90 | 91 | 92 | class DropoutLayer(object): 93 | #{{{ 94 | """ 95 | Dropout layer. Randomly set to 0 values of the input 96 | with probability p. 97 | """ 98 | def __init__(self, p=0.5, name='dropout_layer'): 99 | """ 100 | p has to be between 0 and 1 (1 excluded). 101 | p is the probability of dropping out a unit, so 102 | setting p to 0 is equivalent to have an identity layer. 103 | """ 104 | assert 0. <= p < 1. 105 | self.p = p 106 | self.rng = T.shared_randomstreams.RandomStreams(seed=123456) 107 | self.name = name 108 | 109 | def link(self, input): 110 | """ 111 | Dropout link: we just apply mask to the input. 
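The dropout convention used in DropoutLayer above, combined with the `input_test = (1 - dropout) * inputs` lines in model.py, keeps activations unscaled while training and rescales them by 1 - p at test time so that the expected value matches; a small numpy check (the p value is arbitrary):

import numpy as np

p = 0.5                                    # arbitrary dropout probability
x = np.ones(100000)
mask = np.random.binomial(1, 1 - p, size=x.shape)
print((x * mask).mean())                   # close to 1 - p (training-time masking)
print(((1 - p) * x).mean())                # exactly 1 - p (test-time rescaling)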
112 | """ 113 | if self.p > 0: 114 | mask = self.rng.binomial(n=1, p=1-self.p, size=input.shape, 115 | dtype=theano.config.floatX) 116 | self.output = input * mask 117 | else: 118 | self.output = input 119 | 120 | return self.output 121 | #}}} 122 | 123 | from keras import activations; 124 | from keras import backend as K; 125 | from keras import initializers as initializations; 126 | 127 | class Layer(object): 128 | def __init__(self): 129 | self.build(); 130 | return; 131 | def build(self): 132 | return; 133 | 134 | class Convolution1D(Layer): 135 | #{{{ 136 | def __init__(self,nb_filter,filter_length,input_dim,init='glorot_uniform', 137 | activation=None,border_mode='valid',subsample_length=1, 138 | bias=True, 139 | name='Convolution1D'): 140 | #{{{ 141 | self.nb_filter = nb_filter 142 | self.filter_length = filter_length 143 | self.init = initializations.get(init, dim_ordering='th') 144 | self.activation = activations.get(activation) 145 | self.border_mode = border_mode 146 | self.subsample_length = subsample_length 147 | 148 | self.subsample = (subsample_length, 1) 149 | self.bias=bias; 150 | self.input_dim = input_dim 151 | self.name=name; 152 | 153 | super(Convolution1D,self).__init__(); 154 | #}}} 155 | def build(self): 156 | #{{{ 157 | self.W_shape=(self.filter_length,1,self.input_dim,self.nb_filter); 158 | 159 | self.W=self.init(self.W_shape,name='{}_W'.format(self.name)); 160 | if self.bias: 161 | init=initializations.get('zero'); 162 | self.b=init((self.nb_filter,), 163 | name='{}_b'.format(self.name)); 164 | 165 | self.params=[self.W,self.b]; 166 | #}}} 167 | def call(self,x): 168 | #{{{ 169 | x=K.expand_dims(x,0); 170 | x=K.expand_dims(x,2); 171 | output=K.conv2d(x,self.W,strides=self.subsample, 172 | border_mode=self.border_mode, 173 | dim_ordering='tf'); 174 | output=K.squeeze(output,2); 175 | if self.bias: 176 | output+=K.reshape(self.b,(1,1,self.nb_filter)); 177 | output=self.activation(output); 178 | output=K.squeeze(output,0); 179 | return output; 180 | #}}} 181 | #}}} 182 | 183 | class LSTM(object): 184 | #{{{ 185 | #{{{ 186 | """ 187 | Long short-term memory (LSTM). Can be used with or without batches. 188 | Without batches: 189 | Input: matrix of dimension (sequence_length, input_dim) 190 | Output: vector of dimension (output_dim) 191 | With batches: 192 | Input: tensor3 of dimension (batch_size, sequence_length, input_dim) 193 | Output: matrix of dimension (batch_size, output_dim) 194 | """ 195 | #}}} 196 | def __init__(self, input_dim, output_dim, with_batch=True, 197 | activation='tanh',inner_activation='hard_sigmoid', 198 | name='LSTM'): 199 | #{{{ 200 | """ 201 | Initialize neural network. 
202 | """ 203 | self.input_dim = input_dim 204 | self.output_dim = output_dim; 205 | self.with_batch = with_batch 206 | self.name = name 207 | self.inner_activation=activations.get(inner_activation); 208 | self.activation=activations.get(activation); 209 | self.build(); 210 | #}}} 211 | def build(self): 212 | #{{{ 213 | self.W=shared((self.input_dim,self.output_dim*3),name='{}_W'.format(self.name)); 214 | self.U=shared((self.output_dim,self.output_dim*3),name='{}_U'.format(self.name)); 215 | self.w_ci = shared((self.output_dim, self.output_dim), name='{}_w_ci'.format(self.name) ) 216 | self.w_co = shared((self.output_dim, self.output_dim), name='{}_w_co'.format(self.name) ) 217 | self.b=shared((self.output_dim*3,),name='{}_b'.format(self.name)); 218 | self.c_0 = shared((self.output_dim,), name='{}_c_0'.format(self.name) ) 219 | self.h_0 = shared((self.output_dim,), name='{}_h_0'.format(self.name) ) 220 | self.params=[self.W,self.U, 221 | self.w_ci,self.w_co,self.b, 222 | self.c_0,self.h_0, 223 | ]; 224 | #}}} 225 | 226 | def get_initial_states(self, x): 227 | # build an all-zero tensor of shape (samples, output_dim) 228 | initial_state = K.zeros_like(x) # (samples, timesteps, input_dim) 229 | initial_state = K.sum(initial_state, axis=(1, 2)) # (samples,) 230 | initial_state = K.expand_dims(initial_state) # (samples, 1) 231 | initial_state = K.tile(initial_state, [1, self.output_dim]) # (samples, output_dim) 232 | initial_states = [initial_state for _ in range(len(self.states))] 233 | return initial_states 234 | def step(self,x, h_tm1,c_tm1): 235 | #{{{ 236 | z=T.dot(x,self.W)+T.dot(h_tm1,self.U)+self.b; 237 | if self.with_batch: 238 | z_i=z[:,:self.output_dim]; 239 | z_c=z[:,self.output_dim:2*self.output_dim]; 240 | z_o=z[:,2*self.output_dim:]; 241 | else: 242 | z_i=z[:self.output_dim]; 243 | z_c=z[self.output_dim:2*self.output_dim]; 244 | z_o=z[2*self.output_dim:]; 245 | 246 | i_t = self.inner_activation(z_i + 247 | T.dot(c_tm1, self.w_ci)) 248 | # f_t = T.nnet.sigmoid(T.dot(x_t, self.w_xf) + 249 | # T.dot(h_tm1, self.w_hf) + 250 | # T.dot(c_tm1, self.w_cf) + 251 | # self.b_f) 252 | c_t = (1 - i_t) * c_tm1 + i_t * self.activation(z_c) 253 | o_t = self.inner_activation(z_o + 254 | T.dot(c_t, self.w_co)) 255 | h_t = o_t * self.activation(c_t) 256 | return h_t,c_t 257 | #}}} 258 | def link(self, input): 259 | #{{{ 260 | """ 261 | Propagate the input through the network and return the last hidden 262 | vector. The whole sequence is also accessible via self.h, but 263 | where self.h of shape (sequence_length, batch_size, output_dim) 264 | """ 265 | 266 | # If we use batches, we have to permute the first and second dimension. 267 | if self.with_batch: 268 | self.input = input.dimshuffle(1, 0, 2) 269 | initial_states = [T.alloc(x, self.input.shape[1], self.output_dim) 270 | for x in [self.h_0, self.c_0]] 271 | else: 272 | self.input = input 273 | initial_states = [self.h_0, self.c_0] 274 | 275 | [h,c], _ = theano.scan( 276 | fn=self.step, 277 | sequences=self.input, 278 | outputs_info=initial_states, 279 | ) 280 | self.h = h 281 | self.c=c 282 | self.output = h[-1] 283 | 284 | return self.output 285 | #}}} 286 | #}}} 287 | 288 | class LSTM_normal(object): 289 | #{{{ 290 | #{{{ 291 | """ 292 | Long short-term memory (LSTM). Can be used with or without batches. 
293 | Without batches: 294 | Input: matrix of dimension (sequence_length, input_dim) 295 | Output: vector of dimension (output_dim) 296 | With batches: 297 | Input: tensor3 of dimension (batch_size, sequence_length, input_dim) 298 | Output: matrix of dimension (batch_size, output_dim) 299 | """ 300 | #}}} 301 | def __init__(self, input_dim, output_dim, with_batch=True, 302 | activation='tanh',inner_activation='hard_sigmoid', 303 | name='LSTM_normal'): 304 | #{{{ 305 | """ 306 | Initialize neural network. 307 | """ 308 | self.input_dim = input_dim 309 | self.output_dim = output_dim; 310 | self.with_batch = with_batch 311 | self.name = name 312 | self.inner_activation=activations.get(inner_activation); 313 | self.forget_bias_init = initializations.get('one') 314 | self.activation=activations.get(activation); 315 | self.build(); 316 | #}}} 317 | def build(self): 318 | #{{{ 319 | import numpy as np; 320 | self.W = shared((self.input_dim, 4 * self.output_dim), 321 | name='{}_W'.format(self.name)) 322 | self.U = shared((self.output_dim, 4 * self.output_dim), 323 | name='{}_U'.format(self.name)) 324 | 325 | self.b = K.variable(np.hstack((np.zeros(self.output_dim), 326 | K.get_value(self.forget_bias_init( 327 | (self.output_dim,))), 328 | np.zeros(self.output_dim), 329 | np.zeros(self.output_dim))), 330 | name='{}_b'.format(self.name)) 331 | #self.c_0 = shared((self.output_dim,), name='{}_c_0'.format(self.name) ) 332 | #self.h_0 = shared((self.output_dim,), name='{}_h_0'.format(self.name) ) 333 | self.c_0=np.zeros(self.output_dim).astype(theano.config.floatX); 334 | self.h_0=np.zeros(self.output_dim).astype(theano.config.floatX); 335 | self.params=[self.W,self.U, 336 | self.b, 337 | # self.c_0,self.h_0 338 | ]; 339 | #}}} 340 | def step(self,x, h_tm1,c_tm1): 341 | #{{{ 342 | z = K.dot(x , self.W) + K.dot(h_tm1 , self.U) + self.b 343 | if self.with_batch: 344 | z0 = z[:,:self.output_dim] 345 | z1 = z[:,self.output_dim: 2 * self.output_dim] 346 | z2 = z[:,2 * self.output_dim: 3 * self.output_dim] 347 | z3 = z[:,3 * self.output_dim:] 348 | else: 349 | z0 = z[:self.output_dim] 350 | z1 = z[self.output_dim: 2 * self.output_dim] 351 | z2 = z[2 * self.output_dim: 3 * self.output_dim] 352 | z3 = z[3 * self.output_dim:] 353 | 354 | 355 | i = self.inner_activation(z0) 356 | f = self.inner_activation(z1) 357 | c = f * c_tm1 + i * self.activation(z2) 358 | o = self.inner_activation(z3) 359 | h=o*self.activation(c); 360 | return h,c; 361 | #}}} 362 | 363 | def link(self, input): 364 | #{{{ 365 | """ 366 | Propagate the input through the network and return the last hidden 367 | vector. The whole sequence is also accessible via self.h, but 368 | where self.h of shape (sequence_length, batch_size, output_dim) 369 | """ 370 | 371 | # If we use batches, we have to permute the first and second dimension. 
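The step() above is the standard LSTM cell with one fused projection of width 4 * output_dim, split into input, forget, candidate and output blocks (the forget-gate bias is initialised to one in build()). A minimal NumPy sketch of a single step, using a plain logistic sigmoid in place of hard_sigmoid and placeholder sizes (illustrative only):

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def lstm_step(x, h_prev, c_prev, W, U, b, n):
    # one fused affine transform, then the usual gate arithmetic
    z = x.dot(W) + h_prev.dot(U) + b
    i, f = sigmoid(z[:n]), sigmoid(z[n:2 * n])
    c_hat, o = np.tanh(z[2 * n:3 * n]), sigmoid(z[3 * n:])
    c = f * c_prev + i * c_hat
    h = o * np.tanh(c)
    return h, c

n_in, n_out = 3, 4
rng = np.random.RandomState(0)
W, U = rng.randn(n_in, 4 * n_out), rng.randn(n_out, 4 * n_out)
b = np.zeros(4 * n_out)
h, c = lstm_step(rng.randn(n_in), np.zeros(n_out), np.zeros(n_out), W, U, b, n_out)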
372 | self.input = input 373 | if self.with_batch: 374 | self.input = input.dimshuffle(1, 0, 2) 375 | initial_states = [T.alloc(x, self.input.shape[1], self.output_dim) 376 | for x in [self.h_0, self.c_0]] 377 | else: 378 | self.input = input 379 | initial_states = [self.h_0, self.c_0] 380 | step_function=self.step; 381 | 382 | [h,c], _ = theano.scan( 383 | fn=step_function, 384 | sequences=self.input, 385 | outputs_info=initial_states, 386 | ) 387 | self.h = h 388 | self.output = h[-1] 389 | 390 | return self.output 391 | #}}} 392 | #}}} 393 | 394 | class AttentionLSTM(LSTM): 395 | def build(self): 396 | #{{{ 397 | super(AttentionLSTM,self).build() ; 398 | self.W_A=shared((self.input_dim+self.output_dim,1),name='{}_W_A'.format(self.name)); 399 | self.b_A=shared((1,),name='{}_b_A'.format(self.name)); 400 | self.params+=[self.W_A,self.b_A]; 401 | #}}} 402 | def step(self, h_tm1,c_tm1,x): 403 | #{{{ 404 | assert x.ndim==2; 405 | H=x; 406 | input_length=x.shape[0]; 407 | C=T.repeat(c_tm1.reshape((1,-1)),input_length,axis=0); 408 | _HC=K.concatenate([H,C]); 409 | energy=T.dot(_HC,self.W_A.reshape((-1,1)))+self.b_A; 410 | energy=K.softmax(energy.reshape((1,-1))); 411 | x=(H*energy.reshape((-1,1))).sum(axis=0) 412 | 413 | h_t,c_t=super(AttentionLSTM,self).step_noBatch(x,h_tm1,c_tm1); 414 | return h_t,c_t 415 | #}}} 416 | def link(self, input): 417 | #{{{ 418 | """ 419 | Propagate the input through the network and return the last hidden 420 | vector. The whole sequence is also accessible via self.h, but 421 | where self.h of shape (sequence_length, batch_size, output_dim) 422 | """ 423 | 424 | # If we use batches, we have to permute the first and second dimension. 425 | if self.with_batch: 426 | assert 0,"AttentionLSTM not implement with_batch"; 427 | else: 428 | self.input=input; 429 | initial_states = [self.h_0, self.c_0] 430 | 431 | step_function=self.step; 432 | 433 | [h,c], _ = theano.scan( 434 | fn=step_function, 435 | outputs_info=initial_states, 436 | non_sequences=[self.input], 437 | n_steps=self.input.shape[0] 438 | ) 439 | self.h = h 440 | self.output = self.h[-1] 441 | 442 | return self.output 443 | #}}} 444 | 445 | class AttentionLSTM2(AttentionLSTM): 446 | #{{{ 447 | def __init__(self,attended_dim,wordInput_dim, 448 | combineOput_dim,output_dim, **kwargs): 449 | #{{{ 450 | self.attendedInput_dim=attended_dim; 451 | self.wordInput_dim=wordInput_dim; 452 | self.combineOput_dim=combineOput_dim; 453 | super(AttentionLSTM2, self).__init__(output_dim=output_dim, 454 | input_dim=combineOput_dim, 455 | **kwargs) 456 | #}}} 457 | def build(self): 458 | #{{{ 459 | if self.input_dim is None: 460 | self.input_dim=self.combineOput_dim; 461 | super(AttentionLSTM,self).build() ; 462 | #attention weight 463 | self.W_A=shared((self.attendedInput_dim+self.output_dim,1),name='{}_W_A'.format(self.name)); 464 | self.b_A=shared((1,),name='{}_b_A'.format(self.name)); 465 | 466 | #combine weight 467 | self.W_combine=shared((self.attendedInput_dim+self.wordInput_dim, 468 | self.combineOput_dim), 469 | name='{}_W_combine'.format(self.name)); 470 | self.b_combine=shared((self.combineOput_dim,), 471 | name='{}_b_combine'.format(self.name)); 472 | self.params+=[self.W_A,self.b_A]; 473 | self.params+=[self.W_combine,self.b_combine]; 474 | 475 | #}}} 476 | def step(self, word,h_tm1,c_tm1,x): 477 | #{{{ 478 | H=x; 479 | input_length=x.shape[0]; 480 | C=T.repeat(c_tm1.reshape((1,-1)),input_length,axis=0); 481 | _HC=K.concatenate([H,C]); 482 | energy=T.dot(_HC,self.W_A.reshape((-1,1)))+self.b_A; 483 | 
energy=K.softmax(energy.reshape((1,-1))); 484 | x=(H*energy.reshape((-1,1))).sum(axis=0) 485 | 486 | #combine glimpsed with word; 487 | combine=K.concatenate([x,word]); 488 | combined=K.dot(combine,self.W_combine)+self.b_combine; 489 | #original LSTM step 490 | h_t,c_t=super(AttentionLSTM,self).step_noBatch(combined,h_tm1,c_tm1); 491 | return h_t,c_t 492 | #}}} 493 | def link(self, input,words): 494 | #{{{ 495 | """ 496 | Propagate the input through the network and return the last hidden 497 | vector. The whole sequence is also accessible via self.h, but 498 | where self.h of shape (sequence_length, batch_size, output_dim) 499 | """ 500 | 501 | # If we use batches, we have to permute the first and second dimension. 502 | if self.with_batch: 503 | assert 0,"AttentionLSTM not implement with_batch"; 504 | else: 505 | self.input = input 506 | initial_states = [self.h_0, self.c_0] 507 | 508 | step_function=self.step; 509 | 510 | [h,c], _ = theano.scan( 511 | fn=step_function, 512 | sequences=[words], 513 | outputs_info=initial_states, 514 | non_sequences=[self.input], 515 | ) 516 | self.h = h 517 | self.output = h[-1] 518 | 519 | return self.output 520 | #}}} 521 | 522 | #}}} 523 | 524 | class AttentionLSTM3(LSTM): 525 | #{{{ 526 | def __init__(self,attended_dim,wordInput_dim, 527 | output_dim,mode='concat', **kwargs): 528 | #{{{ 529 | self.attendedInput_dim=attended_dim; 530 | self.wordInput_dim=wordInput_dim; 531 | self.attendedMode=mode; 532 | self.init=initializations.get('glorot_uniform'); 533 | super(AttentionLSTM3, self).__init__(output_dim=output_dim, 534 | input_dim=attended_dim+wordInput_dim, 535 | **kwargs) 536 | #}}} 537 | def build(self): 538 | #{{{ 539 | if self.input_dim is None: 540 | self.input_dim=self.combineOput_dim; 541 | super(AttentionLSTM3,self).build() ; 542 | #attention weight 543 | self.W_A_X=shared((self.attendedInput_dim,self.output_dim), 544 | name='{}_W_A_X'); 545 | #self.b_A_X=shared((self.output_dim,), 546 | # name='{}_b_A_X'); 547 | self.W_A_h=shared((self.output_dim,self.output_dim), 548 | name='{}_W_A_h'); 549 | #self.b_A_h=shared((self.output_dim,), 550 | # name='{}_b_A_h'); 551 | self.W_A=self.init((self.output_dim,),name='{}_W_A'.format(self.name)); 552 | #self.b_A=shared((1,),name='{}_b_A'.format(self.name)); 553 | self.params+=[self.W_A_X, 554 | #self.b_A_X, 555 | self.W_A_h, 556 | #self.b_A_h, 557 | self.W_A, 558 | #self.b_A, 559 | ]; 560 | 561 | 562 | #}}} 563 | def step(self, word,index,energy_tm1,h_tm1,c_tm1,x): 564 | #{{{ 565 | #attention 566 | H=x; 567 | if self.attendedMode is "concat": 568 | M_X=T.dot(x,self.W_A_X)#+self.b_A_X; 569 | M_state=T.dot(self.W_A_h,c_tm1)#+self.b_A_h; 570 | M=T.tanh(M_X+M_state) 571 | _energy=T.dot(M,self.W_A.T)#+self.b_A; 572 | elif self.attendedMode is "dot": 573 | energy=None; 574 | assert 0,"not implement"; 575 | elif self.attendedMode is "general": 576 | M_X=T.dot(x,self.W_A_X)#+self.b_A_X; 577 | M_state=T.dot(self.W_A_h,c_tm1)#+self.b_A_h; 578 | M=T.tanh(M_X*M_state); 579 | _energy=T.dot(M,self.W_A.T)#+self.b_A; 580 | #mask 581 | mask=T.zeros((1,x.shape[0]),dtype=theano.config.floatX); 582 | energy=T.nnet.softmax(_energy[:index+1]); 583 | masked_energy=T.set_subtensor(mask[0,:index+1],energy.flatten()); 584 | glimpsed=(masked_energy.T*H).sum(axis=0) 585 | #combine glimpsed with word; 586 | if self.wordInput_dim==0: 587 | combined=glimpsed; 588 | else: 589 | combine=K.concatenate([glimpsed,word]); 590 | combined=combine; 591 | #original LSTM step 592 | h_t,c_t=super(AttentionLSTM3,self).step(combined,h_tm1,c_tm1); 
593 | return masked_energy.flatten(),h_t,c_t 594 | #}}} 595 | def link(self, input,words): 596 | #{{{ 597 | """ 598 | Propagate the input through the network and return the last hidden 599 | vector. The whole sequence is also accessible via self.h, but 600 | where self.h of shape (sequence_length, batch_size, output_dim) 601 | """ 602 | 603 | # If we use batches, we have to permute the first and second dimension. 604 | if self.with_batch: 605 | assert 0,"AttentionLSTM not implement with_batch"; 606 | else: 607 | self.input = input 608 | initial_states = [self.h_0, self.c_0] 609 | 610 | step_function=self.step; 611 | 612 | [e,h,c], _ = theano.scan( 613 | fn=step_function, 614 | sequences=[words,T.arange(words.shape[0])], 615 | outputs_info=[T.zeros((input.shape[0],), 616 | dtype=theano.config.floatX)]+initial_states, 617 | non_sequences=[self.input], 618 | ) 619 | self.h = h 620 | self.output = h[-1] 621 | self.e=e; 622 | self.c=c; 623 | return self.output 624 | #}}} 625 | 626 | #}}} 627 | 628 | class AttentionLayer(Layer): 629 | def __init__(self,attended_dim,state_dim, 630 | source_dim,scoreFunName='Euclidean', 631 | atten_activation='tanh',name='AttentionLayer'): 632 | #{{{ 633 | self.attended_dim=attended_dim; 634 | self.state_dim=state_dim; 635 | self.source_dim=source_dim; 636 | self.init=initializations.get('glorot_uniform'); 637 | self.name=name; 638 | self.one_init=initializations.get('one'); 639 | self.atten_activation=activations.get(atten_activation); 640 | self.scoreFunName=scoreFunName; 641 | self.eps=1e-5; 642 | #self.source_dim=glimpsed_dim; 643 | super(AttentionLayer,self).__init__(); 644 | #}}} 645 | def euclideanScore(self,attended,state,W): 646 | #{{{ 647 | #Euclidean distance 648 | M=(attended-state)**2; 649 | M=T.dot(M,W); 650 | _energy=M.max()-M; 651 | return _energy; 652 | #}}} 653 | def manhattenScore(self,attended,state,W): 654 | #{{{ 655 | #Manhattan Distance 656 | #eps for avoid gradient to be NaN; 657 | M=T.abs_(T.maximum(attended-state,self.eps)); 658 | M=T.dot(M,W); 659 | _energy=M.max()-M; 660 | return _energy; 661 | #}}} 662 | def bilinearScore(self,attended,state,W): 663 | #{{{ 664 | #Bilinear function 665 | M=(attended*state*W).sum(axis=-1); 666 | _energy=self.atten_activation(M); 667 | return _energy; 668 | #}}} 669 | def forwardNNScore(self,attended,state,W): 670 | #{{{ 671 | #get weights 672 | W_1=W[:(self.attended_dim+self.state_dim)*self.state_dim]; 673 | W_1=W_1.reshape((self.attended_dim+self.state_dim,self.state_dim)); 674 | W_2=W[(self.attended_dim+self.state_dim)*self.state_dim:]; 675 | 676 | #forward neural network 677 | state_=T.repeat(state.reshape((1,-1)),attended.shape[0],axis=0); 678 | input=T.concatenate([attended,state_],axis=-1); 679 | M1=self.atten_activation(T.dot(input,W_1)); 680 | M2=self.atten_activation(T.dot(M1,W_2)); 681 | _energy=M2; 682 | return _energy; 683 | #}}} 684 | def CNNScore(self,attended,state,W): 685 | #{{{ 686 | state_=T.repeat(state.reshape((1,-1)),attended.shape[0],axis=0); 687 | input=T.concatenate([attended,state_],axis=-1); 688 | M1=self.CNN1.call(input); 689 | M2=self.CNN2.call(M1); 690 | _energy=M2.flatten(); 691 | return _energy; 692 | #}}} 693 | def CosineScore(self,attended,state,W): 694 | #{{{ 695 | dotProduct=T.dot(attended,state.T); 696 | Al2Norm=T.sqrt((attended**2).sum(axis=-1)); 697 | Bl2Norm=T.sqrt((state**2).sum(axis=-1)); 698 | M=dotProduct/(Al2Norm*Bl2Norm); 699 | _energy=T.exp(M+2); 700 | return _energy; 701 | #}}} 702 | def vanilaScore(self,attended,state,W): 703 | """ 704 | the origin score 
proprosed by Bahdanau 2015 705 | """ 706 | 707 | def build(self): 708 | #{{{ 709 | self.W_A_X=shared((self.attended_dim,self.attended_dim), 710 | name='{}_W_A_X'.format(self.name)); 711 | self.b_A_X=shared((self.attended_dim,), 712 | name='{}_W_A_b'.format(self.name)); 713 | self.W_A_h=shared((self.attended_dim,self.attended_dim), 714 | name='{}_W_A_h'.format(self.name)); 715 | self.W_A_combine=shared((self.source_dim*2, 716 | self.source_dim), 717 | name='{}_W_A_combine'.format(self.name)); 718 | self.b_A_combine=shared((self.source_dim,), 719 | name='{}_b_A_combine'.format(self.name)) 720 | #self.W_A_combine=shared((self.source_dim, 721 | # self.source_dim), 722 | # name='{}_W_A_combine'.format(self.name)); 723 | #self.b_A_combine=shared((self.source_dim,), 724 | # name='{}_b_A_combine'.format(self.name)) 725 | #use constraint 726 | self.constraints={} 727 | 728 | self.params=[ 729 | self.W_A_X,self.b_A_X, 730 | # self.W_A_h, 731 | self.W_A_combine,self.b_A_combine 732 | ]; 733 | 734 | #for attention weight and score function 735 | if self.scoreFunName == "Euclidean": 736 | #{{{ 737 | self.W_A=shared((self.state_dim,), 738 | name='{}_W_A'.format(self.name)); 739 | self.W_A.set_value(np.ones((self.state_dim,),dtype=theano.config.floatX)); 740 | self.constraints[self.W_A]=self.NonNegConstraint; 741 | self.scoreFun=self.euclideanScore; 742 | self.params.append(self.W_A); 743 | #}}} 744 | elif self.scoreFunName == "Bilinear": 745 | #{{{ 746 | assert self.attended_dim==self.state_dim,"in Bilinear score function,"\ 747 | " attended_dim must be equal to state_dim" 748 | self.W_A=self.init((self.state_dim,), 749 | name="{}_W_A".format(self.name)); 750 | self.scoreFun=self.bilinearScore; 751 | self.params.append(self.W_A); 752 | #}}} 753 | elif self.scoreFunName == "forwardNN": 754 | #{{{ 755 | #this is two layer NN 756 | #first layer (attended_dim+state_dim,state_dim); 757 | #second layer (state_dim,1); 758 | self.W_A=shared(((self.attended_dim+self.state_dim)\ 759 | *self.state_dim+self.state_dim,), 760 | name="{}_W_A".format(self.name)); 761 | self.scoreFun=self.forwardNNScore; 762 | self.params.append(self.W_A); 763 | #}}} 764 | elif self.scoreFunName == "CNN": 765 | #{{{ 766 | #this if one layer CNN and pool layer; 767 | nb_filter=(self.attended_dim+self.state_dim)/2; 768 | filter_length=3; 769 | input_dim=self.attended_dim+self.state_dim; 770 | self.CNN1=Convolution1D(nb_filter=nb_filter, 771 | filter_length=filter_length, 772 | input_dim=input_dim,activation='tanh', 773 | border_mode='same'); 774 | self.CNN2=Convolution1D(nb_filter=1, 775 | filter_length=filter_length, 776 | input_dim=nb_filter,activation='tanh', 777 | border_mode='same'); 778 | self.W_A=self.CNN1.W; 779 | self.scoreFun=self.CNNScore; 780 | self.params.append(self.W_A); 781 | self.params.append(self.CNN2.W); 782 | #}}} 783 | elif self.scoreFunName == "Cosine": 784 | #{{{ 785 | self.scoreFun=self.CosineScore; 786 | self.W_A=None; 787 | #}}} 788 | elif self.scoreFunName == "Manhatten": 789 | #{{{ 790 | self.scoreFun=self.manhattenScore; 791 | self.W_A=self.one_init((self.state_dim,), 792 | name='{}_W_A'.format(self.name)); 793 | self.constraints[self.W_A]=self.NonNegConstraint; 794 | self.params.append(self.W_A); 795 | #}}} 796 | else: 797 | assert 0, "we only have Euclidean, Bilinear, forwardNN"\ 798 | " score function for attention"; 799 | 800 | #}}} 801 | def softmaxReScale(self,energy_,threshould): 802 | #{{{ 803 | #in energy_, the goundthrud should be max 804 | assert energy_.ndim==1; 805 | #convert threshould from 
percentage to energy_; 806 | threshould_=T.log(T.exp(energy_-energy_.max()).sum())+T.log(threshould)+energy_.max() 807 | energy=self.reScale(energy_,threshould_); 808 | return T.nnet.softmax(energy); 809 | #}}} 810 | def reScale(self,energy,threshold,replaceValue=1e-7): 811 | #{{{ 812 | assert energy.ndim==1; 813 | maxValue=energy.max(); 814 | def checkThreshold(value,threshold,replaceValue): 815 | return T.switch(T.lt(value,threshold),replaceValue,value); 816 | result,update=theano.scan(fn=checkThreshold, 817 | outputs_info=None, 818 | sequences=[energy], 819 | non_sequences=[threshold,replaceValue]); 820 | return T.switch(T.lt(maxValue,threshold),energy,result); 821 | #}}} 822 | 823 | def step(self,state,attended,source): 824 | #from theano.gradient import disconnected_grad; 825 | #state=disconnected_grad(state_); 826 | #M_state=T.dot(self.W_A_h,state) ; 827 | 828 | _energy=self.scoreFun(attended,state,self.W_A) 829 | energy=T.nnet.softmax(_energy); 830 | #energy=self.softmaxReScale(_energy,0.02); 831 | #energy=self.reScale(energy.flatten(),0.02).reshape((1,-1)) 832 | #energyIndex=energy.flatten().argmin(axis=-1); 833 | glimpsed=(energy.T*source).sum(axis=0) 834 | #glimpsed=source[energyIndex]; 835 | return energy.flatten(),glimpsed; 836 | 837 | def NonNegConstraint(self,p): 838 | p*=K.cast(p>=0.,K.floatx()); 839 | return p; 840 | 841 | def link(self,attended,state,source): 842 | step_function=self.step; 843 | attended_=T.tanh(T.dot(attended,self.W_A_X))+self.b_A_X; 844 | #attended_=attended; 845 | [energy,glimpsed],_=theano.scan(fn=step_function, 846 | sequences=[attended_], 847 | outputs_info=None, 848 | non_sequences=[attended_,source]); 849 | self.energy=energy; 850 | 851 | #combine 852 | #combine=T.concatenate([glimpsed,attended],axis=-1); 853 | combine=T.concatenate([glimpsed,source],axis=-1); 854 | combined=T.tanh(T.dot(combine,self.W_A_combine))+self.b_A_combine; 855 | #no source 856 | #combined=T.tanh(T.dot(glimpsed,self.W_A_combine))+self.b_A_combine; 857 | return combined; 858 | 859 | def log_sum_exp(x, axis=None): 860 | """ 861 | Sum probabilities in the log-space. 862 | """ 863 | xmax = x.max(axis=axis, keepdims=True) 864 | xmax_ = x.max(axis=axis) 865 | return xmax_ + T.log(T.exp(x - xmax).sum(axis=axis)) 866 | 867 | def forward(observations, transitions, viterbi=False, 868 | return_alpha=False, return_best_sequence=False): 869 | """ 870 | Takes as input: 871 | - observations, sequence of shape (n_steps, n_classes) 872 | - transitions, sequence of shape (n_classes, n_classes) 873 | Probabilities must be given in the log space. 
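AttentionLayer.step() above scores every attended position against the current state, turns the scores into a softmax distribution, and returns the weighted sum of source as the glimpse; link() additionally passes the attended matrix through a tanh projection first and then projects the concatenation of glimpse and source with W_A_combine. A minimal NumPy sketch of the glimpse computation for the default Euclidean score, with toy shapes and hypothetical names (illustrative only):

import numpy as np

def softmax(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def euclidean_attention(attended, state, w, source):
    # score = max-shifted weighted squared distance to the state (closer -> larger weight)
    m = ((attended - state) ** 2).dot(w)          # (seq_len,)
    energy = softmax(m.max() - m)                 # attention weights over positions
    glimpse = (energy[:, None] * source).sum(axis=0)
    return energy, glimpse

seq_len, dim = 6, 5
rng = np.random.RandomState(1)
attended = rng.randn(seq_len, dim)
source = rng.randn(seq_len, dim)
energy, glimpse = euclidean_attention(attended, attended[2], np.ones(dim), source)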
874 | Compute alpha, matrix of size (n_steps, n_classes), such that 875 | alpha[i, j] represents one of these 2 values: 876 | - the probability that the real path at node i ends in j 877 | - the maximum probability of a path finishing in j at node i (Viterbi) 878 | Returns one of these 2 values: 879 | - alpha 880 | - the final probability, which can be: 881 | - the sum of the probabilities of all paths 882 | - the probability of the best path (Viterbi) 883 | """ 884 | #the last row of transitions is the inital state 885 | trans=transitions[:-1]; 886 | assert not return_best_sequence or (viterbi and not return_alpha) 887 | assert viterbi==return_best_sequence 888 | 889 | def recurrence(obs, previous, transitions): 890 | previous = previous.dimshuffle(0, 'x') 891 | obs = obs.dimshuffle('x', 0) 892 | if viterbi: 893 | scores = previous + obs + transitions 894 | out = scores.max(axis=0) 895 | 896 | out2 = scores.argmax(axis=0) 897 | return out, out2 898 | else: 899 | return log_sum_exp(previous + obs + transitions, axis=0) 900 | 901 | initial = transitions[-1]+observations[0] 902 | alpha, _ = theano.scan( 903 | fn=recurrence, 904 | outputs_info=(initial, None) if return_best_sequence else initial, 905 | sequences=[observations[1:]], 906 | non_sequences=[trans] 907 | ) 908 | if viterbi: 909 | alpha0=T.concatenate([[initial],alpha[0]],axis=0); 910 | alpha=[alpha0,alpha[1]]; 911 | #else: 912 | # alpha=T.concatenate([log_sum_exp(initial,axis=0).dimshuffle('x',0), 913 | # alpha],axis=0); 914 | 915 | if return_alpha: 916 | return alpha 917 | elif return_best_sequence: 918 | sequence, _ = theano.scan( 919 | fn=lambda beta_i, previous: beta_i[previous], 920 | outputs_info=T.cast(T.argmax(alpha[0][-1]), 'int32'), 921 | sequences=T.cast(alpha[1][::-1], 'int32') 922 | ) 923 | sequence = T.concatenate([sequence[::-1], [T.argmax(alpha[0][-1])]]) 924 | return sequence 925 | else: 926 | if viterbi: 927 | return alpha[-1].max(axis=0) 928 | else: 929 | return log_sum_exp(alpha[-1], axis=0) 930 | 931 | 932 | def forward_org(observations, transitions, viterbi=False, 933 | return_alpha=False, return_best_sequence=False): 934 | """ 935 | Takes as input: 936 | - observations, sequence of shape (n_steps, n_classes) 937 | - transitions, sequence of shape (n_classes, n_classes) 938 | Probabilities must be given in the log space. 
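Both forward() above and forward_org() below run the same log-space recursion over tag scores: with log-sum-exp it yields the log partition function used by the CRF loss, and with max/argmax it yields the Viterbi-decoded best tag sequence (forward() additionally folds the initial-state scores, stored in the last row of transitions, into the first step). A compact NumPy sketch of the recursion, for reference (hypothetical names, simplified, no initial-state row):

import numpy as np

def log_sum_exp(x, axis=None):
    xmax = x.max(axis=axis, keepdims=True)
    return (xmax + np.log(np.exp(x - xmax).sum(axis=axis, keepdims=True))).squeeze(axis)

def crf_forward(observations, transitions, viterbi=False):
    # observations: (n_steps, n_classes) log scores, transitions: (n_classes, n_classes)
    alpha = observations[0]
    backpointers = []
    for obs in observations[1:]:
        scores = alpha[:, None] + obs[None, :] + transitions   # scores[i, j]: prev i -> cur j
        if viterbi:
            backpointers.append(scores.argmax(axis=0))
            alpha = scores.max(axis=0)
        else:
            alpha = log_sum_exp(scores, axis=0)
    if not viterbi:
        return log_sum_exp(alpha, axis=0)                      # log partition function
    best = [int(alpha.argmax())]                               # backtrack the best sequence
    for bp in reversed(backpointers):
        best.append(int(bp[best[-1]]))
    return best[::-1]

obs = np.log(np.random.RandomState(2).dirichlet(np.ones(3), size=4))  # 4 steps, 3 tags
best_path = crf_forward(obs, np.zeros((3, 3)), viterbi=True)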
939 | Compute alpha, matrix of size (n_steps, n_classes), such that 940 | alpha[i, j] represents one of these 2 values: 941 | - the probability that the real path at node i ends in j 942 | - the maximum probability of a path finishing in j at node i (Viterbi) 943 | Returns one of these 2 values: 944 | - alpha 945 | - the final probability, which can be: 946 | - the sum of the probabilities of all paths 947 | - the probability of the best path (Viterbi) 948 | 949 | """ 950 | 951 | assert not return_best_sequence or (viterbi and not return_alpha) 952 | 953 | def recurrence(obs, previous, transitions): 954 | previous = previous.dimshuffle(0, 'x') 955 | obs = obs.dimshuffle('x', 0) 956 | if viterbi: 957 | scores = previous + obs + transitions 958 | out = scores.max(axis=0) 959 | if return_best_sequence: 960 | out2 = scores.argmax(axis=0) 961 | return out, out2 962 | else: 963 | return out 964 | else: 965 | return log_sum_exp(previous + obs + transitions, axis=0) 966 | 967 | initial = observations[0] 968 | alpha, _ = theano.scan( 969 | fn=recurrence, 970 | outputs_info=(initial, None) if return_best_sequence else initial, 971 | sequences=[observations[1:]], 972 | non_sequences=transitions 973 | ) 974 | 975 | if return_alpha: 976 | return alpha 977 | elif return_best_sequence: 978 | sequence, _ = theano.scan( 979 | fn=lambda beta_i, previous: beta_i[previous], 980 | outputs_info=T.cast(T.argmax(alpha[0][-1]), 'int32'), 981 | sequences=T.cast(alpha[1][::-1], 'int32') 982 | ) 983 | #sequence = T.concatenate([sequence[::-1], [T.argmax(alpha[0][-1])]]) 984 | return alpha 985 | else: 986 | if viterbi: 987 | return alpha[-1].max(axis=0) 988 | else: 989 | return log_sum_exp(alpha[-1], axis=0) 990 | 991 | -------------------------------------------------------------------------------- /src/optimization.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | 5 | floatX = theano.config.floatX 6 | device = theano.config.device 7 | 8 | 9 | class Optimization: 10 | 11 | def __init__(self, clip=None): 12 | """ 13 | Initialization 14 | """ 15 | self.clip = clip 16 | 17 | def get_gradients(self, cost, params): 18 | """ 19 | Compute the gradients, and clip them if required. 20 | """ 21 | if self.clip is None: 22 | return T.grad(cost, params) 23 | else: 24 | assert self.clip > 0 25 | return T.grad( 26 | theano.gradient.grad_clip(cost, -1 * self.clip, self.clip), 27 | params 28 | ) 29 | 30 | def get_updates(self, method, cost, params,constraints={}, *args, **kwargs): 31 | """ 32 | Compute the updates for different optimizers. 33 | """ 34 | if method == 'sgd': 35 | updates = self.sgd(cost, params,constraints=constraints, **kwargs) 36 | elif method == 'sgdmomentum': 37 | updates = self.sgdmomentum(cost, params **kwargs) 38 | elif method == 'adagrad': 39 | updates = self.adagrad(cost, params, **kwargs) 40 | elif method == 'adadelta': 41 | updates = self.adadelta(cost, params, **kwargs) 42 | elif method == 'adam': 43 | updates = self.adam(cost, params, **kwargs) 44 | elif method == 'rmsprop': 45 | updates = self.rmsprop(cost, params, **kwargs) 46 | else: 47 | raise("Not implemented learning method: %s" % method) 48 | return updates 49 | 50 | def sgd(self, cost, params,constraints={}, lr=0.01): 51 | #{{{ 52 | """ 53 | Stochatic gradient descent. 
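get_gradients() above clips gradients through theano.gradient.grad_clip before the update, and sgd()/sgdmomentum() apply an optional per-parameter constraint (such as NonNegConstraint in nn.py) after each step. A simplified NumPy sketch of one such update, with an element-wise clip standing in for grad_clip and hypothetical names:

import numpy as np

def sgd_step(param, grad, lr=0.01, clip=5.0, constraint=None):
    # clip the gradient, take a plain SGD step, then project with the constraint
    grad = np.clip(grad, -clip, clip)
    new_param = param - lr * grad
    if constraint is not None:
        new_param = constraint(new_param)
    return new_param

non_neg = lambda p: p * (p >= 0)   # same idea as NonNegConstraint in nn.py
w = sgd_step(np.array([0.02, -0.01]), np.array([10.0, -3.0]), constraint=non_neg)
# w is approximately [0.0, 0.02]: the first weight would go negative and is projected back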
54 | """ 55 | updates = [] 56 | 57 | lr = theano.shared(np.float32(lr).astype(floatX)) 58 | gradients = self.get_gradients(cost, params) 59 | 60 | for p, g in zip(params, gradients): 61 | v=-lr*g; 62 | new_p=p+v; 63 | # apply constraints 64 | if p in constraints: 65 | c=constraints[p]; 66 | new_p=c(new_p); 67 | updates.append((p, new_p)) 68 | 69 | return updates 70 | #}}} 71 | def sgdmomentum(self, cost, params,constraints={}, lr=0.01,consider_constant=None, momentum=0.): 72 | """ 73 | Stochatic gradient descent with momentum. Momentum has to be in [0, 1) 74 | """ 75 | # Check that the momentum is a correct value 76 | assert 0 <= momentum < 1 77 | 78 | lr = theano.shared(np.float32(lr).astype(floatX)) 79 | momentum = theano.shared(np.float32(momentum).astype(floatX)) 80 | 81 | gradients = self.get_gradients(cost, params) 82 | velocities = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params] 83 | 84 | updates = [] 85 | for param, gradient, velocity in zip(params, gradients, velocities): 86 | new_velocity = momentum * velocity - lr * gradient 87 | updates.append((velocity, new_velocity)) 88 | new_p=param+new_velocity; 89 | # apply constraints 90 | if param in constraints: 91 | c=constraints[param]; 92 | new_p=c(new_p); 93 | updates.append((param, new_p)) 94 | return updates 95 | 96 | def adagrad(self, cost, params, lr=1.0, epsilon=1e-6,consider_constant=None): 97 | """ 98 | Adagrad. Based on http://www.ark.cs.cmu.edu/cdyer/adagrad.pdf 99 | """ 100 | lr = theano.shared(np.float32(lr).astype(floatX)) 101 | epsilon = theano.shared(np.float32(epsilon).astype(floatX)) 102 | 103 | gradients = self.get_gradients(cost, params,consider_constant) 104 | gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params] 105 | 106 | updates = [] 107 | for param, gradient, gsum in zip(params, gradients, gsums): 108 | new_gsum = gsum + gradient ** 2. 109 | updates.append((gsum, new_gsum)) 110 | updates.append((param, param - lr * gradient / (T.sqrt(gsum + epsilon)))) 111 | return updates 112 | 113 | def adadelta(self, cost, params, rho=0.95, epsilon=1e-6,consider_constant=None): 114 | """ 115 | Adadelta. Based on: 116 | http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf 117 | """ 118 | rho = theano.shared(np.float32(rho).astype(floatX)) 119 | epsilon = theano.shared(np.float32(epsilon).astype(floatX)) 120 | 121 | gradients = self.get_gradients(cost, params,consider_constant) 122 | accu_gradients = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params] 123 | accu_deltas = [theano.shared(np.zeros_like(param.get_value(borrow=True)).astype(floatX)) for param in params] 124 | 125 | updates = [] 126 | for param, gradient, accu_gradient, accu_delta in zip(params, gradients, accu_gradients, accu_deltas): 127 | new_accu_gradient = rho * accu_gradient + (1. - rho) * gradient ** 2. 128 | delta_x = - T.sqrt((accu_delta + epsilon) / (new_accu_gradient + epsilon)) * gradient 129 | new_accu_delta = rho * accu_delta + (1. - rho) * delta_x ** 2. 130 | updates.append((accu_gradient, new_accu_gradient)) 131 | updates.append((accu_delta, new_accu_delta)) 132 | updates.append((param, param + delta_x)) 133 | return updates 134 | 135 | def adam(self, cost, params, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,consider_constant=None): 136 | """ 137 | Adam. 
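The adam() method below is the standard Adam update with bias-corrected first and second moment estimates. A compact NumPy version of one step, with toy values and a hypothetical helper name (illustrative only):

import numpy as np

def adam_step(param, grad, m, v, t, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    # exponential moving averages of the gradient and its square, then bias correction
    m = beta1 * m + (1.0 - beta1) * grad
    v = beta2 * v + (1.0 - beta2) * grad ** 2
    m_hat = m / (1.0 - beta1 ** t)
    v_hat = v / (1.0 - beta2 ** t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)
    return param, m, v

p, m, v = np.zeros(3), np.zeros(3), np.zeros(3)
for t in range(1, 4):                     # three steps against a constant toy gradient
    p, m, v = adam_step(p, np.array([1.0, -1.0, 0.5]), m, v, t)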
Based on http://arxiv.org/pdf/1412.6980v4.pdf 138 | """ 139 | updates = [] 140 | gradients = self.get_gradients(cost, params,consider_constant) 141 | 142 | t = theano.shared(np.float32(1.).astype(floatX)) 143 | 144 | for param, gradient in zip(params, gradients): 145 | value = param.get_value(borrow=True) 146 | m_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) 147 | v_prev = theano.shared(np.zeros(value.shape, dtype=value.dtype), broadcastable=param.broadcastable) 148 | 149 | m = beta1 * m_prev + (1. - beta1) * gradient 150 | v = beta2 * v_prev + (1. - beta2) * gradient ** 2. 151 | m_hat = m / (1. - beta1 ** t) 152 | v_hat = v / (1. - beta2 ** t) 153 | theta = param - (lr * m_hat) / (T.sqrt(v_hat) + epsilon) 154 | 155 | updates.append((m_prev, m)) 156 | updates.append((v_prev, v)) 157 | updates.append((param, theta)) 158 | 159 | updates.append((t, t + 1.)) 160 | return updates 161 | 162 | def rmsprop(self, cost, params, lr=0.001, rho=0.9, eps=1e-6,consider_constant=None): 163 | """ 164 | RMSProp. 165 | """ 166 | lr = theano.shared(np.float32(lr).astype(floatX)) 167 | 168 | gradients = self.get_gradients(cost, params,consider_constant) 169 | accumulators = [theano.shared(np.zeros_like(p.get_value()).astype(np.float32)) for p in params] 170 | 171 | updates = [] 172 | 173 | for param, gradient, accumulator in zip(params, gradients, accumulators): 174 | new_accumulator = rho * accumulator + (1 - rho) * gradient ** 2 175 | updates.append((accumulator, new_accumulator)) 176 | 177 | new_param = param - lr * gradient / T.sqrt(new_accumulator + eps) 178 | updates.append((param, new_param)) 179 | 180 | return updates 181 | -------------------------------------------------------------------------------- /src/tagger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import time 5 | import codecs 6 | import optparse 7 | import numpy as np 8 | from loader import prepare_dataset; 9 | from utils import create_input, iobes_iob; 10 | from model import Model 11 | 12 | optparser = optparse.OptionParser() 13 | optparser.add_option( 14 | "-m", "--model", default="../models/chemnerModel/", 15 | help="Model location" 16 | ) 17 | optparser.add_option( 18 | "-i", "--input", default="../data/chemnder_test.txt", 19 | help="Input file location" 20 | ) 21 | optparser.add_option( 22 | "-o", "--output", default="./chemnder_test.tsv", 23 | help="Output file location" 24 | ) 25 | optparser.add_option( 26 | "-d", "--delimiter", default="__", 27 | help="Delimiter to separate words from their tags" 28 | ) 29 | opts = optparser.parse_args()[0] 30 | 31 | # Check parameters validity 32 | assert opts.delimiter 33 | assert os.path.isdir(opts.model) 34 | assert os.path.isfile(opts.input) 35 | 36 | # Load existing model 37 | print "Loading model..." 
38 | model = Model(model_path=opts.model) 39 | 40 | # Load reverse mappings 41 | word_to_id, char_to_id, tag_to_id = [ 42 | {v: k for k, v in x.items()} 43 | for x in [model.id_to_word, model.id_to_char, model.id_to_tag] 44 | ] 45 | parameters = model.parameters 46 | #print model.parameters 47 | # Load the model 48 | _, f_eval = model.build4(parameters) 49 | model.reload() 50 | 51 | #load test sentence 52 | def load_sentences(path): 53 | sentences = [] 54 | for line in codecs.open(path, 'r', 'utf8'): 55 | sentence =[]; 56 | line = line.rstrip() 57 | if line: 58 | word = line.split() 59 | for elem in word: 60 | sentence.append([elem]); 61 | sentences.append(sentence) 62 | return sentences 63 | 64 | test_sentences=load_sentences(opts.input); 65 | test_data=prepare_dataset(test_sentences,None,parameters,parameters['lower'],isTest=True); 66 | f_output = codecs.open(opts.output, 'w', 'utf-8') 67 | start = time.time() 68 | 69 | def xmlformat(sentence,tags): 70 | #{{{ 71 | assert len(sentence)==len(tags); 72 | res=[]; 73 | preTag="drug"; 74 | for i in range(len(tags)): 75 | if tags[i][0]=='B': 76 | if len(preTag): 77 | res.append(""); 78 | preTag=""; 79 | res.append("<"+tags[i][2:]+">"); 80 | preTag=tags[i][2:]; 81 | if tags[i][0]=='I': 82 | if preTag!=tags[i][2:]: 83 | if len(preTag): 84 | res.append(""); 85 | preTag=""; 86 | 87 | if tags[i][0]=='O': 88 | if len(preTag): 89 | res.append(""); 90 | preTag=""; 91 | res.append(sentence[i]); 92 | if len(preTag): 93 | res.append(""); 94 | return res; 95 | #}}} 96 | print 'Tagging...' 97 | for line in test_data: 98 | # Prepare input 99 | input = create_input(line, parameters, False,useAttend=parameters['useAttend']); 100 | words=line['str_words']; 101 | # Decoding 102 | if parameters['crf']: 103 | y_preds = np.array(f_eval(*input)) 104 | else: 105 | y_preds = f_eval(*input).argmax(axis=1) 106 | y_preds = [model.id_to_tag[y_pred] for y_pred in y_preds] 107 | # Output tags in the IOB2 format 108 | if parameters['tag_scheme'] == 'iobes': 109 | y_preds = iobes_iob(y_preds) 110 | # Write tags 111 | assert len(y_preds) == len(words) 112 | 113 | # print words 114 | for i in range(len(words)): 115 | f_output.write(words[i]+'\t'+y_preds[i]+'\n') 116 | f_output.write('\n') 117 | # for elem in xmlformat(words,y_preds): 118 | # f_output.write(elem+" "); 119 | # f_output.write("\n"); 120 | 121 | print '---- lines tagged in %.4fs ----' % ( time.time() - start) 122 | f_output.close() 123 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import numpy as np 5 | SEED=1234; 6 | np.random.seed(SEED); 7 | import optparse 8 | import itertools 9 | import time 10 | import subprocess 11 | from collections import OrderedDict 12 | from utils import create_input 13 | import loader 14 | 15 | from utils import models_path, evaluate, eval_script, eval_temp,create_mapping; 16 | from loader import word_mapping, char_mapping, tag_mapping 17 | from loader import update_tag_scheme, prepare_dataset 18 | from loader import augment_with_pretrained, feature_mapping; 19 | from model import Model 20 | #import random ; 21 | #for bash color 22 | BASH_RED="\033[0;31m"; 23 | BASH_GREEN="\033[0;32m" 24 | BASH_YELLOW="\033[0;33m" 25 | BASH_CYAN="\033[0;36m" 26 | BASH_CLEAR="\033[0m" 27 | 28 | #prepare for model 29 | #{{{ 30 | # Read parameters from command line 31 | #{{{ 32 | optparser = optparse.OptionParser() 33 | 
optparser.add_option( 34 | "-T", "--train", default="training.ner.ssplit.token4.BIO", 35 | help="Train set location" 36 | ) 37 | optparser.add_option( 38 | "-d", "--dev", default="development.ner.ssplit.token4.BIO", 39 | help="Dev set location" 40 | ) 41 | optparser.add_option( 42 | "-t", "--test", default="evaluation.ner.ssplit.token4.BIO", 43 | help="Test set location" 44 | ) 45 | optparser.add_option( 46 | "-s", "--tag_scheme", default="iob", 47 | help="Tagging scheme (IOB or IOBES)" 48 | ) 49 | optparser.add_option( 50 | "-l", "--lower", default="0", 51 | type='int', help="Lowercase words (this will not affect character inputs)" 52 | ) 53 | optparser.add_option( 54 | "-z", "--zeros", default="0", 55 | type='int', help="Replace digits with 0" 56 | ) 57 | optparser.add_option( 58 | "-c", "--char_dim", default="25", 59 | type='int', help="Char embedding dimension" 60 | ) 61 | optparser.add_option( 62 | "-C", "--char_lstm_dim", default="25", 63 | type='int', help="Char LSTM hidden layer size" 64 | ) 65 | optparser.add_option( 66 | "-b", "--char_bidirect", default="1", 67 | type='int', help="Use a bidirectional LSTM for chars" 68 | ) 69 | optparser.add_option( 70 | "-w", "--word_dim", default="50", 71 | type='int', help="Token embedding dimension" 72 | ) 73 | optparser.add_option( 74 | "-W", "--word_lstm_dim", default="100", 75 | type='int', help="Token LSTM hidden layer size" 76 | ) 77 | optparser.add_option( 78 | "-B", "--word_bidirect", default="1", 79 | type='int', help="Use a bidirectional LSTM for words" 80 | ) 81 | optparser.add_option( 82 | "-p", "--pre_emb", default="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50", 83 | help="Location of pretrained embeddings" 84 | ) 85 | optparser.add_option( 86 | "-A", "--all_emb", default="0", 87 | type='int', help="Load all embeddings" 88 | ) 89 | optparser.add_option( 90 | "-a", "--cap_dim", default="1", 91 | type='int', help="Capitalization feature dimension (0 to disable)" 92 | ) 93 | optparser.add_option( 94 | "-f", "--crf", default="1", 95 | type='int', help="Use CRF (0 to disable)" 96 | ) 97 | optparser.add_option( 98 | "-D", "--dropout", default="0.5", 99 | type='float', help="Droupout on the input (0 = no dropout)" 100 | ) 101 | optparser.add_option( 102 | "-L", "--lr_method", default="sgd-lr_.003", 103 | help="Learning method (SGD, Adadelta, Adam..)" 104 | ) 105 | optparser.add_option( 106 | "-r", "--reload", default="0", 107 | type='int', help="Reload the last saved model" 108 | ) 109 | optparser.add_option( 110 | "-S","--String",default="bilstm-crf-chemdner100d", 111 | help="some about this model" 112 | ) 113 | opts = optparser.parse_args()[0] 114 | #}}} 115 | 116 | 117 | #according corpus to set some parameter for loading file 118 | CORPUS="chem"; 119 | tagFilter=None; 120 | if CORPUS == "chem": 121 | #{{{ 122 | opts.train="./chemdner_corpus/chemdner_training.ner.sen.token4.BIO_allfea"; 123 | opts.dev="./chemdner_corpus/chemdner_development.ner.sen.token4.BIO_allfea"; 124 | opts.test="./chemdner_corpus/chemdner_evaluation.ner.sen.token4.BIO_allfea"; 125 | opts.pre_emb="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50"; 126 | tagFilter=None; 127 | devBoundary=55508 128 | #}}} 129 | elif CORPUS == "CDR": 130 | #{{{ 131 | opts.train="./cdr_corpus/cdr_training.ner.sen.token4.BIO_allfea_drug"; 132 | opts.dev="./cdr_corpus/cdr_development.ner.sen.token4.BIO_allfea_drug"; 133 | opts.test="./cdr_corpus/cdr_test.ner.sen.token4.BIO_allfea_drug"; 134 | 
opts.pre_emb="./word2vec_model/chemdner_pubmed_drug.word2vec_model_token4_d50"; 135 | tagFilter=['Disease']; 136 | devBoundary=8319; 137 | #}}} 138 | 139 | else: 140 | assert 0,"unknown corpus"; 141 | 142 | #read word_dim from word2vec_model 143 | #{{{ 144 | with open(opts.pre_emb) as file: 145 | first_line = file.readline() 146 | #create vec_table 147 | frequency = int(first_line.split()[0]); 148 | vec_size = int(first_line.split()[1]); 149 | opts.word_dim=vec_size; 150 | opts.word_lstm_dim=vec_size; 151 | #}}} 152 | 153 | # Parse parameters 154 | #{{{ 155 | parameters = OrderedDict() 156 | parameters['tag_scheme'] = opts.tag_scheme 157 | parameters['lower'] = opts.lower == 1 158 | parameters['zeros'] = opts.zeros == 1 159 | parameters['char_dim'] = opts.char_dim 160 | parameters['char_lstm_dim'] = opts.char_lstm_dim 161 | parameters['char_bidirect'] = opts.char_bidirect == 1 162 | parameters['word_dim'] = opts.word_dim 163 | parameters['word_lstm_dim'] = opts.word_lstm_dim 164 | parameters['word_bidirect'] = opts.word_bidirect == 1 165 | parameters['pre_emb'] = opts.pre_emb 166 | parameters['all_emb'] = opts.all_emb == 1 167 | parameters['cap_dim'] = opts.cap_dim 168 | parameters['crf'] = opts.crf == 1 169 | parameters['dropout'] = opts.dropout 170 | parameters['lr_method'] = opts.lr_method 171 | #}}} 172 | 173 | # Check parameters validity 174 | #{{{ 175 | assert os.path.isfile(opts.train) 176 | assert os.path.isfile(opts.dev) 177 | assert os.path.isfile(opts.test) 178 | assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0 179 | assert 0. <= parameters['dropout'] < 1.0 180 | assert parameters['tag_scheme'] in ['iob', 'iobes'] 181 | assert not parameters['all_emb'] or parameters['pre_emb'] 182 | assert not parameters['pre_emb'] or parameters['word_dim'] > 0 183 | assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb']) 184 | #}}} 185 | # Check evaluation script / folders 186 | if not os.path.isfile(eval_script): 187 | raise Exception('CoNLL evaluation script not found at "%s"' % eval_script) 188 | if not os.path.exists(eval_temp): 189 | os.makedirs(eval_temp) 190 | if not os.path.exists(models_path): 191 | os.makedirs(models_path) 192 | #}}} 193 | #prepare for train 194 | #{{{ 195 | # Data parameters 196 | lower = parameters['lower'] 197 | zeros = parameters['zeros'] 198 | tag_scheme = parameters['tag_scheme'] 199 | 200 | # Load sentences 201 | train_sentences = loader.load_sentences(opts.train, lower, zeros) 202 | dev_sentences = loader.load_sentences(opts.dev, lower, zeros) 203 | test_sentences = loader.load_sentences(opts.test, lower, zeros) 204 | #show dev boundary in doc ratio 205 | def showDevBoundary(docDataset,sentDataset,ratio): 206 | count=0; 207 | for elem in docDataset[:int(len(docDataset)*(1-ratio))]: 208 | count+=len(elem); 209 | i=0; 210 | count_=0 211 | for elem in sentDataset: 212 | i+=1; 213 | if count_ < count: 214 | count_+=len(elem); 215 | else: 216 | break; 217 | return i-1; 218 | #check 1 word sentences 219 | def check1word(sentences): 220 | Lens=[]; 221 | for elem in sentences: 222 | Lens.append(len(elem)); 223 | if min(Lens)==1: 224 | assert 0; 225 | #check1word(train_sentences); 226 | #check1word(dev_sentences); 227 | #check1word(test_sentences); 228 | 229 | #merge dev to train 230 | totalSentences=train_sentences+dev_sentences; 231 | #redefine train and dev 232 | #corpus are already random genergated, so no need to shuffly 233 | #random.seed(SEED); 234 | #random.shuffle(totalSentences); 235 | devRatio=0.1; 236 | #doc 10% != sentence 
10% 237 | #devBoundary=int(len(totalSentences)*(1-devRatio)) 238 | train_sentences=totalSentences[:devBoundary]; 239 | dev_sentences=totalSentences[devBoundary:]; 240 | 241 | # Use selected tagging scheme (IOB / IOBES) 242 | update_tag_scheme(train_sentences, tag_scheme,tagFilter) 243 | update_tag_scheme(dev_sentences, tag_scheme,tagFilter) 244 | update_tag_scheme(test_sentences, tag_scheme,tagFilter) 245 | 246 | # Create a dictionary / mapping of words 247 | # If we use pretrained embeddings, we add them to the dictionary. 248 | if parameters['pre_emb']: 249 | dico_words_train = word_mapping(train_sentences, lower)[0] 250 | dico_words, word_to_id, id_to_word = augment_with_pretrained( 251 | dico_words_train.copy(), 252 | parameters['pre_emb'], 253 | list(itertools.chain.from_iterable( 254 | [[w[0] for w in s] for s in dev_sentences + test_sentences]) 255 | ) if not parameters['all_emb'] else None 256 | ) 257 | else: 258 | dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower) 259 | dico_words_train = dico_words 260 | 261 | # Create a dictionary and a mapping for words / POS tags / tags 262 | dico_chars, char_to_id, id_to_char = char_mapping(train_sentences) 263 | dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences) 264 | 265 | #feature mapping 266 | #{{{ 267 | featureMap={#{{{ 268 | 'word':{ 269 | 'index':1, 270 | 'lstm-input':0, 271 | 'attended':0, 272 | }, 273 | 'char':{ 274 | 'index':0, 275 | 'lstm-input':0, 276 | 'attended':0, 277 | }, 278 | 'lemma':{ 'index':1, 279 | 'isUsed':0, 280 | 'num':0, 281 | 'dim':25, 282 | 'lstm-input':0, 283 | 'attended':0, 284 | 'pre_emb':''}, 285 | 'pos':{ 'index':2, 286 | 'isUsed':0, 287 | 'num':0, 288 | 'dim':50, 289 | 'lstm-input':0, 290 | 'attended':0, 291 | 'pre_emb':''}, 292 | 'chunk':{ 'index':3, 293 | 'isUsed':0, 294 | 'num':0, 295 | 'lstm-input':0, 296 | 'attended':0, 297 | 'dim':10}, 298 | 'dic':{ 'index':4, 299 | 'isUsed':1, 300 | 'num':3, 301 | 'lstm-input':1, 302 | 'attended':0, 303 | 'dim':5}, 304 | }#}}} 305 | feature2IdMap={'word':word_to_id, 306 | 'char':char_to_id, 307 | 'tag':tag_to_id}; 308 | 309 | if featureMap['lemma']['isUsed'] : 310 | dico_lemma,lemma_to_id,id_to_lemma=feature_mapping(train_sentences, 311 | featureMap['lemma']['index'],'lemma'); 312 | featureMap['lemma']['num']=len(dico_lemma) 313 | feature2IdMap['lemma']=lemma_to_id; 314 | 315 | if featureMap['pos']['isUsed'] : 316 | dico_pos,pos_to_id,id_to_pos=feature_mapping(train_sentences, 317 | featureMap['pos']['index'],'pos'); 318 | featureMap['pos']['num']=len(dico_pos) 319 | feature2IdMap['pos']=pos_to_id; 320 | if featureMap['chunk']['isUsed']: 321 | dico_chunk,chunk_to_id,id_to_chunk=feature_mapping(train_sentences, 322 | featureMap['chunk']['index'],'chunk'); 323 | featureMap['chunk']['num']=len(dico_chunk) 324 | feature2IdMap['chunk']=chunk_to_id; 325 | if featureMap['dic']['isUsed'] : 326 | dico_dic={'B':0,'I':1,'O':2}; 327 | dic_to_id,id_to_dic=create_mapping(dico_dic); 328 | feature2IdMap['dic']=dic_to_id; 329 | print BASH_YELLOW+str(featureMap)+BASH_CLEAR; 330 | featureMap['feature2IdMap']=feature2IdMap; 331 | parameters['features']=featureMap; 332 | #}}} 333 | 334 | # Index data 335 | train_data = prepare_dataset( 336 | train_sentences,None, parameters, lower 337 | ) 338 | dev_data = prepare_dataset( 339 | dev_sentences,None, parameters, lower 340 | ) 341 | test_data = prepare_dataset( 342 | test_sentences,None, parameters, lower 343 | ) 344 | 345 | print "%i / %i / %i sentences in train / dev / test." 
% ( 346 | len(train_data), len(dev_data), len(test_data)) 347 | 348 | 349 | parameters['useAttend']=False; 350 | parameters['sentencesLevelLoss']=False; 351 | parameters['training']=True; 352 | saveModel=True; 353 | useEarlyStopping=False; 354 | # Initialize model 355 | model = Model(parameters=parameters, 356 | models_path=models_path, 357 | model_path="./models/bilstm-crf-dic-chemdner-50d/",Training=True); 358 | # Save the mappings to disk 359 | print 'Saving the mappings to disk...' 360 | model.save_mappings(id_to_word, id_to_char, id_to_tag) 361 | print BASH_YELLOW+"Model location: "+BASH_CLEAR+ "%s" % model.model_path 362 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 363 | print BASH_YELLOW+'save model:'+BASH_CLEAR,saveModel; 364 | # Build the model 365 | f_train, f_eval = model.build4(parameters) 366 | 367 | # Reload previous model values 368 | if opts.reload: 369 | print 'Reloading previous model...' 370 | model.reload() 371 | #}}} 372 | # 373 | # Train network 374 | # 375 | singletons = set([word_to_id[k] for k, v 376 | in dico_words_train.items() if v == 1]) 377 | freq_eval = int(len(train_data)*0.3) # evaluate on dev every freq_eval steps 378 | count = 0 379 | limitPrint=12; 380 | param = { 381 | 'lr':0.005, 382 | 'verbose':1, 383 | 'decay':True, # decay on the learning rate if improvement stops 384 | 'bs':5, # number of backprop through time steps 385 | 'seed':345, 386 | 'epochs':40, 387 | 'crf':True, 388 | 'shuffle':True}; 389 | folder_out = '../log/BiLSTM-CRF/' 390 | print BASH_YELLOW+"folder_out:"+BASH_CLEAR,folder_out; 391 | best_f1=-np.inf; 392 | 393 | #generate FILE NAME PREFIX 394 | fileNamePrefix=""; 395 | if opts.String != "": 396 | fileNamePrefix=opts.String; 397 | fileNamePrefix=fileNamePrefix.replace(",","_"); 398 | fileNamePrefix=fileNamePrefix.replace(" ","_"); 399 | 400 | #train model 401 | if useEarlyStopping: 402 | #{{{ 403 | from utils import EarlyStopping; 404 | eStop=EarlyStopping(patience=15,mode='min'); 405 | eStop.on_train_begin(); 406 | f_test=model.f_test; 407 | 408 | #start train our model 409 | for epoch in xrange(param['epochs']): 410 | epoch_costs = [] 411 | startTime=time.time(); 412 | 413 | #decide whether early stop 414 | if eStop.stop_training: 415 | break; 416 | 417 | print "Starting epoch %i..." 
% epoch 418 | for i, index in enumerate(np.random.permutation(len(train_data))): 419 | count += 1 420 | input = create_input(train_data[index], parameters, True, singletons,False) 421 | new_cost = f_train(*input) 422 | if np.isnan(new_cost): 423 | print index,"nan" 424 | epoch_costs.append(new_cost) 425 | #validation 426 | if count == len(train_data): 427 | valLoss=[]; 428 | for i in range(len(dev_data)): 429 | devInput=create_input(dev_data[i],parameters,True,None,False); 430 | newDevCost=f_test(*devInput); 431 | valLoss.append(newDevCost); 432 | #res_dev = evaluate(parameters, f_eval, dev_sentences, 433 | # dev_data, id_to_tag, dico_tags, 434 | # folder_out+fileNamePrefix+'.dev.txt', 435 | # useAttend=False) 436 | eStop.on_epoch_end(epoch,np.mean(valLoss)) ; 437 | if eStop.stop_training: 438 | break; 439 | print BASH_YELLOW+"avg error:"+BASH_CLEAR,np.mean(epoch_costs),\ 440 | "avg dev loss:",np.mean(valLoss); 441 | print BASH_YELLOW+"One epch espliced:"+BASH_CLEAR,time.time()-startTime; 442 | 443 | #start evaluate on test 444 | res_test = evaluate(parameters, f_eval, test_sentences, 445 | test_data, id_to_tag, dico_tags, 446 | folder_out+fileNamePrefix+'.test.txt', 447 | useAttend=False) 448 | if saveModel: 449 | print "Saving model to disk..." 450 | model.save() 451 | print BASH_RED+'TEST: epoch'+BASH_CLEAR, epoch, 'F1', res_test['f1'],'p:',res_test['p'],'r:',res_test['r'], ' '*15 452 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 453 | #}}} 454 | else: 455 | #{{{ 456 | #start train our model 457 | for epoch in xrange(param['epochs']): 458 | epoch_costs = [] 459 | startTime=time.time(); 460 | print "Starting epoch %i..." % epoch 461 | for i, index in enumerate(np.random.permutation(len(train_data))): 462 | count += 1 463 | input = create_input(train_data[index], parameters, True, singletons,False) 464 | new_cost = f_train(*input) 465 | if np.isnan(new_cost): 466 | print index,"nan" 467 | epoch_costs.append(new_cost) 468 | if count % freq_eval == 0 and epoch>=limitPrint: 469 | res_dev = evaluate(parameters, f_eval, dev_sentences, 470 | dev_data, id_to_tag, dico_tags, 471 | folder_out+fileNamePrefix+'.dev.txt', 472 | useAttend=False) 473 | #new F1 value on dev 474 | if res_dev['f1'] > best_f1: 475 | best_f1 = res_dev['f1'] 476 | if param['verbose']: 477 | print BASH_CYAN+'NEW DEV BEST: epoch'+BASH_CLEAR, epoch, 'best dev F1', res_dev['f1'],'p:',res_dev['p'],'r:',res_dev['r'], ' '*15 478 | 479 | #new F1 value on dev, so evaluate on test 480 | res_test = evaluate(parameters, f_eval, test_sentences, 481 | test_data, id_to_tag, dico_tags, 482 | folder_out+fileNamePrefix+'.test.txt', 483 | useAttend=False) 484 | if saveModel: 485 | print "Saving model to disk..." 
486 | model.save() 487 | print BASH_RED+'THIS TEST: epoch'+BASH_CLEAR, epoch, 'F1', res_test['f1'],'p:',res_test['p'],'r:',res_test['r'], ' '*15 488 | param['tf1'], param['tp'], param['tr'] = res_test['f1'], res_test['p'], res_test['r'] 489 | param['be'] = epoch 490 | print BASH_YELLOW+"avg error:"+BASH_CLEAR,np.mean(epoch_costs); 491 | print BASH_YELLOW+"One epch espliced:"+BASH_CLEAR,time.time()-startTime; 492 | print BASH_GREEN+'FINAL TEST RESULT: epoch'+BASH_CLEAR, param['be'], 'final test F1', param['tf1'],'best p:',param['tp'],'best r:',param['tr'] 493 | print BASH_YELLOW+"model important point:"+BASH_CLEAR,opts.String; 494 | #}}} 495 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import codecs 4 | import numpy as np 5 | import six 6 | import theano 7 | 8 | 9 | models_path = "./models" 10 | eval_path = "./evaluation" 11 | eval_temp = os.path.join(eval_path, "temp") 12 | eval_script = os.path.join(eval_path, "conlleval") 13 | 14 | class EarlyStopping(object): 15 | #{{{ 16 | '''Stop training when a monitored quantity has stopped improving. 17 | 18 | # Arguments 19 | monitor: quantity to be monitored. 20 | min_delta: minimum change in the monitored quantity 21 | to qualify as an improvement, i.e. an absolute 22 | change of less than min_delta, will count as no 23 | improvement. 24 | patience: number of epochs with no improvement 25 | after which training will be stopped. 26 | ''' 27 | def __init__(self, monitor='val_loss', 28 | min_delta=1e-6, patience=5,mode='min'): 29 | #{{{ 30 | super(EarlyStopping, self).__init__() 31 | 32 | self.monitor = monitor 33 | self.patience = patience 34 | self.min_delta = min_delta 35 | self.wait = 0 36 | self.stopped_epoch = 0 37 | self.stop_training=False; 38 | 39 | if mode =="min": 40 | self.monitor_op = np.less; 41 | elif mode == "max": 42 | self.monitor_op = np.greater; 43 | else: 44 | assert 0,"unknown early stop mode:"; 45 | 46 | self.min_delta *= -1 47 | #}}} 48 | def on_train_begin(self): 49 | self.wait = 0 # Allow instances to be re-used 50 | self.best = np.Inf if self.monitor_op == np.less else -np.Inf 51 | 52 | def on_epoch_end(self, epoch, loss): 53 | #{{{ 54 | current = loss 55 | 56 | if self.monitor_op(current - self.min_delta, self.best): 57 | self.best = current 58 | self.wait = 0 59 | else: 60 | if self.wait >= self.patience: 61 | self.stopped_epoch = epoch 62 | self.stop_training = True 63 | self.wait += 1 64 | #}}} 65 | def on_train_end(self, logs={}): 66 | if self.stopped_epoch > 0 : 67 | print('Epoch %05d: early stopping' % (self.stopped_epoch)) 68 | 69 | #}}} 70 | def get_from_module(identifier, module_params, module_name, 71 | instantiate=False, kwargs=None): 72 | #{{{ 73 | if isinstance(identifier, six.string_types): 74 | res = module_params.get(identifier) 75 | if not res: 76 | raise ValueError('Invalid ' + str(module_name) + ': ' + 77 | str(identifier)) 78 | if instantiate and not kwargs: 79 | return res() 80 | elif instantiate and kwargs: 81 | return res(**kwargs) 82 | else: 83 | return res 84 | elif isinstance(identifier, dict): 85 | name = identifier.pop('name') 86 | res = module_params.get(name) 87 | if res: 88 | return res(**identifier) 89 | else: 90 | raise ValueError('Invalid ' + str(module_name) + ': ' + 91 | str(identifier)) 92 | return identifier 93 | #}}} 94 | 95 | def findNotSame(fNameX,fNameY): 96 | #{{{ 97 | """ 98 | verify two file is same or not 99 | 
""" 100 | space='space'; 101 | def loadFile(fName): 102 | word=[]; 103 | import codecs; 104 | for line in codecs.open(fName,'r','utf8'): 105 | line=line.rstrip(); 106 | if len(line)>0: 107 | word.append(line[0]); 108 | else: 109 | word.append(space); 110 | return word; 111 | word1=loadFile(fNameX); 112 | word2=loadFile(fNameY); 113 | i=0; 114 | j=0; 115 | while i|") 168 | #}}} 169 | 170 | def set_values(name, param, pretrained): 171 | #{{{ 172 | """ 173 | Initialize a network parameter with pretrained values. 174 | We check that sizes are compatible. 175 | """ 176 | param_value = param.get_value() 177 | if pretrained.size != param_value.size: 178 | raise Exception( 179 | "Size mismatch for parameter %s. Expected %i, found %i." 180 | % (name, param_value.size, pretrained.size) 181 | ) 182 | param.set_value(np.reshape( 183 | pretrained, param_value.shape 184 | ).astype(np.float32)) 185 | #}}} 186 | 187 | import initializations; 188 | def shared(shape, name): 189 | #{{{ 190 | """ 191 | Create a shared object of a numpy array. 192 | """ 193 | init=initializations.get('glorot_uniform'); 194 | if len(shape) == 1: 195 | value = np.zeros(shape) # bias are initialized with zeros 196 | return theano.shared(value=value.astype(theano.config.floatX), name=name) 197 | else: 198 | drange = np.sqrt(6. / (np.sum(shape))) 199 | value = drange * np.random.uniform(low=-1.0, high=1.0, size=shape) 200 | return init(shape=shape,name=name); 201 | #}}} 202 | 203 | def create_dico(item_list): 204 | #{{{ 205 | """ 206 | Create a dictionary of items from a list of list of items. 207 | """ 208 | assert type(item_list) is list 209 | dico = {} 210 | for items in item_list: 211 | for item in items: 212 | if item not in dico: 213 | dico[item] = 1 214 | else: 215 | dico[item] += 1 216 | return dico 217 | #}}} 218 | 219 | def create_mapping(dico): 220 | #{{{ 221 | """ 222 | Create a mapping (item to ID / ID to item) from a dictionary. 223 | Items are ordered by decreasing frequency. 224 | """ 225 | sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0])) 226 | id_to_item = {i: v[0] for i, v in enumerate(sorted_items)} 227 | item_to_id = {v: k for k, v in id_to_item.items()} 228 | return item_to_id, id_to_item 229 | #}}} 230 | 231 | def zero_digits(s): 232 | #{{{ 233 | """ 234 | Replace every digit in a string by a zero. 235 | """ 236 | return re.sub('\d', '0', s) 237 | #}}} 238 | 239 | def iob2(tags): 240 | #{{{ 241 | """ 242 | Check that tags have a valid IOB format. 243 | Tags in IOB1 format are converted to IOB2. 
244 | """ 245 | for i, tag in enumerate(tags): 246 | if tag == 'O': 247 | continue 248 | split = tag.split('-') 249 | if split[0] not in ['I', 'B']: 250 | #if len(split) != 2 or split[0] not in ['I', 'B']: 251 | return False 252 | if split[0] == 'B': 253 | continue 254 | elif i == 0 or tags[i - 1] == 'O': # conversion IOB1 to IOB2 255 | tags[i] = 'B' + tag[1:] 256 | elif tags[i - 1][1:] == tag[1:]: 257 | continue 258 | else: # conversion IOB1 to IOB2 259 | tags[i] = 'B' + tag[1:] 260 | return True 261 | #}}} 262 | 263 | def iob_iobes(tags): 264 | #{{{ 265 | """ 266 | IOB -> IOBES 267 | """ 268 | new_tags = [] 269 | for i, tag in enumerate(tags): 270 | if tag == 'O': 271 | new_tags.append(tag) 272 | elif tag.split('-')[0] == 'B': 273 | if i + 1 != len(tags) and \ 274 | tags[i + 1].split('-')[0] == 'I': 275 | new_tags.append(tag) 276 | else: 277 | new_tags.append(tag.replace('B-', 'S-')) 278 | elif tag.split('-')[0] == 'I': 279 | if i + 1 < len(tags) and \ 280 | tags[i + 1].split('-')[0] == 'I': 281 | new_tags.append(tag) 282 | else: 283 | new_tags.append(tag.replace('I-', 'E-')) 284 | else: 285 | raise Exception('Invalid IOB format!') 286 | return new_tags 287 | #}}} 288 | 289 | def iobes_iob(tags): 290 | #{{{ 291 | """ 292 | IOBES -> IOB 293 | """ 294 | new_tags = [] 295 | for i, tag in enumerate(tags): 296 | if tag.split('-')[0] == 'B': 297 | new_tags.append(tag) 298 | elif tag.split('-')[0] == 'I': 299 | new_tags.append(tag) 300 | elif tag.split('-')[0] == 'S': 301 | new_tags.append(tag.replace('S-', 'B-')) 302 | elif tag.split('-')[0] == 'E': 303 | new_tags.append(tag.replace('E-', 'I-')) 304 | elif tag.split('-')[0] == 'O': 305 | new_tags.append(tag) 306 | else: 307 | raise Exception('Invalid format!') 308 | return new_tags 309 | #}}} 310 | 311 | def insert_singletons(words, singletons, p=0.5): 312 | #{{{ 313 | """ 314 | Replace singletons by the unknown word with a probability p. 315 | """ 316 | new_words = [] 317 | for word in words: 318 | if word in singletons and np.random.uniform() < p: 319 | new_words.append(0) 320 | else: 321 | new_words.append(word) 322 | return new_words 323 | #}}} 324 | 325 | def pad_word_chars(words): 326 | #{{{ 327 | """ 328 | Pad the characters of the words in a sentence. 329 | Input: 330 | - list of lists of ints (list of words, a word being a list of char indexes) 331 | Output: 332 | - padded list of lists of ints 333 | - padded list of lists of ints (where chars are reversed) 334 | - list of ints corresponding to the index of the last character of each word 335 | """ 336 | max_length = max([len(word) for word in words]) 337 | char_for = [] 338 | char_rev = [] 339 | char_pos = [] 340 | for word in words: 341 | padding = [0] * (max_length - len(word)) 342 | char_for.append(word + padding) 343 | char_rev.append(word[::-1] + padding) 344 | char_pos.append(len(word) - 1) 345 | return char_for, char_rev, char_pos 346 | #}}} 347 | 348 | 349 | def create_input(data, parameters, add_label, singletons=None, 350 | useAttend=True): 351 | #{{{ 352 | """ 353 | Take sentence data and return an input for 354 | the training or the evaluation function. 
355 | """ 356 | words = data['words'] 357 | wordsTrue=data['words']; 358 | chars = data['chars'] 359 | if singletons is not None: 360 | words = insert_singletons(words, singletons) 361 | if parameters['cap_dim']: 362 | caps = data['caps'] 363 | char_for, char_rev, char_pos = pad_word_chars(chars) 364 | input = [] 365 | if parameters['word_dim']: 366 | input.append(words) 367 | if parameters['char_dim']: 368 | input.append(char_for) 369 | if parameters['char_bidirect']: 370 | input.append(char_rev) 371 | input.append(char_pos) 372 | if parameters['cap_dim']: 373 | input.append(caps) 374 | if useAttend: 375 | input.append(wordsTrue); 376 | if parameters.has_key('sentencesLevelLoss') \ 377 | and parameters['sentencesLevelLoss']: 378 | input.append(data['lens']) ; 379 | 380 | #add features 381 | if parameters.has_key('features'): 382 | features=parameters['features']; 383 | else: 384 | features=None; 385 | if features is not None and features['lemma']['isUsed']: 386 | input.append(data['lemma']); 387 | if features is not None and features['pos']['isUsed']: 388 | input.append(data['pos']); 389 | if features is not None and features['chunk']['isUsed']: 390 | input.append(data['chunk']); 391 | if features is not None and features['dic']['isUsed']: 392 | input.append(data['dic']); 393 | 394 | if add_label: 395 | input.append(data['tags']) 396 | return input 397 | #}}} 398 | 399 | from os.path import isfile 400 | from os import chmod 401 | import stat 402 | import subprocess 403 | PREFIX = './evaluation/' 404 | def get_perf(filename): 405 | ''' run conlleval.pl perl script to obtain 406 | precision/recall and F1 score ''' 407 | _conlleval = PREFIX + 'conlleval' 408 | if not isfile(_conlleval): 409 | #download('http://www-etud.iro.umontreal.ca/~mesnilgr/atis/conlleval.pl') 410 | os.system('wget https://www.comp.nus.edu.sg/%7Ekanmy/courses/practicalNLP_2008/packages/conlleval.pl') 411 | chmod('conlleval.pl', stat.S_IRWXU) # give the execute permissions 412 | 413 | out = [] 414 | proc = subprocess.Popen(["perl", _conlleval], stdin=subprocess.PIPE, stdout=subprocess.PIPE) 415 | stdout, _ = proc.communicate(open(filename).read()) 416 | for line in stdout.split('\n'): 417 | if 'accuracy' in line: 418 | out = line.split() 419 | break 420 | 421 | # out = ['accuracy:', '16.26%;', 'precision:', '0.00%;', 'recall:', '0.00%;', 'FB1:', '0.00'] 422 | precision = float(out[3][:-2]) 423 | recall = float(out[5][:-2]) 424 | f1score = float(out[7]) 425 | 426 | return {'p':precision, 'r':recall, 'f1':f1score} 427 | 428 | def evaluate(parameters, f_eval, raw_sentences, parsed_sentences, 429 | id_to_tag, dictionary_tags,filename, 430 | useAttend=True): 431 | #{{{ 432 | """ 433 | Evaluate current model using CoNLL script. 
434 | """ 435 | n_tags = len(id_to_tag) 436 | predictions = [] 437 | count = np.zeros((n_tags, n_tags), dtype=np.int32) 438 | 439 | for raw_sentence, data in zip(raw_sentences, parsed_sentences): 440 | input = create_input(data, parameters, False,useAttend=useAttend) 441 | if parameters['crf']: 442 | y_preds = np.array(f_eval(*input)) 443 | else: 444 | y_preds = f_eval(*input).argmax(axis=1) 445 | y_reals = np.array(data['tags']).astype(np.int32) 446 | assert len(y_preds) == len(y_reals) 447 | p_tags = [id_to_tag[y_pred] for y_pred in y_preds] 448 | r_tags = [id_to_tag[y_real] for y_real in y_reals] 449 | if parameters['tag_scheme'] == 'iobes': 450 | p_tags = iobes_iob(p_tags) 451 | r_tags = iobes_iob(r_tags) 452 | for i, (y_pred, y_real) in enumerate(zip(y_preds, y_reals)): 453 | new_line = " ".join(raw_sentence[i][:-1] + [r_tags[i], p_tags[i]]) 454 | predictions.append(new_line) 455 | count[y_real, y_pred] += 1 456 | predictions.append("") 457 | #write to file 458 | with codecs.open(filename, 'w', 'utf8') as f: 459 | f.write("\n".join(predictions)) 460 | return get_perf(filename) 461 | #}}} 462 | --------------------------------------------------------------------------------
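The snippet below is a minimal, self-contained usage sketch (not part of the repository) of the EarlyStopping helper defined in src/utils.py. It assumes the Python 2 / Theano environment the repository targets, with src/ as the working directory so that `from utils import EarlyStopping` resolves, and it feeds the monitor a toy dev-loss sequence instead of real f_test outputs.

import numpy as np
from utils import EarlyStopping

# Toy dev-loss curve: improves for three epochs, then stalls.
toy_dev_losses = [0.9, 0.7, 0.6, 0.61, 0.62, 0.60, 0.63, 0.64, 0.65, 0.66]

eStop = EarlyStopping(min_delta=1e-6, patience=5, mode='min')
eStop.on_train_begin()
for epoch, loss in enumerate(toy_dev_losses):
    # In train.py the second argument is np.mean(valLoss) computed with f_test.
    eStop.on_epoch_end(epoch, loss)
    if eStop.stop_training:
        break
eStop.on_train_end()   # prints "Epoch 00008: early stopping" for this sequence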
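A second small sketch (also not part of the repository) illustrates the tag-scheme helpers from src/utils.py: iob2() upgrades IOB1 tags to IOB2 in place, iob_iobes() expands them to IOBES, and iobes_iob() maps them back before conlleval-style scoring. The 'Chemical' entity type is only an example label.

from utils import iob2, iob_iobes, iobes_iob

tags = ['I-Chemical', 'I-Chemical', 'O', 'I-Chemical']   # IOB1-style input
assert iob2(tags)               # converts the list in place to IOB2
print tags                      # ['B-Chemical', 'I-Chemical', 'O', 'B-Chemical']
iobes = iob_iobes(tags)
print iobes                     # ['B-Chemical', 'E-Chemical', 'O', 'S-Chemical']
print iobes_iob(iobes)          # ['B-Chemical', 'I-Chemical', 'O', 'B-Chemical']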
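Finally, a brief sketch (again not from the repository) of the vocabulary helpers in src/utils.py: zero_digits() collapses digits so numeric variants share one surface form, create_dico() counts items, and create_mapping() assigns IDs by decreasing frequency. The example tokens are made up.

from utils import create_dico, create_mapping, zero_digits

sentences = [['Aspirin', 'inhibits', 'COX-1'], ['COX-1', 'and', 'COX-2']]
sentences = [[zero_digits(w.lower()) for w in s] for s in sentences]
dico = create_dico(sentences)            # e.g. {'cox-0': 3, 'aspirin': 1, ...}
word_to_id, id_to_word = create_mapping(dico)
print word_to_id['cox-0']                # 0 -- the most frequent item gets the lowest ID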