├── .gitignore ├── LICENSE ├── README.md ├── docs ├── Makefile ├── framework.png └── source │ ├── conf.py │ ├── index.rst │ ├── model.rst │ └── modules.rst ├── eval_w.py ├── eval_wc.py ├── model ├── __init__.py ├── crf.py ├── evaluator.py ├── highway.py ├── lm_lstm_crf.py ├── lstm_crf.py ├── ner_dataset.py ├── predictor.py └── utils.py ├── requirements.txt ├── seq_w.py ├── seq_wc.py ├── train_w.py └── train_wc.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | .DS_Store 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # IDEA 107 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {2017} {Liyuan Liu} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # LM-LSTM-CRF
 2 | 
 3 | [![Documentation Status](https://readthedocs.org/projects/lm-lstm-crf/badge/?version=latest)](http://lm-lstm-crf.readthedocs.io/en/latest/?badge=latest)
 4 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
 5 | [![Insight.io](https://insight.io/repoBadge/github.com/LiyuanLucasLiu/LM-LSTM-CRF)](https://insight.io/github.com/LiyuanLucasLiu/LM-LSTM-CRF)
 6 | 
 7 | **Check Our New NER Toolkit🚀🚀🚀**
 8 | - **Inference**:
 9 |   - **[LightNER](https://github.com/LiyuanLucasLiu/LightNER)**: inference w. models pre-trained / trained w. *any* of the following tools, *efficiently*.
10 | - **Training**:
11 |   - **[LD-Net](https://github.com/LiyuanLucasLiu/LD-Net)**: train NER models w. efficient contextualized representations.
12 |   - **[VanillaNER](https://github.com/LiyuanLucasLiu/Vanilla_NER)**: train vanilla NER models w. pre-trained embeddings.
13 | - **Distant Training**:
14 |   - **[AutoNER](https://shangjingbo1226.github.io/AutoNER/)**: train NER models w.o. line-by-line annotations and get competitive performance.
15 | 
16 | --------------------------------
17 | 
18 | This project provides high-performance character-aware sequence labeling tools, including [Training](#usage), [Evaluation](#evaluation) and [Prediction](#prediction).
19 | 
20 | Details about LM-LSTM-CRF can be accessed [here](http://arxiv.org/abs/1709.04109), and the implementation is based on the PyTorch library.
21 | 
22 | **Important:** A serious bug was found in the ```bioes_to_span``` function of the original implementation; please treat the numbers reported in the [Benchmarks](#benchmarks) section as the accurate performance.
23 | 
24 | The documentation is available [here](http://lm-lstm-crf.readthedocs.io/en/latest/).
25 | 
26 | ## Quick Links
27 | 
28 | - [Model](#model-notes)
29 | - [Installation](#installation)
30 | - [Data](#data)
31 | - [Usage](#usage)
32 | - [Benchmarks](#benchmarks)
33 | - [Pretrained model](#pretrained-model)
34 | 
35 | ## Model Notes
36 | 
37 | 
![Framework of LM-LSTM-CRF](docs/framework.png)
 38 | 
 39 | As visualized above, we use a conditional random field (CRF) to capture label dependencies, and adopt a hierarchical LSTM to leverage both char-level and word-level inputs.
 40 | The char-level structure is further guided by a language model, while pre-trained word embeddings are leveraged at the word level.
 41 | The language model and the sequence labeling model are trained at the same time, and both make predictions at the word level.
 42 | [Highway networks](https://arxiv.org/abs/1507.06228) are used to transform the output of the char-level LSTM into different semantic spaces, thus mediating between the two tasks and allowing the language model to empower sequence labeling.
 43 | 
 44 | ## Installation
 45 | 
 46 | For training, a GPU is strongly recommended for speed. CPU is supported, but training could be extremely slow.
 47 | 
 48 | ### PyTorch
 49 | 
 50 | The code is based on PyTorch and **supports PyTorch 0.4**. You can find installation instructions [here](http://pytorch.org/).
 51 | 
 52 | ### Dependencies
 53 | 
 54 | The code is written in Python 3.6. Its dependencies are summarized in the file ```requirements.txt```. You can install them with:
 55 | ```
 56 | pip3 install -r requirements.txt
 57 | ```
 58 | 
 59 | ## Data
 60 | 
 61 | We mainly focus on the CoNLL 2003 NER dataset, and the code takes its original format as input.
 62 | However, due to license restrictions, we cannot distribute this dataset.
 63 | You should be able to get it [here](http://aclweb.org/anthology/W03-0419).
 64 | You may also find copies that others have posted online (e.g., on GitHub).
 65 | 
 66 | ### Format
 67 | 
 68 | We assume the corpus is formatted in the same way as the CoNLL 2003 NER dataset.
 69 | More specifically, **empty lines** are used as separators between sentences, and the separator between documents is a special line, as below.
 70 | ```
 71 | -DOCSTART- -X- -X- -X- O
 72 | ```
 73 | Other lines contain words, labels and other fields. The **word** must be the **first** field, the **label** must be the **last**, and the fields are **separated by spaces**.
 74 | For example, the first several lines in the WSJ portion of the PTB POS tagging corpus look like the following snippet.
 75 | 
 76 | ```
 77 | -DOCSTART- -X- -X- -X- O
 78 | 
 79 | Pierre NNP
 80 | Vinken NNP
 81 | , ,
 82 | 61 CD
 83 | years NNS
 84 | old JJ
 85 | , ,
 86 | will MD
 87 | join VB
 88 | the DT
 89 | board NN
 90 | as IN
 91 | a DT
 92 | nonexecutive JJ
 93 | director NN
 94 | Nov. NNP
 95 | 29 CD
 96 | . .
 97 | 
 98 | 
 99 | ```
100 | 
101 | ## Usage
102 | 
103 | Here we provide implementations for two models: **LM-LSTM-CRF** and its variant **LSTM-CRF**, which only contains the word-level structure and the CRF.
104 | ```train_wc.py``` and ```eval_wc.py``` are scripts for LM-LSTM-CRF, while ```train_w.py``` and ```eval_w.py``` are scripts for LSTM-CRF.
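All four scripts consume the column format described in the [Data](#data) section above. For illustration only, here is a minimal sketch of reading that format into word and label sequences; the helper name ```read_column_corpus``` is hypothetical and not part of this repository (the scripts themselves rely on ```model.utils.read_corpus```):
```
def read_column_corpus(path):
    """Split a CoNLL-style file into (words, labels) pairs, one pair per sentence.

    Assumes the word is the first whitespace-separated field, the label is the
    last one, sentences are separated by empty lines, and '-DOCSTART-' lines
    mark document boundaries (they are skipped here).
    """
    sentences, words, labels = [], [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('-DOCSTART-'):
                if words:
                    sentences.append((words, labels))
                    words, labels = [], []
                continue
            fields = line.split()
            words.append(fields[0])
            labels.append(fields[-1])
    if words:
        sentences.append((words, labels))
    return sentences
```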
105 | The usage of these scripts can be displayed with the ```-h``` parameter, i.e.,
106 | ```
107 | python train_wc.py -h
108 | python train_w.py -h
109 | python eval_wc.py -h
110 | python eval_w.py -h
111 | ```
112 | 
113 | The default running commands for NER, POS tagging, and NP chunking are:
114 | 
115 | - Named Entity Recognition (NER):
116 | ```
117 | python train_wc.py --train_file ./data/ner/train.txt --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt --checkpoint ./checkpoint/ner_ --caseless --fine_tune --high_way --co_train --least_iters 100
118 | ```
119 | 
120 | - Part-of-Speech (POS) Tagging:
121 | ```
122 | python train_wc.py --train_file ./data/pos/train.txt --dev_file ./data/pos/testa.txt --test_file ./data/pos/testb.txt --eva_matrix a --checkpoint ./checkpoint/pos_ --caseless --fine_tune --high_way --co_train
123 | ```
124 | 
125 | - Noun Phrase (NP) Chunking:
126 | ```
127 | python train_wc.py --train_file ./data/np/train.txt.iobes --dev_file ./data/np/testa.txt.iobes --test_file ./data/np/testb.txt.iobes --checkpoint ./checkpoint/np_ --caseless --fine_tune --high_way --co_train --least_iters 100
128 | ```
129 | 
130 | For other datasets or tasks, you may want to try different stopping parameters; in particular, for smaller datasets you may want to set ```least_iters``` to a larger value, and for some tasks, if the loss decreases too slowly, you may want to increase ```lr```.
131 | 
132 | ## Benchmarks
133 | 
134 | Here we compare LM-LSTM-CRF with recent state-of-the-art models on the CoNLL 2000 Chunking dataset, the CoNLL 2003 NER dataset, and the WSJ portion of the PTB POS Tagging dataset. All experiments are conducted on a GTX 1080 GPU.
135 | 
136 | A serious bug was found in the ```bioes_to_span``` function of the original implementation; please treat the following numbers as the accurate performance.
137 | 
138 | ### NER
139 | 
140 | When models are only trained on the CoNLL 2003 NER dataset, the results are summarized as below.
141 | 
142 | |Model | Max(F1) | Mean(F1) | Std(F1) | Time(h) |
143 | | ------------- |-------------| -----| -----| ---- |
144 | | LM-LSTM-CRF | **91.35** | **91.24** | 0.12 | 4 |
145 | | -- HighWay | 90.87 | 90.79 | 0.07 | 4 |
146 | | -- Co-Train | 91.23 | 90.95 | 0.34 | 2 |
147 | 
148 | ### POS
149 | 
150 | When models are only trained on the WSJ portion of the PTB POS Tagging dataset, the results are summarized as below.
151 | 
152 | |Model | Max(Acc) | Mean(Acc) | Std(Acc) | Reported(Acc) | Time(h) |
153 | | ------------- |-------------| -----| -----| -----| ---- |
154 | | [Lample et al. 2016](https://github.com/glample/tagger) | 97.51 | 97.35 | 0.09 | N/A | 37 |
155 | | [Ma et al. 2016](https://github.com/XuezheMax/LasagneNLP) | 97.46 | 97.42 | 0.04 | 97.55 | 21 |
156 | | LM-LSTM-CRF | **97.59** | **97.53** | 0.03 | | 16 |
157 | 
158 | ## Pretrained Model
159 | 
160 | ### Evaluation
161 | 
162 | We released pre-trained models for these three tasks. The checkpoint files can be downloaded at the following links.
Notice that the NER model and the Chunking model (coming soon) are trained on both the training set and the development set:
163 | 
164 | | WSJ-PTB POS Tagging | CoNLL03 NER |
165 | | ------------------- | ------------------- |
166 | | [Args](https://drive.google.com/file/d/0B587SdKqutQmYmpiNFp6b1hKWEE/view?usp=sharing) | [Args](https://drive.google.com/file/d/1tGAQ0hu9AsIBdrqFn5fmDQ72Pk1I-o74/view?usp=sharing) |
167 | | [Model](https://drive.google.com/file/d/0B587SdKqutQmNnR3Nnk1WHdIMG8/view?usp=sharing) | [Model](https://drive.google.com/file/d/1o9kjZV5EcHAhys3GPgl7EPGE5fuXyYjr/view?usp=sharing) |
168 | 
169 | Also, ```eval_wc.py``` is provided to load and run these checkpoints. Its usage can be displayed with the command ```python eval_wc.py -h```, and a running command example is provided below:
170 | ```
171 | python eval_wc.py --load_arg checkpoint/ner/ner_4_cwlm_lstm_crf.json --load_check_point checkpoint/ner_ner_4_cwlm_lstm_crf.model --gpu 0 --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt
172 | ```
173 | 
174 | ### Prediction
175 | 
176 | To annotate raw text, ```seq_wc.py``` is provided. Its usage can be displayed with the command ```python seq_wc.py -h```, and a running command example is provided below:
177 | ```
178 | python seq_wc.py --load_arg checkpoint/ner/ner_4_cwlm_lstm_crf.json --load_check_point checkpoint/ner_ner_4_cwlm_lstm_crf.model --gpu 0 --input_file ./data/ner2003/test.txt --output_file output.txt
179 | ```
180 | 
181 | The input format is similar to CoNLL, but each line is required to contain only one field, the token. For example, an input file could be:
182 | 
183 | ```
184 | -DOCSTART-
185 | 
186 | But
187 | China
188 | saw
189 | their
190 | luck
191 | desert
192 | them
193 | in
194 | the
195 | second
196 | match
197 | of
198 | the
199 | group
200 | ,
201 | crashing
202 | to
203 | a
204 | surprise
205 | 2-0
206 | defeat
207 | to
208 | newcomers
209 | Uzbekistan
210 | .
211 | ```
212 | and the corresponding output is:
213 | 
214 | ```
215 | -DOCSTART- -DOCSTART- -DOCSTART-
216 | 
217 | But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan .
218 | 
219 | ```
220 | 
221 | ## Reference
222 | 
223 | ```
224 | @inproceedings{2017arXiv170904109L,
225 |   title = "{Empower Sequence Labeling with Task-Aware Neural Language Model}",
226 |   author = {{Liu}, L. and {Shang}, J. and {Xu}, F. and {Ren}, X. and {Gui}, H. and {Peng}, J. and {Han}, J.},
227 |   booktitle={AAAI},
228 |   year = 2018,
229 | }
230 | ```
231 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python -msphinx
 7 | SPHINXPROJ    = LM-LSTM-CRF
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
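# For example, "make html" builds the HTML documentation into "$(BUILDDIR)/html".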
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | 23 | -------------------------------------------------------------------------------- /docs/framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiyuanLucasLiu/LM-LSTM-CRF/b03ecf37799dee8f899783e7c475698d29288bc6/docs/framework.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # LM-LSTM-CRF documentation build configuration file, created by 5 | # sphinx-quickstart on Thu Sep 14 03:49:01 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath('../..')) 24 | 25 | # -- General configuration ------------------------------------------------ 26 | 27 | # If your documentation needs a minimal Sphinx version, state it here. 28 | # 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | 'sphinx.ext.autodoc', 36 | 'sphinx.ext.autosummary', 37 | 'sphinx.ext.doctest', 38 | 'sphinx.ext.intersphinx', 39 | 'sphinx.ext.todo', 40 | 'sphinx.ext.coverage', 41 | 'sphinx.ext.mathjax', 42 | 'sphinx.ext.napoleon', 43 | 'sphinx.ext.viewcode', 44 | ] 45 | 46 | napoleon_use_ivar = True 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ['_templates'] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | # 54 | # source_suffix = ['.rst', '.md'] 55 | source_suffix = '.rst' 56 | 57 | # The master toctree document. 58 | master_doc = 'index' 59 | 60 | # General information about the project. 61 | project = 'LM-LSTM-CRF' 62 | copyright = '2017, Liyuan Liu, Frank Xu, Jingbo Shang' 63 | author = 'Liyuan Liu, Frank Xu, Jingbo Shang' 64 | 65 | # The version info for the project you're documenting, acts as replacement for 66 | # |version| and |release|, also used in various other places throughout the 67 | # built documents. 68 | # 69 | # The short X.Y version. 70 | version = '' 71 | # The full version, including alpha/beta/rc tags. 72 | release = '' 73 | 74 | # The language for content autogenerated by Sphinx. Refer to documentation 75 | # for a list of supported languages. 76 | # 77 | # This is also used if you do content translation via gettext catalogs. 78 | # Usually you set "language" from the command line for these cases. 79 | language = None 80 | 81 | # List of patterns, relative to source directory, that match files and 82 | # directories to ignore when looking for source files. 
83 | # This patterns also effect to html_static_path and html_extra_path 84 | exclude_patterns = [] 85 | 86 | # The name of the Pygments (syntax highlighting) style to use. 87 | pygments_style = 'sphinx' 88 | 89 | # If true, `todo` and `todoList` produce output, else they produce nothing. 90 | todo_include_todos = False 91 | 92 | # -- Options for HTML output ---------------------------------------------- 93 | 94 | # The theme to use for HTML and HTML Help pages. See the documentation for 95 | # a list of builtin themes. 96 | # 97 | html_theme = 'sphinx_rtd_theme' 98 | 99 | # Theme options are theme-specific and customize the look and feel of a theme 100 | # further. For a list of options available for each theme, see the 101 | # documentation. 102 | # 103 | # html_theme_options = {} 104 | html_theme_options = { 105 | 'collapse_navigation': False, 106 | 'display_version': True, 107 | } 108 | 109 | # Add any paths that contain custom static files (such as style sheets) here, 110 | # relative to this directory. They are copied after the builtin static files, 111 | # so a file named "default.css" will overwrite the builtin "default.css". 112 | html_static_path = ['_static'] 113 | 114 | # Custom sidebar templates, must be a dictionary that maps document names 115 | # to template names. 116 | # 117 | # This is required for the alabaster theme 118 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 119 | html_sidebars = { 120 | '**': [ 121 | 'about.html', 122 | 'navigation.html', 123 | 'relations.html', # needs 'show_related': True theme option to display 124 | 'searchbox.html', 125 | 'donate.html', 126 | ] 127 | } 128 | 129 | # -- Options for HTMLHelp output ------------------------------------------ 130 | 131 | # Output file base name for HTML help builder. 132 | htmlhelp_basename = 'LM-LSTM-CRFdoc' 133 | 134 | # -- Options for LaTeX output --------------------------------------------- 135 | 136 | latex_elements = { 137 | # The paper size ('letterpaper' or 'a4paper'). 138 | # 139 | # 'papersize': 'letterpaper', 140 | 141 | # The font size ('10pt', '11pt' or '12pt'). 142 | # 143 | # 'pointsize': '10pt', 144 | 145 | # Additional stuff for the LaTeX preamble. 146 | # 147 | # 'preamble': '', 148 | 149 | # Latex figure (float) alignment 150 | # 151 | # 'figure_align': 'htbp', 152 | } 153 | 154 | # Grouping the document tree into LaTeX files. List of tuples 155 | # (source start file, target name, title, 156 | # author, documentclass [howto, manual, or own class]). 157 | latex_documents = [ 158 | (master_doc, 'LM-LSTM-CRF.tex', 'LM-LSTM-CRF Documentation', 159 | 'Liyuan Liu, Frank Xu, Jingbo Shang', 'manual'), 160 | ] 161 | 162 | # -- Options for manual page output --------------------------------------- 163 | 164 | # One entry per manual page. List of tuples 165 | # (source start file, name, description, authors, manual section). 166 | man_pages = [ 167 | (master_doc, 'lm-lstm-crf', 'LM-LSTM-CRF Documentation', 168 | [author], 1) 169 | ] 170 | 171 | # -- Options for Texinfo output ------------------------------------------- 172 | 173 | # Grouping the document tree into Texinfo files. 
List of tuples 174 | # (source start file, target name, title, author, 175 | # dir menu entry, description, category) 176 | texinfo_documents = [ 177 | (master_doc, 'LM-LSTM-CRF', 'LM-LSTM-CRF Documentation', 178 | author, 'LM-LSTM-CRF', 'One line description of project.', 179 | 'Miscellaneous'), 180 | ] 181 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. LM-LSTM-CRF documentation master file, created by 2 | sphinx-quickstart on Thu Sep 14 03:49:01 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :github_url: https://github.com/LiyuanLucasLiu/LM-LSTM-CRF 7 | 8 | LM-LSTM-CRF documentation 9 | ========================= 10 | 11 | **Check Our New NER Toolkit🚀🚀🚀** 12 | 13 | - **Inference**: 14 | 15 | - `LightNER `_: inference w. models pre-trained / trained w. *any* following tools, *efficiently*. 16 | 17 | - **Training**: 18 | 19 | - `LD-Net `_: train NER models w. efficient contextualized representations. 20 | - `VanillaNER `_: train vanilla NER models w. pre-trained embedding. 21 | 22 | - **Distant Training**: 23 | 24 | - `AutoNER `_: train NER models w.o. line-by-line annotations and get competitive performance. 25 | 26 | -------------------------- 27 | 28 | This project provides high-performance character-aware sequence labeling tools, including [Training](#usage), [Evaluation](#evaluation) and [Prediction](#prediction). 29 | 30 | Details about LM-LSTM-CRF can be accessed `here `_, and the implementation is based on the PyTorch library. 31 | 32 | .. toctree:: 33 | :glob: 34 | :maxdepth: 1 35 | :caption: Notes 36 | 37 | notes/* 38 | 39 | .. toctree:: 40 | :maxdepth: 4 41 | :caption: Package Reference 42 | 43 | model 44 | 45 | 46 | Indices and tables 47 | ================== 48 | 49 | * :ref:`genindex` 50 | * :ref:`modindex` 51 | * :ref:`search` 52 | -------------------------------------------------------------------------------- /docs/source/model.rst: -------------------------------------------------------------------------------- 1 | model package 2 | ============= 3 | 4 | Submodules 5 | ---------- 6 | 7 | model\.crf module 8 | ----------------- 9 | 10 | .. automodule:: model.crf 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | model\.evaluator module 16 | ----------------------- 17 | 18 | .. automodule:: model.evaluator 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | model\.highway module 24 | --------------------- 25 | 26 | .. automodule:: model.highway 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | model\.lm\_lstm\_crf module 32 | --------------------------- 33 | 34 | .. automodule:: model.lm_lstm_crf 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | model\.lstm\_crf module 40 | ----------------------- 41 | 42 | .. automodule:: model.lstm_crf 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | model\.ner\_dataset module 48 | -------------------------- 49 | 50 | .. automodule:: model.ner_dataset 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | model\.utils module 56 | ------------------- 57 | 58 | .. automodule:: model.utils 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | 64 | Module contents 65 | --------------- 66 | 67 | .. 
automodule:: model 68 | :members: 69 | :undoc-members: 70 | :show-inheritance: 71 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | model 2 | ===== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | model 8 | -------------------------------------------------------------------------------- /eval_w.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | import datetime 4 | import time 5 | import torch 6 | import torch.autograd as autograd 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | import codecs 10 | from model.crf import * 11 | from model.lstm_crf import * 12 | import model.utils as utils 13 | from model.evaluator import eval_w 14 | 15 | import argparse 16 | import json 17 | import os 18 | import sys 19 | from tqdm import tqdm 20 | import itertools 21 | import functools 22 | 23 | if __name__ == "__main__": 24 | parser = argparse.ArgumentParser(description='Evaluating BLSTM-CRF') 25 | parser.add_argument('--load_arg', default='./checkpoint/soa/check_wc_p_char_lstm_crf.json', help='arg json file path') 26 | parser.add_argument('--load_check_point', default='./checkpoint/soa/check_wc_p_char_lstm_crf.model', help='checkpoint path') 27 | parser.add_argument('--gpu',type=int, default=0, help='gpu id') 28 | parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or accuracy alone') 29 | parser.add_argument('--test_file', default='', help='path to test file, if set to none, would use test_file path in the checkpoint file') 30 | args = parser.parse_args() 31 | 32 | with open(args.load_arg, 'r') as f: 33 | jd = json.load(f) 34 | jd = jd['args'] 35 | 36 | checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage) 37 | f_map = checkpoint_file['f_map'] 38 | l_map = checkpoint_file['l_map'] 39 | if args.gpu >= 0: 40 | torch.cuda.set_device(args.gpu) 41 | 42 | 43 | # load corpus 44 | 45 | if args.test_file: 46 | with codecs.open(args.test_file, 'r', 'utf-8') as f: 47 | test_lines = f.readlines() 48 | else: 49 | with codecs.open(jd['test_file'], 'r', 'utf-8') as f: 50 | test_lines = f.readlines() 51 | 52 | # converting format 53 | 54 | test_features, test_labels = utils.read_corpus(test_lines) 55 | 56 | # construct dataset 57 | test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, jd['caseless']) 58 | 59 | test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset] 60 | 61 | # build model 62 | ner_model = LSTM_CRF(len(f_map), len(l_map), jd['embedding_dim'], jd['hidden'], jd['layers'], jd['drop_out'], large_CRF=jd['small_crf']) 63 | 64 | ner_model.load_state_dict(checkpoint_file['state_dict']) 65 | 66 | if args.gpu >= 0: 67 | if_cuda = True 68 | torch.cuda.set_device(args.gpu) 69 | ner_model.cuda() 70 | packer = CRFRepack(len(l_map), True) 71 | else: 72 | if_cuda = False 73 | packer = CRFRepack(len(l_map), False) 74 | 75 | evaluator = eval_w(packer, l_map, args.eva_matrix) 76 | 77 | if 'f' in args.eva_matrix: 78 | 79 | test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(ner_model, test_dataset_loader) 80 | 81 | print(jd['checkpoint'] + ' test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (test_f1, test_rec, test_pre, test_acc)) 82 | 83 | else: 84 | 85 | test_acc = 
evaluator.calc_score(ner_model, test_dataset_loader) 86 | 87 | print(jd['checkpoint'] + ' test_acc: %.4f\n' % (test_acc)) 88 | -------------------------------------------------------------------------------- /eval_wc.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import print_function 3 | import datetime 4 | import time 5 | import torch 6 | import torch.autograd as autograd 7 | import torch.nn as nn 8 | import torch.optim as optim 9 | import codecs 10 | from model.crf import * 11 | from model.lm_lstm_crf import * 12 | import model.utils as utils 13 | from model.evaluator import eval_wc 14 | 15 | import argparse 16 | import json 17 | import os 18 | import sys 19 | from tqdm import tqdm 20 | import itertools 21 | import functools 22 | 23 | if __name__ == "__main__": 24 | parser = argparse.ArgumentParser(description='Evaluating LM-BLSTM-CRF') 25 | parser.add_argument('--load_arg', default='./checkpoint/soa/check_wc_p_char_lstm_crf.json', help='path to arg json') 26 | parser.add_argument('--load_check_point', default='./checkpoint/soa/check_wc_p_char_lstm_crf.model', help='path to model checkpoint file') 27 | parser.add_argument('--gpu',type=int, default=0, help='gpu id') 28 | parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or f1 alone') 29 | parser.add_argument('--test_file', default='', help='path to test file, if set to none, would use test_file path in the checkpoint file') 30 | args = parser.parse_args() 31 | 32 | with open(args.load_arg, 'r') as f: 33 | jd = json.load(f) 34 | jd = jd['args'] 35 | 36 | checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage) 37 | f_map = checkpoint_file['f_map'] 38 | l_map = checkpoint_file['l_map'] 39 | c_map = checkpoint_file['c_map'] 40 | in_doc_words = checkpoint_file['in_doc_words'] 41 | if args.gpu >= 0: 42 | torch.cuda.set_device(args.gpu) 43 | 44 | 45 | # load corpus 46 | if args.test_file: 47 | with codecs.open(args.test_file, 'r', 'utf-8') as f: 48 | test_lines = f.readlines() 49 | else: 50 | with codecs.open(jd['test_file'], 'r', 'utf-8') as f: 51 | test_lines = f.readlines() 52 | 53 | # converting format 54 | 55 | test_features, test_labels = utils.read_corpus(test_lines) 56 | 57 | # construct dataset 58 | test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, jd['caseless']) 59 | 60 | test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset] 61 | 62 | # build model 63 | ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'], jd['char_hidden'], jd['char_layers'], jd['word_dim'], jd['word_hidden'], jd['word_layers'], len(f_map), jd['drop_out'], large_CRF=jd['small_crf'], if_highway=jd['high_way'], in_doc_words=in_doc_words, highway_layers = jd['highway_layers']) 64 | 65 | ner_model.load_state_dict(checkpoint_file['state_dict']) 66 | 67 | if args.gpu >= 0: 68 | if_cuda = True 69 | torch.cuda.set_device(args.gpu) 70 | ner_model.cuda() 71 | packer = CRFRepack_WC(len(l_map), True) 72 | else: 73 | if_cuda = False 74 | packer = CRFRepack_WC(len(l_map), False) 75 | 76 | evaluator = eval_wc(packer, l_map, args.eva_matrix) 77 | 78 | print('start') 79 | if 'f' in args.eva_matrix: 80 | 81 | result = evaluator.calc_score(ner_model, test_dataset_loader) 82 | for label, (test_f1, test_pre, test_rec, test_acc, msg) in result.items(): 83 | print(jd['checkpoint'] +' : %s : test_f1: 
%.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f | %s\n' % (label, test_f1, test_rec, test_pre, test_acc, msg)) 84 | 85 | else: 86 | 87 | test_acc = evaluator.calc_score(ner_model, test_dataset_loader) 88 | 89 | print(jd['checkpoint'] + ' test_acc: %.4f\n' % (test_acc)) 90 | print('end') 91 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "Liyuan Liu and Frank Xu" 2 | __credits__ = ["Liyuan Liu", "Frank Xu", "Jingbo Shang"] 3 | 4 | __license__ = "Apache License 2.0" 5 | __maintainer__ = "Liyuan Liu" 6 | __email__ = "llychinalz@gmail.com" -------------------------------------------------------------------------------- /model/crf.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: crf 3 | :synopsis: conditional random field 4 | 5 | .. moduleauthor:: Liyuan Liu 6 | """ 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.sparse as sparse 13 | import model.utils as utils 14 | 15 | 16 | class CRF_L(nn.Module): 17 | """Conditional Random Field (CRF) layer. This version is used in Ma et al. 2016, has more parameters than CRF_S 18 | 19 | args: 20 | hidden_dim : input dim size 21 | tagset_size: target_set_size 22 | if_biase: whether allow bias in linear trans 23 | """ 24 | 25 | 26 | def __init__(self, hidden_dim, tagset_size, if_bias=True): 27 | super(CRF_L, self).__init__() 28 | self.tagset_size = tagset_size 29 | self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size * self.tagset_size, bias=if_bias) 30 | 31 | def rand_init(self): 32 | """random initialization 33 | """ 34 | utils.init_linear(self.hidden2tag) 35 | 36 | def forward(self, feats): 37 | """ 38 | args: 39 | feats (batch_size, seq_len, hidden_dim) : input score from previous layers 40 | return: 41 | output from crf layer (batch_size, seq_len, tag_size, tag_size) 42 | """ 43 | return self.hidden2tag(feats).view(-1, self.tagset_size, self.tagset_size) 44 | 45 | 46 | class CRF_S(nn.Module): 47 | """Conditional Random Field (CRF) layer. This version is used in Lample et al. 2016, has less parameters than CRF_L. 
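    Instead of predicting a full tagset_size x tagset_size score matrix from the hidden state of every token (as CRF_L does), this version predicts only per-tag emission scores with a single linear layer and adds one shared transition matrix, which is why it has fewer parameters.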
48 | 49 | args: 50 | hidden_dim: input dim size 51 | tagset_size: target_set_size 52 | if_biase: whether allow bias in linear trans 53 | 54 | """ 55 | 56 | def __init__(self, hidden_dim, tagset_size, if_bias=True): 57 | super(CRF_S, self).__init__() 58 | self.tagset_size = tagset_size 59 | self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size, bias=if_bias) 60 | self.transitions = nn.Parameter(torch.Tensor(self.tagset_size, self.tagset_size)) 61 | 62 | def rand_init(self): 63 | """random initialization 64 | """ 65 | utils.init_linear(self.hidden2tag) 66 | self.transitions.data.zero_() 67 | 68 | def forward(self, feats): 69 | """ 70 | args: 71 | feats (batch_size, seq_len, hidden_dim) : input score from previous layers 72 | return: 73 | output from crf layer ( (batch_size * seq_len), tag_size, tag_size) 74 | """ 75 | 76 | scores = self.hidden2tag(feats).view(-1, self.tagset_size, 1) 77 | ins_num = scores.size(0) 78 | crf_scores = scores.expand(ins_num, self.tagset_size, self.tagset_size) + self.transitions.view(1, self.tagset_size, self.tagset_size).expand(ins_num, self.tagset_size, self.tagset_size) 79 | 80 | return crf_scores 81 | 82 | class CRFRepack: 83 | """Packer for word level model 84 | 85 | args: 86 | tagset_size: target_set_size 87 | if_cuda: whether use GPU 88 | """ 89 | 90 | def __init__(self, tagset_size, if_cuda): 91 | 92 | self.tagset_size = tagset_size 93 | self.if_cuda = if_cuda 94 | 95 | def repack_vb(self, feature, target, mask): 96 | """packer for viterbi loss 97 | 98 | args: 99 | feature (Seq_len, Batch_size): input feature 100 | target (Seq_len, Batch_size): output target 101 | mask (Seq_len, Batch_size): padding mask 102 | return: 103 | feature (Seq_len, Batch_size), target (Seq_len, Batch_size), mask (Seq_len, Batch_size) 104 | """ 105 | 106 | if self.if_cuda: 107 | fea_v = feature.transpose(0, 1).cuda() 108 | tg_v = target.transpose(0, 1).unsqueeze(2).cuda() 109 | mask_v = mask.transpose(0, 1).cuda() 110 | else: 111 | fea_v = feature.transpose(0, 1) 112 | tg_v = target.transpose(0, 1).contiguous().unsqueeze(2) 113 | mask_v = mask.transpose(0, 1).contiguous() 114 | return fea_v, tg_v, mask_v 115 | 116 | def repack_gd(self, feature, target, current): 117 | """packer for greedy loss 118 | 119 | args: 120 | feature (Seq_len, Batch_size): input feature 121 | target (Seq_len, Batch_size): output target 122 | current (Seq_len, Batch_size): current state 123 | return: 124 | feature (Seq_len, Batch_size), target (Seq_len * Batch_size), current (Seq_len * Batch_size, 1, 1) 125 | """ 126 | if self.if_cuda: 127 | fea_v = feature.transpose(0, 1).cuda() 128 | ts_v = target.transpose(0, 1).cuda().view(-1) 129 | cs_v = current.transpose(0, 1).cuda().view(-1, 1, 1) 130 | else: 131 | fea_v = feature.transpose(0, 1) 132 | ts_v = target.transpose(0, 1).contiguous().view(-1) 133 | cs_v = current.transpose(0, 1).contiguous().view(-1, 1, 1) 134 | return fea_v, ts_v, cs_v 135 | 136 | def convert_for_eval(self, target): 137 | """convert target to original decoding 138 | 139 | args: 140 | target: input labels used in training 141 | return: 142 | output labels used in test 143 | """ 144 | return target % self.tagset_size 145 | 146 | 147 | class CRFRepack_WC: 148 | """Packer for model with char-level and word-level 149 | 150 | args: 151 | tagset_size: target_set_size 152 | if_cuda: whether use GPU 153 | 154 | """ 155 | 156 | def __init__(self, tagset_size, if_cuda): 157 | 158 | self.tagset_size = tagset_size 159 | self.if_cuda = if_cuda 160 | 161 | def repack_vb(self, fc_feature, 
fc_position, bc_feature, bc_position, word_feature, target, mask, batch_len): 162 | """packer for viterbi loss 163 | 164 | args: 165 | fc_feature (Char_Seq_len, Batch_size) : forward_char input feature 166 | fc_position (Word_Seq_len, Batch_size) : forward_char input position 167 | bc_feature (Char_Seq_len, Batch_size) : backward_char input feature 168 | bc_position (Word_Seq_len, Batch_size) : backward_char input position 169 | word_feature (Word_Seq_len, Batch_size) : input word feature 170 | target (Seq_len, Batch_size) : output target 171 | mask (Word_Seq_len, Batch_size) : padding mask 172 | batch_len (Batch_size, 2) : length of instances in one batch 173 | return: 174 | f_f (Char_Reduced_Seq_len, Batch_size), f_p (Word_Reduced_Seq_len, Batch_size), b_f (Char_Reduced_Seq_len, Batch_size), b_p (Word_Reduced_Seq_len, Batch_size), w_f (size Word_Seq_Len, Batch_size), target (Reduced_Seq_len, Batch_size), mask (Word_Reduced_Seq_len, Batch_size) 175 | 176 | """ 177 | mlen, _ = batch_len.max(0) 178 | mlen = mlen.squeeze() 179 | ocl = bc_feature.size(1) 180 | if self.if_cuda: 181 | fc_feature = fc_feature[:, 0:mlen[0]].transpose(0, 1).cuda() 182 | fc_position = fc_position[:, 0:mlen[1]].transpose(0, 1).cuda() 183 | bc_feature = bc_feature[:, -mlen[0]:].transpose(0, 1).cuda() 184 | bc_position = (bc_position[:, 0:mlen[1]] - ocl + mlen[0]).transpose(0, 1).cuda() 185 | word_feature = word_feature[:, 0:mlen[1]].transpose(0, 1).cuda() 186 | tg_v = target[:, 0:mlen[1]].transpose(0, 1).unsqueeze(2).cuda() 187 | mask_v = mask[:, 0:mlen[1]].transpose(0, 1).cuda() 188 | else: 189 | fc_feature = fc_feature[:, 0:mlen[0]].transpose(0, 1) 190 | fc_position = fc_position[:, 0:mlen[1]].transpose(0, 1) 191 | bc_feature = bc_feature[:, -mlen[0]:].transpose(0, 1) 192 | bc_position = (bc_position[:, 0:mlen[1]] - ocl + mlen[0]).transpose(0, 1) 193 | word_feature = word_feature[:, 0:mlen[1]].transpose(0, 1) 194 | tg_v = target[:, 0:mlen[1]].transpose(0, 1).unsqueeze(2) 195 | mask_v = mask[:, 0:mlen[1]].transpose(0, 1).contiguous() 196 | return fc_feature, fc_position, bc_feature, bc_position, word_feature, tg_v, mask_v 197 | 198 | def convert_for_eval(self, target): 199 | """convert for eval 200 | 201 | args: 202 | target: input labels used in training 203 | return: 204 | output labels used in test 205 | """ 206 | return target % self.tagset_size 207 | 208 | 209 | class CRFLoss_gd(nn.Module): 210 | """loss for greedy decode loss, i.e., although its for CRF Layer, we calculate the loss as 211 | 212 | .. math:: 213 | \sum_{j=1}^n \log (p(\hat{y}_{j+1}|z_{j+1}, \hat{y}_{j})) 214 | 215 | instead of 216 | 217 | .. 
math:: 218 | \sum_{j=1}^n \log (\phi(\hat{y}_{j-1}, \hat{y}_j, \mathbf{z}_j)) - \log (\sum_{\mathbf{y}' \in \mathbf{Y}(\mathbf{Z})} \prod_{j=1}^n \phi(y'_{j-1}, y'_j, \mathbf{z}_j) ) 219 | 220 | args: 221 | tagset_size: target_set_size 222 | start_tag: ind for 223 | end_tag: ind for 224 | average_batch: whether average the loss among batch 225 | 226 | """ 227 | 228 | def __init__(self, tagset_size, start_tag, end_tag, average_batch=True): 229 | super(CRFLoss_gd, self).__init__() 230 | self.tagset_size = tagset_size 231 | self.average_batch = average_batch 232 | self.crit = nn.CrossEntropyLoss(size_average=self.average_batch) 233 | 234 | def forward(self, scores, target, current): 235 | """ 236 | args: 237 | scores (Word_Seq_len, Batch_size, target_size_from, target_size_to): crf scores 238 | target (Word_Seq_len, Batch_size): golden list 239 | current (Word_Seq_len, Batch_size): current state 240 | return: 241 | crf greedy loss 242 | """ 243 | ins_num = current.size(0) 244 | current = current.expand(ins_num, 1, self.tagset_size) 245 | scores = scores.view(ins_num, self.tagset_size, self.tagset_size) 246 | current_score = torch.gather(scores, 1, current).squeeze() 247 | return self.crit(current_score, target) 248 | 249 | 250 | class CRFLoss_vb(nn.Module): 251 | """loss for viterbi decode 252 | 253 | .. math:: 254 | \sum_{j=1}^n \log (\phi(\hat{y}_{j-1}, \hat{y}_j, \mathbf{z}_j)) - \log (\sum_{\mathbf{y}' \in \mathbf{Y}(\mathbf{Z})} \prod_{j=1}^n \phi(y'_{j-1}, y'_j, \mathbf{z}_j) ) 255 | 256 | args: 257 | tagset_size: target_set_size 258 | start_tag: ind for 259 | end_tag: ind for 260 | average_batch: whether average the loss among batch 261 | 262 | """ 263 | 264 | def __init__(self, tagset_size, start_tag, end_tag, average_batch=True): 265 | super(CRFLoss_vb, self).__init__() 266 | self.tagset_size = tagset_size 267 | self.start_tag = start_tag 268 | self.end_tag = end_tag 269 | self.average_batch = average_batch 270 | 271 | def forward(self, scores, target, mask): 272 | """ 273 | args: 274 | scores (seq_len, bat_size, target_size_from, target_size_to) : crf scores 275 | target (seq_len, bat_size, 1) : golden state 276 | mask (size seq_len, bat_size) : mask for padding 277 | return: 278 | loss 279 | """ 280 | 281 | # calculate batch size and seq len 282 | seq_len = scores.size(0) 283 | bat_size = scores.size(1) 284 | 285 | # calculate sentence score 286 | tg_energy = torch.gather(scores.view(seq_len, bat_size, -1), 2, target).view(seq_len, bat_size) # seq_len * bat_size 287 | tg_energy = tg_energy.masked_select(mask).sum() 288 | 289 | # calculate forward partition score 290 | 291 | # build iter 292 | seq_iter = enumerate(scores) 293 | # the first score should start with 294 | _, inivalues = seq_iter.__next__() # bat_size * from_target_size * to_target_size 295 | # only need start from start_tag 296 | partition = inivalues[:, self.start_tag, :].clone() # bat_size * to_target_size 297 | # iter over last scores 298 | for idx, cur_values in seq_iter: 299 | # previous to_target is current from_target 300 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 301 | # cur_values: bat_size * from_target * to_target 302 | cur_values = cur_values + partition.contiguous().view(bat_size, self.tagset_size, 1).expand(bat_size, self.tagset_size, self.tagset_size) 303 | cur_partition = utils.log_sum_exp(cur_values, self.tagset_size) 304 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 305 | # partition = utils.switch(partition, cur_partition, 
mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 306 | mask_idx = mask[idx, :].view(bat_size, 1).expand(bat_size, self.tagset_size) 307 | partition.masked_scatter_(mask_idx, cur_partition.masked_select(mask_idx)) #0 for partition, 1 for cur_partition 308 | 309 | #only need end at end_tag 310 | partition = partition[:, self.end_tag].sum() 311 | # average = mask.sum() 312 | 313 | # average_batch 314 | if self.average_batch: 315 | loss = (partition - tg_energy) / bat_size 316 | else: 317 | loss = (partition - tg_energy) 318 | 319 | return loss 320 | 321 | class CRFDecode_vb(): 322 | """Batch-mode viterbi decode 323 | 324 | args: 325 | tagset_size: target_set_size 326 | start_tag: ind for 327 | end_tag: ind for 328 | average_batch: whether average the loss among batch 329 | 330 | """ 331 | 332 | def __init__(self, tagset_size, start_tag, end_tag, average_batch=True): 333 | self.tagset_size = tagset_size 334 | self.start_tag = start_tag 335 | self.end_tag = end_tag 336 | self.average_batch = average_batch 337 | 338 | def decode(self, scores, mask): 339 | """Find the optimal path with viterbe decode 340 | 341 | args: 342 | scores (size seq_len, bat_size, target_size_from, target_size_to) : crf scores 343 | mask (seq_len, bat_size) : mask for padding 344 | return: 345 | decoded sequence (size seq_len, bat_size) 346 | """ 347 | # calculate batch size and seq len 348 | 349 | seq_len = scores.size(0) 350 | bat_size = scores.size(1) 351 | 352 | mask = 1 - mask 353 | decode_idx = torch.LongTensor(seq_len-1, bat_size) 354 | 355 | # calculate forward score and checkpoint 356 | 357 | # build iter 358 | seq_iter = enumerate(scores) 359 | # the first score should start with 360 | _, inivalues = seq_iter.__next__() # bat_size * from_target_size * to_target_size 361 | # only need start from start_tag 362 | forscores = inivalues[:, self.start_tag, :] # bat_size * to_target_size 363 | back_points = list() 364 | # iter over last scores 365 | for idx, cur_values in seq_iter: 366 | # previous to_target is current from_target 367 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 368 | # cur_values: bat_size * from_target * to_target 369 | cur_values = cur_values + forscores.contiguous().view(bat_size, self.tagset_size, 1).expand(bat_size, self.tagset_size, self.tagset_size) 370 | 371 | forscores, cur_bp = torch.max(cur_values, 1) 372 | cur_bp.masked_fill_(mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size), self.end_tag) 373 | back_points.append(cur_bp) 374 | 375 | pointer = back_points[-1][:, self.end_tag] 376 | decode_idx[-1] = pointer 377 | for idx in range(len(back_points)-2, -1, -1): 378 | back_point = back_points[idx] 379 | index = pointer.contiguous().view(-1,1) 380 | pointer = torch.gather(back_point, 1, index).view(-1) 381 | decode_idx[idx] = pointer 382 | return decode_idx 383 | -------------------------------------------------------------------------------- /model/evaluator.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: evaluator 3 | :synopsis: evaluation method (f1 score and accuracy) 4 | 5 | .. 
moduleauthor:: Liyuan Liu, Frank Xu 6 | """ 7 | 8 | 9 | import torch 10 | import numpy as np 11 | import itertools 12 | 13 | import model.utils as utils 14 | from torch.autograd import Variable 15 | 16 | from model.crf import CRFDecode_vb 17 | 18 | class eval_batch: 19 | """Base class for evaluation, provide method to calculate f1 score and accuracy 20 | 21 | args: 22 | packer: provide method to convert target into original space [TODO: need to improve] 23 | l_map: dictionary for labels 24 | """ 25 | 26 | 27 | def __init__(self, packer, l_map): 28 | self.packer = packer 29 | self.l_map = l_map 30 | self.r_l_map = utils.revlut(l_map) 31 | self.totalp_counts={} 32 | self.truep_counts={} 33 | self.fn_counts={} 34 | self.fp_counts={} 35 | self.f1={} 36 | 37 | def reset(self): 38 | """ 39 | re-set all states 40 | """ 41 | self.correct_labels = 0 42 | self.total_labels = 0 43 | self.gold_count = 0 44 | self.guess_count = 0 45 | self.overlap_count = 0 46 | self.totalp_counts={} 47 | self.truep_counts={} 48 | self.fn_counts={} 49 | self.fp_counts={} 50 | self.f1={} 51 | 52 | def calc_f1_batch(self, decoded_data, target_data): 53 | """ 54 | update statics for f1 score 55 | 56 | args: 57 | decoded_data (batch_size, seq_len): prediction sequence 58 | target_data (batch_size, seq_len): ground-truth 59 | """ 60 | batch_decoded = torch.unbind(decoded_data, 1) 61 | batch_targets = torch.unbind(target_data, 0) 62 | 63 | for decoded, target in zip(batch_decoded, batch_targets): 64 | gold = self.packer.convert_for_eval(target) 65 | # remove padding 66 | length = utils.find_length_from_labels(gold, self.l_map) 67 | gold = gold[:length] 68 | best_path = decoded[:length] 69 | 70 | correct_labels_i, total_labels_i, gold_count_i, guess_count_i, overlap_count_i = self.eval_instance(best_path.numpy(), gold.numpy()) 71 | self.correct_labels += correct_labels_i 72 | self.total_labels += total_labels_i 73 | self.gold_count += gold_count_i 74 | self.guess_count += guess_count_i 75 | self.overlap_count += overlap_count_i 76 | 77 | def calc_acc_batch(self, decoded_data, target_data): 78 | """ 79 | update statics for accuracy 80 | 81 | args: 82 | decoded_data (batch_size, seq_len): prediction sequence 83 | target_data (batch_size, seq_len): ground-truth 84 | """ 85 | batch_decoded = torch.unbind(decoded_data, 1) 86 | batch_targets = torch.unbind(target_data, 0) 87 | 88 | for decoded, target in zip(batch_decoded, batch_targets): 89 | gold = self.packer.convert_for_eval(target) 90 | # remove padding 91 | length = utils.find_length_from_labels(gold, self.l_map) 92 | gold = gold[:length].numpy() 93 | best_path = decoded[:length].numpy() 94 | 95 | self.total_labels += length 96 | self.correct_labels += np.sum(np.equal(best_path, gold)) 97 | 98 | def f1_score(self): 99 | """ 100 | calculate f1 score based on statics 101 | """ 102 | if self.guess_count == 0: 103 | return {'total': (0.0, 0.0, 0.0, 0.0, '')} 104 | precision = self.overlap_count / float(self.guess_count) 105 | recall = self.overlap_count / float(self.gold_count) 106 | if precision == 0.0 or recall == 0.0: 107 | return {'total': (0.0, 0.0, 0.0, 0.0, '')} 108 | f = 2 * (precision * recall) / (precision + recall) 109 | accuracy = float(self.correct_labels) / self.total_labels 110 | message="" 111 | self.f1['total'] = (f, precision, recall, accuracy, message) 112 | for label in self.totalp_counts: 113 | tp = self.truep_counts.get(label,1) 114 | fn = sum(self.fn_counts.get(label,{}).values()) 115 | fp = sum(self.fp_counts.get(label,{}).values()) 116 | # print(label, 
str(tp), str(fp), str(fn), str(self.totalp_counts.get(label,0))) 117 | precision = tp / float(tp+fp+1e-9) 118 | recall = tp / float(tp+fn+1e-9) 119 | f = 2 * (precision * recall) / (precision + recall+1e-9) 120 | message = str(self.fn_counts.get(label, {})) 121 | self.f1[label] = (f, precision, recall, 0, message) 122 | return self.f1 123 | 124 | def acc_score(self): 125 | """ 126 | calculate accuracy score based on statics 127 | """ 128 | if 0 == self.total_labels: 129 | return 0.0 130 | accuracy = float(self.correct_labels) / self.total_labels 131 | return accuracy 132 | 133 | def eval_instance(self, best_path, gold): 134 | """ 135 | update statics for one instance 136 | 137 | args: 138 | best_path (seq_len): predicted 139 | gold (seq_len): ground-truth 140 | """ 141 | 142 | total_labels = len(best_path) 143 | correct_labels = np.sum(np.equal(best_path, gold)) 144 | for i in range(total_labels): 145 | gold_label = self.r_l_map[gold[i]] 146 | guessed_label = self.r_l_map[best_path[i]] 147 | self.totalp_counts[gold_label] = 1 + self.totalp_counts.get(gold_label,0) 148 | if gold_label == guessed_label: 149 | self.truep_counts[gold_label] = 1 + self.truep_counts.get(gold_label,0) 150 | else: 151 | val = self.fn_counts.get(gold_label,{}) 152 | val[guessed_label] = 1+ val.get(guessed_label,0) 153 | self.fn_counts[gold_label]=val 154 | 155 | val2 = self.fp_counts.get(guessed_label,{}) 156 | val2[gold_label] = 1+ val2.get(gold_label,0) 157 | self.fp_counts[guessed_label] = val2 158 | 159 | gold_chunks = utils.iobes_to_spans(gold, self.r_l_map) 160 | gold_count = len(gold_chunks) 161 | 162 | guess_chunks = utils.iobes_to_spans(best_path, self.r_l_map) 163 | guess_count = len(guess_chunks) 164 | 165 | overlap_chunks = gold_chunks & guess_chunks 166 | overlap_count = len(overlap_chunks) 167 | 168 | return correct_labels, total_labels, gold_count, guess_count, overlap_count 169 | 170 | class eval_w(eval_batch): 171 | """evaluation class for word level model (LSTM-CRF) 172 | 173 | args: 174 | packer: provide method to convert target into original space [TODO: need to improve] 175 | l_map: dictionary for labels 176 | score_type: use f1score with using 'f' 177 | 178 | """ 179 | 180 | def __init__(self, packer, l_map, score_type): 181 | eval_batch.__init__(self, packer, l_map) 182 | 183 | self.decoder = CRFDecode_vb(len(l_map), l_map[''], l_map['']) 184 | 185 | if 'f' in score_type: 186 | self.eval_b = self.calc_f1_batch 187 | self.calc_s = self.f1_score 188 | else: 189 | self.eval_b = self.calc_acc_batch 190 | self.calc_s = self.acc_score 191 | 192 | def calc_score(self, ner_model, dataset_loader): 193 | """ 194 | calculate score for pre-selected metrics 195 | 196 | args: 197 | ner_model: LSTM-CRF model 198 | dataset_loader: loader class for test set 199 | """ 200 | ner_model.eval() 201 | self.reset() 202 | 203 | for feature, tg, mask in itertools.chain.from_iterable(dataset_loader): 204 | fea_v, _, mask_v = self.packer.repack_vb(feature, tg, mask) 205 | scores, _ = ner_model(fea_v) 206 | decoded = self.decoder.decode(scores.data, mask_v.data) 207 | self.eval_b(decoded, tg) 208 | 209 | return self.calc_s() 210 | 211 | class eval_wc(eval_batch): 212 | """evaluation class for LM-LSTM-CRF 213 | 214 | args: 215 | packer: provide method to convert target into original space [TODO: need to improve] 216 | l_map: dictionary for labels 217 | score_type: use f1score with using 'f' 218 | 219 | """ 220 | 221 | def __init__(self, packer, l_map, score_type): 222 | eval_batch.__init__(self, packer, l_map) 223 | 224 | 
self.decoder = CRFDecode_vb(len(l_map), l_map[''], l_map['']) 225 | 226 | if 'f' in score_type: 227 | self.eval_b = self.calc_f1_batch 228 | self.calc_s = self.f1_score 229 | else: 230 | self.eval_b = self.calc_acc_batch 231 | self.calc_s = self.acc_score 232 | 233 | def calc_score(self, ner_model, dataset_loader): 234 | """ 235 | calculate score for pre-selected metrics 236 | 237 | args: 238 | ner_model: LM-LSTM-CRF model 239 | dataset_loader: loader class for test set 240 | """ 241 | ner_model.eval() 242 | self.reset() 243 | 244 | for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v in itertools.chain.from_iterable(dataset_loader): 245 | f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v) 246 | scores = ner_model(f_f, f_p, b_f, b_p, w_f) 247 | decoded = self.decoder.decode(scores.data, mask_v.data) 248 | self.eval_b(decoded, tg) 249 | 250 | return self.calc_s() 251 | -------------------------------------------------------------------------------- /model/highway.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: highway 3 | :synopsis: highway network 4 | 5 | .. moduleauthor:: Liyuan Liu 6 | """ 7 | 8 | import torch 9 | import torch.nn as nn 10 | import model.utils as utils 11 | 12 | class hw(nn.Module): 13 | """Highway layers 14 | 15 | args: 16 | size: input and output dimension 17 | dropout_ratio: dropout ratio 18 | """ 19 | 20 | def __init__(self, size, num_layers = 1, dropout_ratio = 0.5): 21 | super(hw, self).__init__() 22 | self.size = size 23 | self.num_layers = num_layers 24 | self.trans = nn.ModuleList() 25 | self.gate = nn.ModuleList() 26 | self.dropout = nn.Dropout(p=dropout_ratio) 27 | 28 | for i in range(num_layers): 29 | tmptrans = nn.Linear(size, size) 30 | tmpgate = nn.Linear(size, size) 31 | self.trans.append(tmptrans) 32 | self.gate.append(tmpgate) 33 | 34 | def rand_init(self): 35 | """ 36 | random initialization 37 | """ 38 | for i in range(self.num_layers): 39 | utils.init_linear(self.trans[i]) 40 | utils.init_linear(self.gate[i]) 41 | 42 | def forward(self, x): 43 | """ 44 | update statics for f1 score 45 | 46 | args: 47 | x (ins_num, hidden_dim): input tensor 48 | return: 49 | output tensor (ins_num, hidden_dim) 50 | """ 51 | 52 | 53 | g = nn.functional.sigmoid(self.gate[0](x)) 54 | h = nn.functional.relu(self.trans[0](x)) 55 | x = g * h + (1 - g) * x 56 | 57 | for i in range(1, self.num_layers): 58 | x = self.dropout(x) 59 | g = nn.functional.sigmoid(self.gate[i](x)) 60 | h = nn.functional.relu(self.trans[i](x)) 61 | x = g * h + (1 - g) * x 62 | 63 | return x -------------------------------------------------------------------------------- /model/lm_lstm_crf.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: lm_lstm_crf 3 | :synopsis: lm_lstm_crf 4 | 5 | .. 
moduleauthor:: Liyuan Liu 6 | """ 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import numpy as np 13 | import model.crf as crf 14 | import model.utils as utils 15 | import model.highway as highway 16 | 17 | class LM_LSTM_CRF(nn.Module): 18 | """LM_LSTM_CRF model 19 | 20 | args: 21 | tagset_size: size of label set 22 | char_size: size of char dictionary 23 | char_dim: size of char embedding 24 | char_hidden_dim: size of char-level lstm hidden dim 25 | char_rnn_layers: number of char-level lstm layers 26 | embedding_dim: size of word embedding 27 | word_hidden_dim: size of word-level blstm hidden dim 28 | word_rnn_layers: number of word-level lstm layers 29 | vocab_size: size of word dictionary 30 | dropout_ratio: dropout ratio 31 | large_CRF: use CRF_L or not, refer model.crf.CRF_L and model.crf.CRF_S for more details 32 | if_highway: use highway layers or not 33 | in_doc_words: number of words that occurred in the corpus (used for language model prediction) 34 | highway_layers: number of highway layers 35 | """ 36 | 37 | def __init__(self, tagset_size, char_size, char_dim, char_hidden_dim, char_rnn_layers, embedding_dim, word_hidden_dim, word_rnn_layers, vocab_size, dropout_ratio, large_CRF=True, if_highway = False, in_doc_words = 2, highway_layers = 1): 38 | 39 | super(LM_LSTM_CRF, self).__init__() 40 | self.char_dim = char_dim 41 | self.char_hidden_dim = char_hidden_dim 42 | self.char_size = char_size 43 | self.word_dim = embedding_dim 44 | self.word_hidden_dim = word_hidden_dim 45 | self.word_size = vocab_size 46 | self.if_highway = if_highway 47 | 48 | self.char_embeds = nn.Embedding(char_size, char_dim) 49 | self.forw_char_lstm = nn.LSTM(char_dim, char_hidden_dim, num_layers=char_rnn_layers, bidirectional=False, dropout=dropout_ratio) 50 | self.back_char_lstm = nn.LSTM(char_dim, char_hidden_dim, num_layers=char_rnn_layers, bidirectional=False, dropout=dropout_ratio) 51 | self.char_rnn_layers = char_rnn_layers 52 | 53 | self.word_embeds = nn.Embedding(vocab_size, embedding_dim) 54 | 55 | self.word_lstm = nn.LSTM(embedding_dim + char_hidden_dim * 2, word_hidden_dim // 2, num_layers=word_rnn_layers, bidirectional=True, dropout=dropout_ratio) 56 | 57 | self.word_rnn_layers = word_rnn_layers 58 | 59 | self.dropout = nn.Dropout(p=dropout_ratio) 60 | 61 | self.tagset_size = tagset_size 62 | if large_CRF: 63 | self.crf = crf.CRF_L(word_hidden_dim, tagset_size) 64 | else: 65 | self.crf = crf.CRF_S(word_hidden_dim, tagset_size) 66 | 67 | if if_highway: 68 | self.forw2char = highway.hw(char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio) 69 | self.back2char = highway.hw(char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio) 70 | self.forw2word = highway.hw(char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio) 71 | self.back2word = highway.hw(char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio) 72 | self.fb2char = highway.hw(2 * char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio) 73 | 74 | self.char_pre_train_out = nn.Linear(char_hidden_dim, char_size) 75 | self.word_pre_train_out = nn.Linear(char_hidden_dim, in_doc_words) 76 | 77 | self.batch_size = 1 78 | self.word_seq_length = 1 79 | 80 | def set_batch_size(self, bsize): 81 | """ 82 | set batch size 83 | """ 84 | self.batch_size = bsize 85 | 86 | def set_batch_seq_size(self, sentence): 87 | """ 88 | set batch size and sequence length 89 | """ 90 | tmp = sentence.size() 
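        # in LM_LSTM_CRF.forward this is called with forw_position, which is
        # (word_seq_len, batch_size): dim 0 counts words (not characters),
        # dim 1 is the batch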
91 | self.word_seq_length = tmp[0] 92 | self.batch_size = tmp[1] 93 | 94 | def rand_init_embedding(self): 95 | """ 96 | random initialize char-level embedding 97 | """ 98 | utils.init_embedding(self.char_embeds.weight) 99 | 100 | def load_pretrained_word_embedding(self, pre_word_embeddings): 101 | """ 102 | load pre-trained word embedding 103 | 104 | args: 105 | pre_word_embeddings (self.word_size, self.word_dim) : pre-trained embedding 106 | """ 107 | assert (pre_word_embeddings.size()[1] == self.word_dim) 108 | self.word_embeds.weight = nn.Parameter(pre_word_embeddings) 109 | 110 | def rand_init(self, init_char_embedding=True, init_word_embedding=False): 111 | """ 112 | random initialization 113 | 114 | args: 115 | init_char_embedding: random initialize char embedding or not 116 | init_word_embedding: random initialize word embedding or not 117 | """ 118 | 119 | if init_char_embedding: 120 | utils.init_embedding(self.char_embeds.weight) 121 | if init_word_embedding: 122 | utils.init_embedding(self.word_embeds.weight) 123 | if self.if_highway: 124 | self.forw2char.rand_init() 125 | self.back2char.rand_init() 126 | self.forw2word.rand_init() 127 | self.back2word.rand_init() 128 | self.fb2char.rand_init() 129 | utils.init_lstm(self.forw_char_lstm) 130 | utils.init_lstm(self.back_char_lstm) 131 | utils.init_lstm(self.word_lstm) 132 | utils.init_linear(self.char_pre_train_out) 133 | utils.init_linear(self.word_pre_train_out) 134 | self.crf.rand_init() 135 | 136 | def word_pre_train_forward(self, sentence, position, hidden=None): 137 | """ 138 | output of forward language model 139 | 140 | args: 141 | sentence (char_seq_len, batch_size): char-level representation of sentence 142 | position (word_seq_len, batch_size): position of blank space in char-level representation of sentence 143 | hidden: initial hidden state 144 | 145 | return: 146 | language model output (word_seq_len, in_doc_word), hidden 147 | """ 148 | 149 | embeds = self.char_embeds(sentence) 150 | d_embeds = self.dropout(embeds) 151 | lstm_out, hidden = self.forw_char_lstm(d_embeds) 152 | 153 | tmpsize = position.size() 154 | position = position.unsqueeze(2).expand(tmpsize[0], tmpsize[1], self.char_hidden_dim) 155 | select_lstm_out = torch.gather(lstm_out, 0, position) 156 | d_lstm_out = self.dropout(select_lstm_out).view(-1, self.char_hidden_dim) 157 | 158 | if self.if_highway: 159 | char_out = self.forw2word(d_lstm_out) 160 | d_char_out = self.dropout(char_out) 161 | else: 162 | d_char_out = d_lstm_out 163 | 164 | pre_score = self.word_pre_train_out(d_char_out) 165 | return pre_score, hidden 166 | 167 | def word_pre_train_backward(self, sentence, position, hidden=None): 168 | """ 169 | output of backward language model 170 | 171 | args: 172 | sentence (char_seq_len, batch_size): char-level representation of sentence (inverse order) 173 | position (word_seq_len, batch_size): position of blank space in inversed char-level representation of sentence 174 | hidden: initial hidden state 175 | 176 | return: 177 | language model output (word_seq_len, in_doc_word), hidden 178 | """ 179 | embeds = self.char_embeds(sentence) 180 | d_embeds = self.dropout(embeds) 181 | lstm_out, hidden = self.back_char_lstm(d_embeds) 182 | 183 | tmpsize = position.size() 184 | position = position.unsqueeze(2).expand(tmpsize[0], tmpsize[1], self.char_hidden_dim) 185 | select_lstm_out = torch.gather(lstm_out, 0, position) 186 | d_lstm_out = self.dropout(select_lstm_out).view(-1, self.char_hidden_dim) 187 | 188 | if self.if_highway: 189 | char_out = 
self.back2word(d_lstm_out) 190 | d_char_out = self.dropout(char_out) 191 | else: 192 | d_char_out = d_lstm_out 193 | 194 | pre_score = self.word_pre_train_out(d_char_out) 195 | return pre_score, hidden 196 | 197 | def forward(self, forw_sentence, forw_position, back_sentence, back_position, word_seq, hidden=None): 198 | ''' 199 | args: 200 | forw_sentence (char_seq_len, batch_size) : char-level representation of sentence 201 | forw_position (word_seq_len, batch_size) : position of blank space in char-level representation of sentence 202 | back_sentence (char_seq_len, batch_size) : char-level representation of sentence (inverse order) 203 | back_position (word_seq_len, batch_size) : position of blank space in inversed char-level representation of sentence 204 | word_seq (word_seq_len, batch_size) : word-level representation of sentence 205 | hidden: initial hidden state 206 | 207 | return: 208 | crf output (word_seq_len, batch_size, tag_size, tag_size), hidden 209 | ''' 210 | 211 | self.set_batch_seq_size(forw_position) 212 | 213 | #embedding layer 214 | forw_emb = self.char_embeds(forw_sentence) 215 | back_emb = self.char_embeds(back_sentence) 216 | 217 | #dropout 218 | d_f_emb = self.dropout(forw_emb) 219 | d_b_emb = self.dropout(back_emb) 220 | 221 | #forward the whole sequence 222 | forw_lstm_out, _ = self.forw_char_lstm(d_f_emb)#seq_len_char * batch * char_hidden_dim 223 | 224 | back_lstm_out, _ = self.back_char_lstm(d_b_emb)#seq_len_char * batch * char_hidden_dim 225 | 226 | #select predict point 227 | forw_position = forw_position.unsqueeze(2).expand(self.word_seq_length, self.batch_size, self.char_hidden_dim) 228 | select_forw_lstm_out = torch.gather(forw_lstm_out, 0, forw_position) 229 | 230 | back_position = back_position.unsqueeze(2).expand(self.word_seq_length, self.batch_size, self.char_hidden_dim) 231 | select_back_lstm_out = torch.gather(back_lstm_out, 0, back_position) 232 | 233 | fb_lstm_out = self.dropout(torch.cat((select_forw_lstm_out, select_back_lstm_out), dim=2)) 234 | if self.if_highway: 235 | char_out = self.fb2char(fb_lstm_out) 236 | d_char_out = self.dropout(char_out) 237 | else: 238 | d_char_out = fb_lstm_out 239 | 240 | #word 241 | word_emb = self.word_embeds(word_seq) 242 | d_word_emb = self.dropout(word_emb) 243 | 244 | #combine 245 | word_input = torch.cat((d_word_emb, d_char_out), dim = 2) 246 | 247 | #word level lstm 248 | lstm_out, _ = self.word_lstm(word_input) 249 | d_lstm_out = self.dropout(lstm_out) 250 | 251 | #convert to crf 252 | crf_out = self.crf(d_lstm_out) 253 | crf_out = crf_out.view(self.word_seq_length, self.batch_size, self.tagset_size, self.tagset_size) 254 | 255 | return crf_out -------------------------------------------------------------------------------- /model/lstm_crf.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: lstm_crf 3 | :synopsis: lstm_crf 4 | 5 | .. 
moduleauthor:: Liyuan Liu 6 | """ 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import torch.nn as nn 11 | import model.crf as crf 12 | import model.utils as utils 13 | 14 | 15 | class LSTM_CRF(nn.Module): 16 | """LSTM_CRF model 17 | 18 | args: 19 | vocab_size: size of word dictionary 20 | tagset_size: size of label set 21 | embedding_dim: size of word embedding 22 | hidden_dim: size of word-level blstm hidden dim 23 | rnn_layers: number of word-level lstm layers 24 | dropout_ratio: dropout ratio 25 | large_CRF: use CRF_L or not, refer model.crf.CRF_L and model.crf.CRF_S for more details 26 | """ 27 | 28 | def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, rnn_layers, dropout_ratio, large_CRF=True): 29 | super(LSTM_CRF, self).__init__() 30 | self.embedding_dim = embedding_dim 31 | self.hidden_dim = hidden_dim 32 | self.vocab_size = vocab_size 33 | 34 | self.word_embeds = nn.Embedding(vocab_size, embedding_dim) 35 | self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, 36 | num_layers=rnn_layers, bidirectional=True, dropout=dropout_ratio) 37 | self.rnn_layers = rnn_layers 38 | 39 | self.dropout1 = nn.Dropout(p=dropout_ratio) 40 | self.dropout2 = nn.Dropout(p=dropout_ratio) 41 | 42 | self.tagset_size = tagset_size 43 | if large_CRF: 44 | self.crf = crf.CRF_L(hidden_dim, tagset_size) 45 | else: 46 | self.crf = crf.CRF_S(hidden_dim, tagset_size) 47 | 48 | self.batch_size = 1 49 | self.seq_length = 1 50 | 51 | def rand_init_hidden(self): 52 | """ 53 | random initialize hidden variable 54 | """ 55 | return autograd.Variable( 56 | torch.randn(2 * self.rnn_layers, self.batch_size, self.hidden_dim // 2)), autograd.Variable( 57 | torch.randn(2 * self.rnn_layers, self.batch_size, self.hidden_dim // 2)) 58 | 59 | def set_batch_size(self, bsize): 60 | """ 61 | set batch size 62 | """ 63 | self.batch_size = bsize 64 | 65 | def set_batch_seq_size(self, sentence): 66 | """ 67 | set batch size and sequence length 68 | """ 69 | tmp = sentence.size() 70 | self.seq_length = tmp[0] 71 | self.batch_size = tmp[1] 72 | 73 | def load_pretrained_embedding(self, pre_embeddings): 74 | """ 75 | load pre-trained word embedding 76 | 77 | args: 78 | pre_word_embeddings (self.word_size, self.word_dim) : pre-trained embedding 79 | """ 80 | assert (pre_embeddings.size()[1] == self.embedding_dim) 81 | self.word_embeds.weight = nn.Parameter(pre_embeddings) 82 | 83 | def rand_init_embedding(self): 84 | utils.init_embedding(self.word_embeds.weight) 85 | 86 | def rand_init(self, init_embedding=False): 87 | """ 88 | random initialization 89 | 90 | args: 91 | init_embedding: random initialize embedding or not 92 | """ 93 | if init_embedding: 94 | utils.init_embedding(self.word_embeds.weight) 95 | utils.init_lstm(self.lstm) 96 | self.crf.rand_init() 97 | 98 | def forward(self, sentence, hidden=None): 99 | ''' 100 | args: 101 | sentence (word_seq_len, batch_size) : word-level representation of sentence 102 | hidden: initial hidden state 103 | 104 | return: 105 | crf output (word_seq_len, batch_size, tag_size, tag_size), hidden 106 | ''' 107 | self.set_batch_seq_size(sentence) 108 | 109 | embeds = self.word_embeds(sentence) 110 | d_embeds = self.dropout1(embeds) 111 | 112 | lstm_out, hidden = self.lstm(d_embeds, hidden) 113 | lstm_out = lstm_out.view(-1, self.hidden_dim) 114 | 115 | d_lstm_out = self.dropout2(lstm_out) 116 | 117 | crf_out = self.crf(d_lstm_out) 118 | crf_out = crf_out.view(self.seq_length, self.batch_size, self.tagset_size, self.tagset_size) 119 | return crf_out, hidden 
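A minimal end-to-end sketch (not part of the repository) of how LSTM_CRF combines with CRFLoss_vb and CRFDecode_vb from model/crf.py: the toy label map, the special-tag keys '<start>' and '<pad>', and all tensor sizes are illustrative assumptions, and it presumes the Variable/ByteTensor-era PyTorch API that the surrounding code targets.

import torch
import torch.autograd as autograd

from model.lstm_crf import LSTM_CRF
from model.crf import CRFLoss_vb, CRFDecode_vb

# toy label map; '<start>' and '<pad>' mirror the special keys the loss and
# decoder expect (assumed names, chosen here for illustration only)
l_map = {'O': 0, 'B-PER': 1, 'I-PER': 2, '<start>': 3, '<pad>': 4}
tagset_size = len(l_map)
seq_len, batch_size, vocab_size = 4, 2, 100

model = LSTM_CRF(vocab_size, tagset_size, embedding_dim=16, hidden_dim=8,
                 rnn_layers=1, dropout_ratio=0.5)
model.rand_init(init_embedding=True)

# random word ids, shaped (word_seq_len, batch_size) as forward() expects
words = autograd.Variable(torch.LongTensor(seq_len, batch_size).random_(0, vocab_size))
scores, _ = model(words)            # (seq_len, batch, tagset_size, tagset_size)

# gold tags with '<start>' prepended, encoded as transition indices
# prev_tag * tagset_size + cur_tag, which is what CRFLoss_vb gathers on
tags = [l_map['<start>'], l_map['B-PER'], l_map['I-PER'], l_map['O'], l_map['O']]
target = torch.LongTensor([[tags[t] * tagset_size + tags[t + 1]] * batch_size
                           for t in range(seq_len)])           # (seq_len, batch)
mask = torch.ByteTensor(seq_len, batch_size).fill_(1)           # no padding here

crit = CRFLoss_vb(tagset_size, l_map['<start>'], l_map['<pad>'])
loss = crit(scores, autograd.Variable(target.unsqueeze(2)), autograd.Variable(mask))
loss.backward()

decoder = CRFDecode_vb(tagset_size, l_map['<start>'], l_map['<pad>'])
best_paths = decoder.decode(scores.data, mask)   # (seq_len - 1, batch_size);
# the real pipeline feeds one extra padded position per sentence, so the
# decoded length matches the number of words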
-------------------------------------------------------------------------------- /model/ner_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: datasets 3 | :synopsis: datasets 4 | 5 | .. moduleauthor:: Liyuan Liu 6 | """ 7 | 8 | from torch.utils.data import Dataset 9 | 10 | 11 | class CRFDataset(Dataset): 12 | """Dataset Class for word-level model 13 | 14 | args: 15 | data_tensor (ins_num, seq_length): words 16 | label_tensor (ins_num, seq_length): labels 17 | mask_tensor (ins_num, seq_length): padding masks 18 | """ 19 | def __init__(self, data_tensor, label_tensor, mask_tensor): 20 | assert data_tensor.size(0) == label_tensor.size(0) 21 | assert data_tensor.size(0) == mask_tensor.size(0) 22 | self.data_tensor = data_tensor 23 | self.label_tensor = label_tensor 24 | self.mask_tensor = mask_tensor 25 | 26 | def __getitem__(self, index): 27 | return self.data_tensor[index], self.label_tensor[index], self.mask_tensor[index] 28 | 29 | def __len__(self): 30 | return self.data_tensor.size(0) 31 | 32 | class CRFDataset_WC(Dataset): 33 | """Dataset Class for char-aware model 34 | 35 | args: 36 | forw_tensor (ins_num, seq_length): forward chars 37 | forw_index (ins_num, seq_length): index of forward chars 38 | back_tensor (ins_num, seq_length): backward chars 39 | back_index (ins_num, seq_length): index of backward chars 40 | word_tensor (ins_num, seq_length): words 41 | label_tensor (ins_num, seq_length): labels: 42 | mask_tensor (ins_num, seq_length): padding masks 43 | len_tensor (ins_num, 2): length of chars (dim0) and words (dim1) 44 | """ 45 | def __init__(self, forw_tensor, forw_index, back_tensor, back_index, word_tensor, label_tensor, mask_tensor, len_tensor): 46 | assert forw_tensor.size(0) == label_tensor.size(0) 47 | assert forw_tensor.size(0) == mask_tensor.size(0) 48 | assert forw_tensor.size(0) == forw_index.size(0) 49 | assert forw_tensor.size(0) == back_tensor.size(0) 50 | assert forw_tensor.size(0) == back_index.size(0) 51 | assert forw_tensor.size(0) == word_tensor.size(0) 52 | assert forw_tensor.size(0) == len_tensor.size(0) 53 | self.forw_tensor = forw_tensor 54 | self.forw_index = forw_index 55 | self.back_tensor = back_tensor 56 | self.back_index = back_index 57 | self.word_tensor = word_tensor 58 | self.label_tensor = label_tensor 59 | self.mask_tensor = mask_tensor 60 | self.len_tensor = len_tensor 61 | 62 | def __getitem__(self, index): 63 | return self.forw_tensor[index], self.forw_index[index], self.back_tensor[index], self.back_index[index], self.word_tensor[index], self.label_tensor[index], self.mask_tensor[index], self.len_tensor[index] 64 | 65 | def __len__(self): 66 | return self.forw_tensor.size(0) 67 | -------------------------------------------------------------------------------- /model/predictor.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: predictor 3 | :synopsis: prediction method (for un-annotated text) 4 | 5 | .. 
moduleauthor:: Liyuan Liu 6 | """ 7 | 8 | import torch 9 | import torch.autograd as autograd 10 | import numpy as np 11 | import itertools 12 | import sys 13 | from tqdm import tqdm 14 | 15 | from model.crf import CRFDecode_vb 16 | from model.utils import * 17 | 18 | class predict: 19 | """Base class for prediction, provide method to calculate f1 score and accuracy 20 | 21 | args: 22 | if_cuda: if use cuda to speed up 23 | l_map: dictionary for labels 24 | label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test 25 | batch_size: size of batch in decoding 26 | """ 27 | 28 | def __init__(self, if_cuda, l_map, label_seq = True, batch_size = 50): 29 | self.if_cuda = if_cuda 30 | self.l_map = l_map 31 | self.r_l_map = revlut(l_map) 32 | self.batch_size = batch_size 33 | if label_seq: 34 | self.decode_str = self.decode_l 35 | else: 36 | self.decode_str = self.decode_s 37 | 38 | def decode_l(self, feature, label): 39 | """ 40 | decode a sentence coupled with label 41 | 42 | args: 43 | feature (list): words list 44 | label (list): label list 45 | """ 46 | return '\n'.join(map(lambda t: t[0] + ' '+ self.r_l_map[t[1].item()], zip(feature, label))) 47 | 48 | def decode_s(self, feature, label): 49 | """ 50 | decode a sentence in the format of <> 51 | 52 | args: 53 | feature (list): words list 54 | label (list): label list 55 | """ 56 | chunks = "" 57 | current = None 58 | 59 | for f, y in zip(feature, label): 60 | label = self.r_l_map[y.item()] 61 | 62 | if label.startswith('B-'): 63 | 64 | if current is not None: 65 | chunks += " " 66 | current = label[2:] 67 | chunks += "<"+current+"> " + f + " " 68 | 69 | elif label.startswith('S-'): 70 | 71 | if current is not None: 72 | chunks += " " 73 | current = label[2:] 74 | chunks += "<"+current+"> " + f + " " 75 | current = None 76 | 77 | elif label.startswith('I-'): 78 | 79 | if current is not None: 80 | base = label[2:] 81 | if base == current: 82 | chunks += f+" " 83 | else: 84 | chunks += " <"+base+"> " + f + " " 85 | current = base 86 | else: 87 | current = label[2:] 88 | chunks += "<"+current+"> " + f + " " 89 | 90 | elif label.startswith('E-'): 91 | 92 | if current is not None: 93 | base = label[2:] 94 | if base == current: 95 | chunks += f + " " 96 | current = None 97 | else: 98 | chunks += " <"+base+"> " + f + " " 99 | current = None 100 | 101 | else: 102 | current = label[2:] 103 | chunks += "<"+current+"> " + f + " " 104 | current = None 105 | 106 | else: 107 | if current is not None: 108 | chunks += " " 109 | chunks += f+" " 110 | current = None 111 | 112 | if current is not None: 113 | chunks += " " 114 | 115 | return chunks 116 | 117 | def output_batch(self, ner_model, documents, fout): 118 | """ 119 | decode the whole corpus in the specific format by calling apply_model to fit specific models 120 | 121 | args: 122 | ner_model: sequence labeling model 123 | feature (list): list of words list 124 | fout: output file 125 | """ 126 | ner_model.eval() 127 | 128 | d_len = len(documents) 129 | for d_ind in tqdm( range(0, d_len), mininterval=1, 130 | desc=' - Process', leave=False, file=sys.stdout): 131 | fout.write('-DOCSTART- -DOCSTART- -DOCSTART-\n\n') 132 | features = documents[d_ind] 133 | f_len = len(features) 134 | for ind in range(0, f_len, self.batch_size): 135 | eind = min(f_len, ind + self.batch_size) 136 | labels = self.apply_model(ner_model, features[ind: eind]) 137 | labels = torch.unbind(labels, 1) 138 | 139 | for ind2 in range(ind, eind): 140 | f = features[ind2] 141 | l = 
labels[ind2 - ind][0: len(f) ] 142 | fout.write(self.decode_str(features[ind2], l) + '\n\n') 143 | 144 | def apply_model(self, ner_model, features): 145 | """ 146 | template function for apply_model 147 | 148 | args: 149 | ner_model: sequence labeling model 150 | feature (list): list of words list 151 | """ 152 | return None 153 | 154 | class predict_w(predict): 155 | """prediction class for word level model (LSTM-CRF) 156 | 157 | args: 158 | if_cuda: if use cuda to speed up 159 | f_map: dictionary for words 160 | l_map: dictionary for labels 161 | pad_word: word padding 162 | pad_label: label padding 163 | start_label: start label 164 | label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test 165 | batch_size: size of batch in decoding 166 | caseless: caseless or not 167 | """ 168 | 169 | def __init__(self, if_cuda, f_map, l_map, pad_word, pad_label, start_label, label_seq = True, batch_size = 50, caseless=True): 170 | predict.__init__(self, if_cuda, l_map, label_seq, batch_size) 171 | self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label) 172 | self.pad_word = pad_word 173 | self.f_map = f_map 174 | self.l_map = l_map 175 | self.caseless = caseless 176 | 177 | def apply_model(self, ner_model, features): 178 | """ 179 | apply_model function for LSTM-CRF 180 | 181 | args: 182 | ner_model: sequence labeling model 183 | feature (list): list of words list 184 | """ 185 | if self.caseless: 186 | features = list(map(lambda t: list(map(lambda x: x.lower(), t)), features)) 187 | features = encode_safe(features, self.f_map, self.f_map['']) 188 | f_len = max(map(lambda t: len(t) + 1, features)) 189 | 190 | masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (f_len - len(t) - 1), features))) 191 | word_features = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (f_len - len(t)), features))) 192 | 193 | if self.if_cuda: 194 | fea_v = autograd.Variable(word_features.transpose(0, 1)).cuda() 195 | mask_v = masks.transpose(0, 1).cuda() 196 | else: 197 | fea_v = autograd.Variable(word_features.transpose(0, 1)) 198 | mask_v = masks.transpose(0, 1).contiguous() 199 | 200 | scores, _ = ner_model(fea_v) 201 | decoded = self.decoder.decode(scores.data, mask_v) 202 | 203 | return decoded 204 | 205 | class predict_wc(predict): 206 | """prediction class for LM-LSTM-CRF 207 | 208 | args: 209 | if_cuda: if use cuda to speed up 210 | f_map: dictionary for words 211 | c_map: dictionary for chars 212 | l_map: dictionary for labels 213 | pad_word: word padding 214 | pad_char: word padding 215 | pad_label: label padding 216 | start_label: start label 217 | label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test 218 | batch_size: size of batch in decoding 219 | caseless: caseless or not 220 | """ 221 | 222 | def __init__(self, if_cuda, f_map, c_map, l_map, pad_word, pad_char, pad_label, start_label, label_seq = True, batch_size = 50, caseless=True): 223 | predict.__init__(self, if_cuda, l_map, label_seq, batch_size) 224 | self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label) 225 | self.pad_word = pad_word 226 | self.pad_char = pad_char 227 | self.f_map = f_map 228 | self.c_map = c_map 229 | self.l_map = l_map 230 | self.caseless = caseless 231 | 232 | def apply_model(self, ner_model, features): 233 | """ 234 | apply_model function for LM-LSTM-CRF 235 | 236 | args: 237 | ner_model: sequence labeling model 238 | feature (list): list of words list 239 | """ 240 | 
char_features = encode2char_safe(features, self.c_map) 241 | 242 | if self.caseless: 243 | word_features = encode_safe(list(map(lambda t: list(map(lambda x: x.lower(), t)), features)), self.f_map, self.f_map['']) 244 | else: 245 | word_features = encode_safe(features, self.f_map, self.f_map['']) 246 | 247 | fea_len = [list( map( lambda t: len(t) + 1, f) ) for f in char_features] 248 | forw_features = concatChar(char_features, self.c_map) 249 | 250 | word_len = max(map(lambda t: len(t) + 1, word_features)) 251 | char_len = max(map(lambda t: len(t[0]) + word_len - len(t[1]), zip(forw_features, word_features))) 252 | forw_t = list( map( lambda t: t + [self.pad_char] * ( char_len - len(t) ), forw_features ) ) 253 | back_t = torch.LongTensor( list( map( lambda t: t[::-1], forw_t ) ) ) 254 | forw_t = torch.LongTensor( forw_t ) 255 | forw_p = torch.LongTensor( list( map( lambda t: list(itertools.accumulate( t + [1] * (word_len - len(t) ) ) ), fea_len) ) ) 256 | back_p = torch.LongTensor( list( map( lambda t: [char_len - 1] + [ char_len - 1 - tup for tup in t[:-1] ], forw_p) ) ) 257 | 258 | masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (word_len - len(t) - 1), word_features))) 259 | word_t = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (word_len - len(t)), word_features))) 260 | 261 | if self.if_cuda: 262 | f_f = autograd.Variable(forw_t.transpose(0, 1)).cuda() 263 | f_p = autograd.Variable(forw_p.transpose(0, 1)).cuda() 264 | b_f = autograd.Variable(back_t.transpose(0, 1)).cuda() 265 | b_p = autograd.Variable(back_p.transpose(0, 1)).cuda() 266 | w_f = autograd.Variable(word_t.transpose(0, 1)).cuda() 267 | mask_v = masks.transpose(0, 1).cuda() 268 | else: 269 | f_f = autograd.Variable(forw_t.transpose(0, 1)) 270 | f_p = autograd.Variable(forw_p.transpose(0, 1)) 271 | b_f = autograd.Variable(back_t.transpose(0, 1)) 272 | b_p = autograd.Variable(back_p.transpose(0, 1)) 273 | w_f = autograd.Variable(word_t.transpose(0, 1)) 274 | mask_v = masks.transpose(0, 1) 275 | 276 | scores = ner_model(f_f, f_p, b_f, b_p, w_f) 277 | decoded = self.decoder.decode(scores.data, mask_v) 278 | 279 | return decoded 280 | -------------------------------------------------------------------------------- /model/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. module:: utils 3 | :synopsis: utility tools 4 | 5 | .. 
moduleauthor:: Liyuan Liu, Frank Xu 6 | """ 7 | 8 | import codecs 9 | import csv 10 | import itertools 11 | from functools import reduce 12 | 13 | import numpy as np 14 | import shutil 15 | import torch 16 | import json 17 | 18 | import torch.nn as nn 19 | import torch.nn.init 20 | 21 | from model.ner_dataset import * 22 | 23 | zip = getattr(itertools, 'izip', zip) 24 | 25 | 26 | def to_scalar(var): 27 | """change the first element of a tensor to scalar 28 | """ 29 | return var.view(-1).data.tolist()[0] 30 | 31 | 32 | def argmax(vec): 33 | """helper function to calculate argmax of input vector at dimension 1 34 | """ 35 | _, idx = torch.max(vec, 1) 36 | return to_scalar(idx) 37 | 38 | 39 | def log_sum_exp(vec, m_size): 40 | """ 41 | calculate log of exp sum 42 | 43 | args: 44 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 45 | m_size : hidden_dim 46 | return: 47 | batch_size, hidden_dim 48 | """ 49 | _, idx = torch.max(vec, 1) # B * 1 * M 50 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 51 | 52 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, m_size) # B * M 53 | 54 | 55 | def switch(vec1, vec2, mask): 56 | """ 57 | switch function for pytorch 58 | 59 | args: 60 | vec1 (any size) : input tensor corresponding to 0 61 | vec2 (same to vec1) : input tensor corresponding to 1 62 | mask (same to vec1) : input tensor, each element equals to 0/1 63 | return: 64 | vec (*) 65 | """ 66 | catvec = torch.cat([vec1.view(-1, 1), vec2.view(-1, 1)], dim=1) 67 | switched_vec = torch.gather(catvec, 1, mask.long().view(-1, 1)) 68 | return switched_vec.view(-1) 69 | 70 | 71 | def encode2char_safe(input_lines, char_dict): 72 | """ 73 | get char representation of lines 74 | 75 | args: 76 | input_lines (list of strings) : input corpus 77 | char_dict (dictionary) : char-level dictionary 78 | return: 79 | forw_lines 80 | """ 81 | unk = char_dict[''] 82 | forw_lines = [list(map(lambda m: list(map(lambda t: char_dict.get(t, unk), m)), line)) for line in input_lines] 83 | return forw_lines 84 | 85 | 86 | def concatChar(input_lines, char_dict): 87 | """ 88 | concat char into string 89 | 90 | args: 91 | input_lines (list of list of char) : input corpus 92 | char_dict (dictionary) : char-level dictionary 93 | return: 94 | forw_lines 95 | """ 96 | features = [[char_dict[' ']] + list(reduce(lambda x, y: x + [char_dict[' ']] + y, sentence)) + [char_dict['\n']] for sentence in input_lines] 97 | return features 98 | 99 | 100 | def encode_safe(input_lines, word_dict, unk): 101 | """ 102 | encode list of strings into word-level representation with unk 103 | """ 104 | lines = list(map(lambda t: list(map(lambda m: word_dict.get(m, unk), t)), input_lines)) 105 | return lines 106 | 107 | 108 | def encode(input_lines, word_dict): 109 | """ 110 | encode list of strings into word-level representation 111 | """ 112 | lines = list(map(lambda t: list(map(lambda m: word_dict[m], t)), input_lines)) 113 | return lines 114 | 115 | 116 | def encode2Tensor(input_lines, word_dict, unk): 117 | """ 118 | encode list of strings into word-level representation (tensor) with unk 119 | """ 120 | lines = list(map(lambda t: torch.LongTensor(list(map(lambda m: word_dict.get(m, unk), t))), input_lines)) 121 | return lines 122 | 123 | 124 | def generate_corpus_char(lines, if_shrink_c_feature=False, c_thresholds=1, if_shrink_w_feature=False, w_thresholds=1): 125 | """ 126 | generate label, feature, word dictionary, char dictionary and 
label dictionary 127 | 128 | args: 129 | lines : corpus 130 | if_shrink_c_feature: whether shrink char-dictionary 131 | c_threshold: threshold for shrinking char-dictionary 132 | if_shrink_w_feature: whether shrink word-dictionary 133 | w_threshold: threshold for shrinking word-dictionary 134 | 135 | """ 136 | features, labels, feature_map, label_map = generate_corpus(lines, if_shrink_feature=if_shrink_w_feature, thresholds=w_thresholds) 137 | char_count = dict() 138 | for feature in features: 139 | for word in feature: 140 | for tup in word: 141 | if tup not in char_count: 142 | char_count[tup] = 0 143 | else: 144 | char_count[tup] += 1 145 | if if_shrink_c_feature: 146 | shrink_char_count = [k for (k, v) in iter(char_count.items()) if v >= c_thresholds] 147 | char_map = {shrink_char_count[ind]: ind for ind in range(0, len(shrink_char_count))} 148 | else: 149 | char_map = {k: v for (v, k) in enumerate(char_count.keys())} 150 | char_map[''] = len(char_map) # unk for char 151 | char_map[' '] = len(char_map) # concat for char 152 | char_map['\n'] = len(char_map) # eof for char 153 | return features, labels, feature_map, label_map, char_map 154 | 155 | def shrink_features(feature_map, features, thresholds): 156 | """ 157 | filter un-common features by threshold 158 | """ 159 | feature_count = {k: 0 for (k, v) in iter(feature_map.items())} 160 | for feature_list in features: 161 | for feature in feature_list: 162 | feature_count[feature] += 1 163 | shrinked_feature_count = [k for (k, v) in iter(feature_count.items()) if v >= thresholds] 164 | feature_map = {shrinked_feature_count[ind]: (ind + 1) for ind in range(0, len(shrinked_feature_count))} 165 | 166 | #inserting unk to be 0 encoded 167 | feature_map[''] = 0 168 | #inserting eof 169 | feature_map[''] = len(feature_map) 170 | return feature_map 171 | 172 | def generate_corpus(lines, if_shrink_feature=False, thresholds=1): 173 | """ 174 | generate label, feature, word dictionary and label dictionary 175 | 176 | args: 177 | lines : corpus 178 | if_shrink_feature: whether shrink word-dictionary 179 | threshold: threshold for shrinking word-dictionary 180 | 181 | """ 182 | features = list() 183 | labels = list() 184 | tmp_fl = list() 185 | tmp_ll = list() 186 | feature_map = dict() 187 | label_map = dict() 188 | for line in lines: 189 | if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')): 190 | line = line.rstrip('\n').split() 191 | tmp_fl.append(line[0]) 192 | if line[0] not in feature_map: 193 | feature_map[line[0]] = len(feature_map) + 1 #0 is for unk 194 | tmp_ll.append(line[-1]) 195 | if line[-1] not in label_map: 196 | label_map[line[-1]] = len(label_map) 197 | elif len(tmp_fl) > 0: 198 | features.append(tmp_fl) 199 | labels.append(tmp_ll) 200 | tmp_fl = list() 201 | tmp_ll = list() 202 | if len(tmp_fl) > 0: 203 | features.append(tmp_fl) 204 | labels.append(tmp_ll) 205 | label_map[''] = len(label_map) 206 | label_map[''] = len(label_map) 207 | if if_shrink_feature: 208 | feature_map = shrink_features(feature_map, features, thresholds) 209 | else: 210 | #inserting unk to be 0 encoded 211 | feature_map[''] = 0 212 | #inserting eof 213 | feature_map[''] = len(feature_map) 214 | 215 | return features, labels, feature_map, label_map 216 | 217 | 218 | def read_corpus(lines): 219 | """ 220 | convert corpus into features and labels 221 | """ 222 | features = list() 223 | labels = list() 224 | tmp_fl = list() 225 | tmp_ll = list() 226 | for line in lines: 227 | if not (line.isspace() or (len(line) > 10 and line[0:10] == 
'-DOCSTART-')): 228 | line = line.rstrip('\n').split() 229 | tmp_fl.append(line[0]) 230 | tmp_ll.append(line[-1]) 231 | elif len(tmp_fl) > 0: 232 | features.append(tmp_fl) 233 | labels.append(tmp_ll) 234 | tmp_fl = list() 235 | tmp_ll = list() 236 | if len(tmp_fl) > 0: 237 | features.append(tmp_fl) 238 | labels.append(tmp_ll) 239 | 240 | return features, labels 241 | 242 | def read_features(lines, multi_docs = True): 243 | """ 244 | convert un-annotated corpus into features 245 | """ 246 | if multi_docs: 247 | documents = list() 248 | features = list() 249 | tmp_fl = list() 250 | for line in lines: 251 | if_doc_end = (len(line) > 10 and line[0:10] == '-DOCSTART-') 252 | if not (line.isspace() or if_doc_end): 253 | line = line.split()[0] 254 | tmp_fl.append(line) 255 | else: 256 | if len(tmp_fl) > 0: 257 | features.append(tmp_fl) 258 | tmp_fl = list() 259 | if if_doc_end and len(features) > 0: 260 | documents.append(features) 261 | features = list() 262 | if len(tmp_fl) > 0: 263 | features.append(tmp_fl) 264 | if len(features) >0: 265 | documents.append(features) 266 | return documents 267 | else: 268 | features = list() 269 | tmp_fl = list() 270 | for line in lines: 271 | if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')): 272 | line = line.split()[0] 273 | tmp_fl.append(line) 274 | elif len(tmp_fl) > 0: 275 | features.append(tmp_fl) 276 | tmp_fl = list() 277 | if len(tmp_fl) > 0: 278 | features.append(tmp_fl) 279 | 280 | return features 281 | 282 | def shrink_embedding(feature_map, word_dict, word_embedding, caseless): 283 | """ 284 | shrink embedding dictionary to in-doc words only 285 | """ 286 | if caseless: 287 | feature_map = set([k.lower() for k in feature_map.keys()]) 288 | new_word_list = [k for k in word_dict.keys() if (k in feature_map)] 289 | new_word_dict = {k:v for (v, k) in enumerate(new_word_list)} 290 | new_word_list_ind = torch.LongTensor([word_dict[k] for k in new_word_list]) 291 | new_embedding = word_embedding[new_word_list_ind] 292 | return new_word_dict, new_embedding 293 | 294 | def encode_corpus(lines, f_map, l_map, if_lower = False): 295 | """ 296 | encode corpus into features and labels 297 | """ 298 | tmp_fl = [] 299 | tmp_ll = [] 300 | features = [] 301 | labels = [] 302 | for line in lines: 303 | if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')): 304 | line = line.rstrip('\n').split() 305 | tmp_fl.append(line[0]) 306 | tmp_ll.append(line[-1]) 307 | elif len(tmp_fl) > 0: 308 | features.append(tmp_fl) 309 | labels.append(tmp_ll) 310 | tmp_fl = list() 311 | tmp_ll = list() 312 | if len(tmp_fl) > 0: 313 | features.append(tmp_fl) 314 | labels.append(tmp_ll) 315 | if if_lower: 316 | features = list(map(lambda t: list(map(lambda x: x.lower(), t)), features)) 317 | feature_e = encode_safe(features, f_map, f_map['']) 318 | label_e = encode(labels, l_map) 319 | return feature_e, label_e 320 | 321 | 322 | def encode_corpus_c(lines, f_map, l_map, c_map): 323 | """ 324 | encode corpus into features (both word-level and char-level) and labels 325 | """ 326 | tmp_fl = [] 327 | tmp_ll = [] 328 | features = [] 329 | labels = [] 330 | for line in lines: 331 | if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')): 332 | line = line.rstrip('\n').split() 333 | tmp_fl.append(line[0]) 334 | tmp_ll.append(line[-1]) 335 | elif len(tmp_fl) > 0: 336 | features.append(tmp_fl) 337 | labels.append(tmp_ll) 338 | tmp_fl = list() 339 | tmp_ll = list() 340 | if len(tmp_fl) > 0: 341 | features.append(tmp_fl) 342 | 
labels.append(tmp_ll) 343 | 344 | feature_c = encode2char_safe(features, c_map) 345 | feature_e = encode_safe(features, f_map, f_map['']) 346 | label_e = encode(labels, l_map) 347 | return feature_c, feature_e, label_e 348 | 349 | def load_embedding(emb_file, delimiter, feature_map, caseless, unk, shrink_to_train=False): 350 | """ 351 | load embedding 352 | """ 353 | if caseless: 354 | feature_set = set([key.lower() for key in feature_map]) 355 | else: 356 | feature_set = set([key for key in feature_map]) 357 | 358 | word_dict = dict() 359 | embedding_array = list() 360 | for line in open(emb_file, 'r'): 361 | line = line.split(delimiter) 362 | vector = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1:]))) 363 | if shrink_to_train and line[0] not in feature_set: 364 | continue 365 | if line[0] == unk: 366 | word_dict[''] = len(word_dict) 367 | else: 368 | word_dict[line[0]] = len(word_dict) 369 | embedding_array.append(vector) 370 | embedding_tensor_1 = torch.FloatTensor(np.asarray(embedding_array)) 371 | emb_len = embedding_tensor_1.size(1) 372 | 373 | rand_embedding_count = 0 374 | for key in feature_map: 375 | if caseless: 376 | key = key.lower() 377 | if key not in word_dict: 378 | word_dict[key] = len(word_dict) 379 | rand_embedding_count += 1 380 | 381 | rand_embedding_tensor = torch.FloatTensor(rand_embedding_count, emb_len) 382 | init_embedding(rand_embedding_tensor) 383 | 384 | embedding_tensor = torch.cat((embedding_tensor_1, rand_embedding_tensor), 0) 385 | return word_dict, embedding_tensor 386 | 387 | def load_embedding_wlm(emb_file, delimiter, feature_map, full_feature_set, caseless, unk, emb_len, shrink_to_train=False, shrink_to_corpus=False): 388 | """ 389 | load embedding, indoc words would be listed before outdoc words 390 | 391 | args: 392 | emb_file: path to embedding file 393 | delimiter: delimiter of lines 394 | feature_map: word dictionary 395 | full_feature_set: all words in the corpus 396 | caseless: convert into casesless style 397 | unk: string for unknown token 398 | emb_len: dimension of embedding vectors 399 | shrink_to_train: whether to shrink out-of-training set or not 400 | shrink_to_corpus: whether to shrink out-of-corpus or not 401 | """ 402 | if caseless: 403 | feature_set = set([key.lower() for key in feature_map]) 404 | full_feature_set = set([key.lower() for key in full_feature_set]) 405 | else: 406 | feature_set = set([key for key in feature_map]) 407 | full_feature_set = set([key for key in full_feature_set]) 408 | 409 | #ensure is 0 410 | word_dict = {v:(k+1) for (k,v) in enumerate(feature_set - set(['']))} 411 | word_dict[''] = 0 412 | 413 | in_doc_freq_num = len(word_dict) 414 | rand_embedding_tensor = torch.FloatTensor(in_doc_freq_num, emb_len) 415 | init_embedding(rand_embedding_tensor) 416 | 417 | indoc_embedding_array = list() 418 | indoc_word_array = list() 419 | outdoc_embedding_array = list() 420 | outdoc_word_array = list() 421 | 422 | for line in open(emb_file, 'r'): 423 | line = line.split(delimiter) 424 | vector = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1:]))) 425 | 426 | if shrink_to_train and line[0] not in feature_set: 427 | continue 428 | 429 | if line[0] == unk: 430 | rand_embedding_tensor[0] = torch.FloatTensor(vector) #unk is 0 431 | elif line[0] in word_dict: 432 | rand_embedding_tensor[word_dict[line[0]]] = torch.FloatTensor(vector) 433 | elif line[0] in full_feature_set: 434 | indoc_embedding_array.append(vector) 435 | indoc_word_array.append(line[0]) 436 | elif not 
shrink_to_corpus: 437 | outdoc_word_array.append(line[0]) 438 | outdoc_embedding_array.append(vector) 439 | 440 | embedding_tensor_0 = torch.FloatTensor(np.asarray(indoc_embedding_array)) 441 | 442 | if not shrink_to_corpus: 443 | embedding_tensor_1 = torch.FloatTensor(np.asarray(outdoc_embedding_array)) 444 | word_emb_len = embedding_tensor_0.size(1) 445 | assert(word_emb_len == emb_len) 446 | 447 | if shrink_to_corpus: 448 | embedding_tensor = torch.cat([rand_embedding_tensor, embedding_tensor_0], 0) 449 | else: 450 | embedding_tensor = torch.cat([rand_embedding_tensor, embedding_tensor_0, embedding_tensor_1], 0) 451 | 452 | for word in indoc_word_array: 453 | word_dict[word] = len(word_dict) 454 | in_doc_num = len(word_dict) 455 | if not shrink_to_corpus: 456 | for word in outdoc_word_array: 457 | word_dict[word] = len(word_dict) 458 | 459 | return word_dict, embedding_tensor, in_doc_num 460 | 461 | def calc_threshold_mean(features): 462 | """ 463 | calculate the threshold for bucket by mean 464 | """ 465 | lines_len = list(map(lambda t: len(t) + 1, features)) 466 | average = int(sum(lines_len) / len(lines_len)) 467 | lower_line = list(filter(lambda t: t < average, lines_len)) 468 | upper_line = list(filter(lambda t: t >= average, lines_len)) 469 | lower_average = int(sum(lower_line) / len(lower_line)) 470 | upper_average = int(sum(upper_line) / len(upper_line)) 471 | max_len = max(lines_len) 472 | return [lower_average, average, upper_average, max_len] 473 | 474 | 475 | def construct_bucket_mean_gd(input_features, input_label, word_dict, label_dict): 476 | """ 477 | Construct bucket by mean for greedy decode, word-level only 478 | """ 479 | # encode and padding 480 | features = encode_safe(input_features, word_dict, word_dict['']) 481 | labels = encode(input_label, label_dict) 482 | labels = list(map(lambda t: [label_dict['']] + list(t), labels)) 483 | 484 | thresholds = calc_threshold_mean(features) 485 | 486 | return construct_bucket_gd(features, labels, thresholds, word_dict[''], label_dict['']) 487 | 488 | 489 | def construct_bucket_mean_vb(input_features, input_label, word_dict, label_dict, caseless): 490 | """ 491 | Construct bucket by mean for viterbi decode, word-level only 492 | """ 493 | # encode and padding 494 | if caseless: 495 | input_features = list(map(lambda t: list(map(lambda x: x.lower(), t)), input_features)) 496 | 497 | features = encode_safe(input_features, word_dict, word_dict['']) 498 | labels = encode(input_label, label_dict) 499 | labels = list(map(lambda t: [label_dict['']] + list(t), labels)) 500 | 501 | thresholds = calc_threshold_mean(features) 502 | 503 | return construct_bucket_vb(features, labels, thresholds, word_dict[''], label_dict[''], len(label_dict)) 504 | 505 | def construct_bucket_mean_vb_wc(word_features, input_label, label_dict, char_dict, word_dict, caseless): 506 | """ 507 | Construct bucket by mean for viterbi decode, word-level and char-level 508 | """ 509 | # encode and padding 510 | char_features = encode2char_safe(word_features, char_dict) 511 | fea_len = [list(map(lambda t: len(t) + 1, f)) for f in char_features] 512 | forw_features = concatChar(char_features, char_dict) 513 | 514 | labels = encode(input_label, label_dict) 515 | labels = list(map(lambda t: [label_dict['']] + list(t), labels)) 516 | 517 | thresholds = calc_threshold_mean(fea_len) 518 | 519 | if caseless: 520 | word_features = list(map(lambda t: list(map(lambda x: x.lower(), t)), word_features)) 521 | word_features = encode_safe(word_features, word_dict, word_dict['']) 
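    # construct_bucket_vb_wc (below) groups sentences into buckets by the
    # word-length thresholds, stores labels as transition indices
    # (prev_label * label_size + cur_label) so CRFLoss_vb can gather the
    # matching transition score directly, and also returns forw_corpus /
    # back_corpus, the concatenated character streams used to pre-train the
    # forward and backward character-level language models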
522 | 523 | return construct_bucket_vb_wc(word_features, forw_features, fea_len, labels, thresholds, word_dict[''], char_dict['\n'], label_dict[''], len(label_dict)) 524 | 525 | def construct_bucket_vb_wc(word_features, forw_features, fea_len, input_labels, thresholds, pad_word_feature, pad_char_feature, pad_label, label_size): 526 | """ 527 | Construct bucket by thresholds for viterbi decode, word-level and char-level 528 | """ 529 | # construct corpus for language model pre-training 530 | forw_corpus = [pad_char_feature] 531 | for forw_feature in forw_features: 532 | forw_corpus.extend(forw_feature + [pad_char_feature]) 533 | back_corpus = forw_corpus[::-1] 534 | # two way construct, first build the bucket, then calculate padding length, then do the padding 535 | buckets = [[[], [], [], [], [], [], [], []] for ind in range(len(thresholds))] 536 | # forw, forw_ind, back, back_in, label, mask 537 | buckets_len = [0 for ind in range(len(thresholds))] 538 | 539 | # thresholds is the padded length for fea 540 | # buckets_len is the padded length for char 541 | for f_f, f_l in zip(forw_features, fea_len): 542 | cur_len_1 = len(f_l) + 1 543 | idx = 0 544 | while thresholds[idx] < cur_len_1: 545 | idx += 1 546 | tmp_concat_len = len(f_f) + thresholds[idx] - len(f_l) 547 | if buckets_len[idx] < tmp_concat_len: 548 | buckets_len[idx] = tmp_concat_len 549 | 550 | # calc padding 551 | for f_f, f_l, w_f, i_l in zip(forw_features, fea_len, word_features, input_labels): 552 | cur_len = len(f_l) 553 | idx = 0 554 | cur_len_1 = cur_len + 1 555 | while thresholds[idx] < cur_len_1: 556 | idx += 1 557 | 558 | padded_feature = f_f + [pad_char_feature] * (buckets_len[idx] - len(f_f)) # pad feature with <'\n'>, at least one 559 | 560 | padded_feature_len = f_l + [1] * (thresholds[idx] - len(f_l)) # pad feature length with <'\n'>, at least one 561 | padded_feature_len_cum = list(itertools.accumulate(padded_feature_len)) # start from 0, but the first is ' ', so the position need not to be -1 562 | buckets[idx][0].append(padded_feature) # char 563 | buckets[idx][1].append(padded_feature_len_cum) 564 | buckets[idx][2].append(padded_feature[::-1]) 565 | buckets[idx][3].append([buckets_len[idx] - 1] + [buckets_len[idx] - 1 - tup for tup in padded_feature_len_cum[:-1]]) 566 | buckets[idx][4].append(w_f + [pad_word_feature] * (thresholds[idx] - cur_len)) #word 567 | buckets[idx][5].append([i_l[ind] * label_size + i_l[ind + 1] for ind in range(0, cur_len)] + [i_l[cur_len] * label_size + pad_label] + [pad_label * label_size + pad_label] * (thresholds[idx] - cur_len_1)) # has additional start, label 568 | buckets[idx][6].append([1] * cur_len_1 + [0] * (thresholds[idx] - cur_len_1)) # has additional start, mask 569 | buckets[idx][7].append([len(f_f) + thresholds[idx] - len(f_l), cur_len_1]) 570 | bucket_dataset = [CRFDataset_WC(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]), 571 | torch.LongTensor(bucket[2]), torch.LongTensor(bucket[3]), 572 | torch.LongTensor(bucket[4]), torch.LongTensor(bucket[5]), 573 | torch.ByteTensor(bucket[6]), torch.LongTensor(bucket[7])) for bucket in buckets] 574 | return bucket_dataset, forw_corpus, back_corpus 575 | 576 | 577 | def construct_bucket_vb(input_features, input_labels, thresholds, pad_feature, pad_label, label_size): 578 | """ 579 | Construct bucket by thresholds for viterbi decode, word-level only 580 | """ 581 | buckets = [[[], [], []] for _ in range(len(thresholds))] 582 | for feature, label in zip(input_features, input_labels): 583 | cur_len = len(feature) 584 | idx = 
0 585 | cur_len_1 = cur_len + 1 586 | while thresholds[idx] < cur_len_1: 587 | idx += 1 588 | buckets[idx][0].append(feature + [pad_feature] * (thresholds[idx] - cur_len)) 589 | buckets[idx][1].append([label[ind] * label_size + label[ind + 1] for ind in range(0, cur_len)] + [ 590 | label[cur_len] * label_size + pad_label] + [pad_label * label_size + pad_label] * ( 591 | thresholds[idx] - cur_len_1)) 592 | buckets[idx][2].append([1] * cur_len_1 + [0] * (thresholds[idx] - cur_len_1)) 593 | bucket_dataset = [CRFDataset(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]), torch.ByteTensor(bucket[2])) 594 | for bucket in buckets] 595 | return bucket_dataset 596 | 597 | 598 | def construct_bucket_gd(input_features, input_labels, thresholds, pad_feature, pad_label): 599 | """ 600 | Construct bucket by thresholds for greedy decode, word-level only 601 | """ 602 | buckets = [[[], [], []] for ind in range(len(thresholds))] 603 | for feature, label in zip(input_features, input_labels): 604 | cur_len = len(feature) 605 | cur_len_1 = cur_len + 1 606 | idx = 0 607 | while thresholds[idx] < cur_len_1: 608 | idx += 1 609 | buckets[idx][0].append(feature + [pad_feature] * (thresholds[idx] - cur_len)) 610 | buckets[idx][1].append(label[1:] + [pad_label] * (thresholds[idx] - cur_len)) 611 | buckets[idx][2].append(label + [pad_label] * (thresholds[idx] - cur_len_1)) 612 | bucket_dataset = [CRFDataset(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]), torch.LongTensor(bucket[2])) for bucket in buckets] 613 | return bucket_dataset 614 | 615 | 616 | def find_length_from_feats(feats, feat_to_ix): 617 | """ 618 | find length of unpadded features based on feature 619 | """ 620 | end_position = len(feats) - 1 621 | for position, feat in enumerate(feats): 622 | if feat.data[0] == feat_to_ix['']: 623 | end_position = position 624 | break 625 | return end_position + 1 626 | 627 | 628 | def find_length_from_labels(labels, label_to_ix): 629 | """ 630 | find length of unpadded features based on labels 631 | """ 632 | end_position = len(labels) - 1 633 | for position, label in enumerate(labels): 634 | if label == label_to_ix['']: 635 | end_position = position 636 | break 637 | return end_position 638 | 639 | 640 | def revlut(lut): 641 | return {v: k for k, v in lut.items()} 642 | 643 | 644 | # Turn a sequence of IOB chunks into single tokens 645 | def iob_to_spans(sequence, lut, strict_iob2=False): 646 | """ 647 | convert to iob to span 648 | """ 649 | iobtype = 2 if strict_iob2 else 1 650 | chunks = [] 651 | current = None 652 | 653 | for i, y in enumerate(sequence): 654 | label = lut[y] 655 | 656 | if label.startswith('B-'): 657 | if current is not None: 658 | chunks.append('@'.join(current)) 659 | current = [label.replace('B-', ''), '%d' % i] 660 | 661 | elif label.startswith('I-'): 662 | 663 | if current is not None: 664 | base = label.replace('I-', '') 665 | if base == current[0]: 666 | current.append('%d' % i) 667 | else: 668 | chunks.append('@'.join(current)) 669 | if iobtype == 2: 670 | print('Warning, type=IOB2, unexpected format ([%s] follows other tag type [%s] @ %d)' % ( 671 | label, current[0], i)) 672 | 673 | current = [base, '%d' % i] 674 | 675 | else: 676 | current = [label.replace('I-', ''), '%d' % i] 677 | if iobtype == 2: 678 | print('Warning, unexpected format (I before B @ %d) %s' % (i, label)) 679 | else: 680 | if current is not None: 681 | chunks.append('@'.join(current)) 682 | current = None 683 | 684 | if current is not None: 685 | chunks.append('@'.join(current)) 686 | 687 | return 
set(chunks) 688 | 689 | # Turn a sequence of IOBES chunks into single tokens 690 | def iobes_to_spans(sequence, lut, strict_iob2=False): 691 | """ 692 | convert to iobes to span 693 | """ 694 | iobtype = 2 if strict_iob2 else 1 695 | chunks = [] 696 | current = None 697 | 698 | for i, y in enumerate(sequence): 699 | label = lut[y] 700 | 701 | if label.startswith('B-'): 702 | 703 | if current is not None: 704 | chunks.append('@'.join(current)) 705 | current = [label.replace('B-', ''), '%d' % i] 706 | 707 | elif label.startswith('S-'): 708 | 709 | if current is not None: 710 | chunks.append('@'.join(current)) 711 | current = None 712 | base = label.replace('S-', '') 713 | chunks.append('@'.join([base, '%d' % i])) 714 | 715 | elif label.startswith('I-'): 716 | 717 | if current is not None: 718 | base = label.replace('I-', '') 719 | if base == current[0]: 720 | current.append('%d' % i) 721 | else: 722 | chunks.append('@'.join(current)) 723 | if iobtype == 2: 724 | print('Warning') 725 | current = [base, '%d' % i] 726 | 727 | else: 728 | current = [label.replace('I-', ''), '%d' % i] 729 | if iobtype == 2: 730 | print('Warning') 731 | 732 | elif label.startswith('E-'): 733 | 734 | if current is not None: 735 | base = label.replace('E-', '') 736 | if base == current[0]: 737 | current.append('%d' % i) 738 | chunks.append('@'.join(current)) 739 | current = None 740 | else: 741 | chunks.append('@'.join(current)) 742 | if iobtype == 2: 743 | print('Warning') 744 | current = [base, '%d' % i] 745 | chunks.append('@'.join(current)) 746 | current = None 747 | 748 | else: 749 | current = [label.replace('E-', ''), '%d' % i] 750 | if iobtype == 2: 751 | print('Warning') 752 | chunks.append('@'.join(current)) 753 | current = None 754 | else: 755 | if current is not None: 756 | chunks.append('@'.join(current)) 757 | current = None 758 | 759 | if current is not None: 760 | chunks.append('@'.join(current)) 761 | 762 | return set(chunks) 763 | 764 | 765 | def fill_y(nc, yidx): 766 | """ 767 | fill y to dense matrix 768 | """ 769 | batchsz = yidx.shape[0] 770 | siglen = yidx.shape[1] 771 | dense = np.zeros((batchsz, siglen, nc), dtype=np.int) 772 | for i in range(batchsz): 773 | for j in range(siglen): 774 | idx = int(yidx[i, j]) 775 | if idx > 0: 776 | dense[i, j, idx] = 1 777 | 778 | return dense 779 | 780 | def save_checkpoint(state, track_list, filename): 781 | """ 782 | save checkpoint 783 | """ 784 | with open(filename+'.json', 'w') as f: 785 | json.dump(track_list, f) 786 | torch.save(state, filename+'.model') 787 | 788 | def adjust_learning_rate(optimizer, lr): 789 | """ 790 | shrink learning rate for pytorch 791 | """ 792 | for param_group in optimizer.param_groups: 793 | param_group['lr'] = lr 794 | 795 | def init_embedding(input_embedding): 796 | """ 797 | Initialize embedding 798 | """ 799 | bias = np.sqrt(3.0 / input_embedding.size(1)) 800 | nn.init.uniform_(input_embedding, -bias, bias) 801 | 802 | def init_linear(input_linear): 803 | """ 804 | Initialize linear transformation 805 | """ 806 | bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1))) 807 | nn.init.uniform_(input_linear.weight, -bias, bias) 808 | if input_linear.bias is not None: 809 | input_linear.bias.data.zero_() 810 | 811 | def init_lstm(input_lstm): 812 | """ 813 | Initialize lstm 814 | """ 815 | for ind in range(0, input_lstm.num_layers): 816 | weight = eval('input_lstm.weight_ih_l'+str(ind)) 817 | bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1))) 818 | nn.init.uniform_(weight, -bias, bias) 819 | 
weight = eval('input_lstm.weight_hh_l'+str(ind)) 820 | bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1))) 821 | nn.init.uniform_(weight, -bias, bias) 822 | 823 | if input_lstm.bias: 824 | for ind in range(0, input_lstm.num_layers): 825 | weight = eval('input_lstm.bias_ih_l'+str(ind)) 826 | weight.data.zero_() 827 | weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1 828 | weight = eval('input_lstm.bias_hh_l'+str(ind)) 829 | weight.data.zero_() 830 | weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1 831 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.22.0 2 | tqdm 3 | http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl -------------------------------------------------------------------------------- /seq_w.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import datetime 3 | import time 4 | import torch 5 | import torch.autograd as autograd 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import codecs 9 | from model.crf import * 10 | from model.lstm_crf import * 11 | import model.utils as utils 12 | from model.predictor import predict_w 13 | 14 | import argparse 15 | import json 16 | import os 17 | import sys 18 | from tqdm import tqdm 19 | import itertools 20 | import functools 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser(description='Evaluating LM-BLSTM-CRF') 24 | parser.add_argument('--load_arg', default='./checkpoint/ner/ner_4_cwlm_lstm_crf.json', help='path to arg json') 25 | parser.add_argument('--load_check_point', default='./checkpoint/ner/ner_4_cwlm_lstm_crf.model', help='path to model checkpoint file') 26 | parser.add_argument('--gpu',type=int, default=0, help='gpu id') 27 | parser.add_argument('--decode_type', choices=['label', 'string'], default='string', help='type of decode function, set `label` to couple label with text, or set `string` to insert label into test') 28 | parser.add_argument('--batch_size', type=int, default=50, help='size of batch') 29 | parser.add_argument('--input_file', default='data/ner2003/test.txt', help='path to input un-annotated corpus') 30 | parser.add_argument('--output_file', default='output.txt', help='path to output file') 31 | args = parser.parse_args() 32 | 33 | print('loading dictionary') 34 | with open(args.load_arg, 'r') as f: 35 | jd = json.load(f) 36 | jd = jd['args'] 37 | 38 | checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage) 39 | f_map = checkpoint_file['f_map'] 40 | l_map = checkpoint_file['l_map'] 41 | if args.gpu >= 0: 42 | torch.cuda.set_device(args.gpu) 43 | 44 | # loading corpus 45 | print('loading corpus') 46 | with codecs.open(args.input_file, 'r', 'utf-8') as f: 47 | lines = f.readlines() 48 | 49 | # converting format 50 | features = utils.read_features(lines) 51 | 52 | # build model 53 | print('loading model') 54 | ner_model = LSTM_CRF(len(f_map), len(l_map), jd['embedding_dim'], jd['hidden'], jd['layers'], jd['drop_out'], large_CRF=jd['small_crf']) 55 | 56 | ner_model.load_state_dict(checkpoint_file['state_dict']) 57 | 58 | if args.gpu >= 0: 59 | if_cuda = True 60 | torch.cuda.set_device(args.gpu) 61 | ner_model.cuda() 62 | else: 63 | if_cuda = False 64 | 65 | decode_label = (args.decode_type == 'label') 66 | 67 | predictor = predict_w(if_cuda, 
f_map, l_map, f_map['<eof>'], l_map['<pad>'], l_map['<start>'], decode_label, args.batch_size, jd['caseless']) 68 | 69 | print('annotating') 70 | with open(args.output_file, 'w') as fout: 71 | predictor.output_batch(ner_model, features, fout) -------------------------------------------------------------------------------- /seq_wc.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import datetime 3 | import time 4 | import torch 5 | import torch.autograd as autograd 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import codecs 9 | from model.crf import * 10 | from model.lm_lstm_crf import * 11 | import model.utils as utils 12 | from model.predictor import predict_wc 13 | 14 | import argparse 15 | import json 16 | import os 17 | import sys 18 | from tqdm import tqdm 19 | import itertools 20 | import functools 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser(description='Evaluating LM-BLSTM-CRF') 24 | parser.add_argument('--load_arg', default='./checkpoint/ner/ner_4_cwlm_lstm_crf.json', help='path to arg json') 25 | parser.add_argument('--load_check_point', default='./checkpoint/ner/ner_4_cwlm_lstm_crf.model', help='path to model checkpoint file') 26 | parser.add_argument('--gpu', type=int, default=0, help='gpu id') 27 | parser.add_argument('--decode_type', choices=['label', 'string'], default='string', help='type of decode function, set `label` to couple label with text, or set `string` to insert label into text') 28 | parser.add_argument('--batch_size', type=int, default=50, help='size of batch') 29 | parser.add_argument('--input_file', default='data/ner2003/test.txt', help='path to input un-annotated corpus') 30 | parser.add_argument('--output_file', default='output.txt', help='path to output file') 31 | args = parser.parse_args() 32 | 33 | print('loading dictionary') 34 | with open(args.load_arg, 'r') as f: 35 | jd = json.load(f) 36 | jd = jd['args'] 37 | 38 | checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage) 39 | f_map = checkpoint_file['f_map'] 40 | l_map = checkpoint_file['l_map'] 41 | c_map = checkpoint_file['c_map'] 42 | in_doc_words = checkpoint_file['in_doc_words'] 43 | if args.gpu >= 0: 44 | torch.cuda.set_device(args.gpu) 45 | 46 | # loading corpus 47 | print('loading corpus') 48 | with codecs.open(args.input_file, 'r', 'utf-8') as f: 49 | lines = f.readlines() 50 | 51 | # converting format 52 | features = utils.read_features(lines) 53 | 54 | # build model 55 | print('loading model') 56 | ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'], jd['char_hidden'], jd['char_layers'], jd['word_dim'], jd['word_hidden'], jd['word_layers'], len(f_map), jd['drop_out'], large_CRF=jd['small_crf'], if_highway=jd['high_way'], in_doc_words=in_doc_words, highway_layers = jd['highway_layers']) 57 | 58 | ner_model.load_state_dict(checkpoint_file['state_dict']) 59 | 60 | if args.gpu >= 0: 61 | if_cuda = True 62 | torch.cuda.set_device(args.gpu) 63 | ner_model.cuda() 64 | else: 65 | if_cuda = False 66 | 67 | decode_label = (args.decode_type == 'label') 68 | predictor = predict_wc(if_cuda, f_map, c_map, l_map, f_map['<eof>'], c_map['\n'], l_map['<pad>'], l_map['<start>'], decode_label, args.batch_size, jd['caseless']) 69 | 70 | print('annotating') 71 | with open(args.output_file, 'w') as fout: 72 | predictor.output_batch(ner_model, features, fout) -------------------------------------------------------------------------------- /train_w.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import datetime 3 | import time 4 | import torch 5 | import torch.autograd as autograd 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import codecs 9 | from model.crf import * 10 | from model.lstm_crf import * 11 | import model.utils as utils 12 | from model.evaluator import eval_w 13 | 14 | import argparse 15 | import json 16 | import os 17 | import sys 18 | from tqdm import tqdm 19 | import itertools 20 | import functools 21 | 22 | def eprint(*args, **kwargs): 23 | print(*args, file=sys.stderr, **kwargs) 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser(description='Learning with BLSTM-CRF') 27 | parser.add_argument('--rand_embedding', action='store_true', help='random initialize word embedding') 28 | parser.add_argument('--emb_file', default='./embedding/glove.6B.100d.txt', help='path to pre-trained embedding') 29 | parser.add_argument('--train_file', default='./data/ner2003/eng.train.iobes', help='path to training file') 30 | parser.add_argument('--dev_file', default='./data/ner2003/eng.testa.iobes', help='path to development file') 31 | parser.add_argument('--test_file', default='./data/ner2003/eng.testb.iobes', help='path to test file') 32 | parser.add_argument('--gpu', type=int, default=0, help='gpu id, set to -1 if use cpu mode') 33 | parser.add_argument('--batch_size', type=int, default=10, help='batch size (10)') 34 | parser.add_argument('--unk', default='unk', help='unknow-token in pre-trained embedding') 35 | parser.add_argument('--checkpoint', default='./checkpoint/', help='path to checkpoint prefix') 36 | parser.add_argument('--hidden', type=int, default=100, help='hidden dimension') 37 | parser.add_argument('--drop_out', type=float, default=0.55, help='dropout ratio') 38 | parser.add_argument('--epoch', type=int, default=200, help='maximum epoch number') 39 | parser.add_argument('--start_epoch', type=int, default=0, help='start epoch idx') 40 | parser.add_argument('--caseless', action='store_true', help='caseless or not') 41 | parser.add_argument('--embedding_dim', type=int, default=100, help='dimension for word embedding') 42 | parser.add_argument('--layers', type=int, default=1, help='number of lstm layers') 43 | parser.add_argument('--lr', type=float, default=0.015, help='initial learning rate') 44 | parser.add_argument('--lr_decay', type=float, default=0.05, help='decay ratio of learning rate') 45 | parser.add_argument('--fine_tune', action='store_false', help='fine tune pre-trained embedding dictionary') 46 | parser.add_argument('--load_check_point', default='', help='path of checkpoint') 47 | parser.add_argument('--load_opt', action='store_true', help='load optimizer from ') 48 | parser.add_argument('--update', choices=['sgd', 'adam'], default='sgd', help='optimizer method') 49 | parser.add_argument('--momentum', type=float, default=0.9, help='momentum for sgd') 50 | parser.add_argument('--clip_grad', type=float, default=5.0, help='grad clip at') 51 | parser.add_argument('--small_crf', action='store_false', help='use small crf instead of large crf, refer model.crf module for more details') 52 | parser.add_argument('--mini_count', type=float, default=5, help='thresholds to replace rare words with ') 53 | parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or accuracy alone') 54 | parser.add_argument('--patience', type=int, default=15, help='patience for early 
stop') 55 | parser.add_argument('--least_iters', type=int, default=50, help='at least train how many epochs before stop') 56 | parser.add_argument('--shrink_embedding', action='store_true', help='shrink the embedding dictionary to corpus (open this if pre-trained embedding dictionary is too large, but disable this may yield better results on external corpus)') 57 | args = parser.parse_args() 58 | 59 | if args.gpu >= 0: 60 | torch.cuda.set_device(args.gpu) 61 | 62 | print('setting:') 63 | print(args) 64 | 65 | # load corpus 66 | print('loading corpus') 67 | with codecs.open(args.train_file, 'r', 'utf-8') as f: 68 | lines = f.readlines() 69 | with codecs.open(args.dev_file, 'r', 'utf-8') as f: 70 | dev_lines = f.readlines() 71 | with codecs.open(args.test_file, 'r', 'utf-8') as f: 72 | test_lines = f.readlines() 73 | 74 | # converting format 75 | dev_features, dev_labels = utils.read_corpus(dev_lines) 76 | test_features, test_labels = utils.read_corpus(test_lines) 77 | 78 | if args.load_check_point: 79 | if os.path.isfile(args.load_check_point): 80 | print("loading checkpoint: '{}'".format(args.load_check_point)) 81 | checkpoint_file = torch.load(args.load_check_point) 82 | args.start_epoch = checkpoint_file['epoch'] 83 | f_map = checkpoint_file['f_map'] 84 | l_map = checkpoint_file['l_map'] 85 | train_features, train_labels = utils.read_corpus(lines) 86 | else: 87 | print("no checkpoint found at: '{}'".format(args.load_check_point)) 88 | else: 89 | print('constructing coding table') 90 | 91 | # converting format 92 | 93 | train_features, train_labels, f_map, l_map = utils.generate_corpus(lines, if_shrink_feature=True, thresholds=0) 94 | 95 | f_set = {v for v in f_map} 96 | f_map = utils.shrink_features(f_map, train_features, args.mini_count) 97 | 98 | dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_features), f_set) 99 | dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_features), dt_f_set) 100 | 101 | if not args.rand_embedding: 102 | print("feature size: '{}'".format(len(f_map))) 103 | print('loading embedding') 104 | if args.fine_tune: # which means does not do fine-tune 105 | f_map = {'': 0} 106 | f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set,args.caseless,args.unk, args.embedding_dim, shrink_to_corpus=args.shrink_embedding) 107 | print("embedding size: '{}'".format(len(f_map))) 108 | 109 | l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels)) 110 | l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set) 111 | for label in l_set: 112 | if label not in l_map: 113 | l_map[label] = len(l_map) 114 | 115 | # construct dataset 116 | dataset = utils.construct_bucket_mean_vb(train_features, train_labels, f_map, l_map, args.caseless) 117 | dev_dataset = utils.construct_bucket_mean_vb(dev_features, dev_labels, f_map, l_map, args.caseless) 118 | test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, args.caseless) 119 | 120 | dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset] 121 | dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset] 122 | test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset] 123 | 124 | # build model 125 | print('building model') 126 | ner_model = LSTM_CRF(len(f_map), len(l_map), 
args.embedding_dim, args.hidden, args.layers, args.drop_out, large_CRF=args.small_crf) 127 | 128 | if args.load_check_point: 129 | ner_model.load_state_dict(checkpoint_file['state_dict']) 130 | else: 131 | if not args.rand_embedding: 132 | ner_model.load_pretrained_embedding(embedding_tensor) 133 | print('random initialization') 134 | ner_model.rand_init(init_embedding=args.rand_embedding) 135 | 136 | if args.update == 'sgd': 137 | optimizer = optim.SGD(ner_model.parameters(), lr=args.lr, momentum=args.momentum) 138 | elif args.update == 'adam': 139 | optimizer = optim.Adam(ner_model.parameters(), lr=args.lr) 140 | 141 | 142 | if args.load_check_point and args.load_opt: 143 | optimizer.load_state_dict(checkpoint_file['optimizer']) 144 | 145 | crit = CRFLoss_vb(len(l_map), l_map['<start>'], l_map['<pad>']) 146 | 147 | if args.gpu >= 0: 148 | if_cuda = True 149 | print('device: ' + str(args.gpu)) 150 | torch.cuda.set_device(args.gpu) 151 | crit.cuda() 152 | ner_model.cuda() 153 | packer = CRFRepack(len(l_map), True) 154 | else: 155 | if_cuda = False 156 | packer = CRFRepack(len(l_map), False) 157 | 158 | if args.load_check_point and 'f' in args.eva_matrix: 159 | # evaluate the restored checkpoint once before training resumes 160 | checkpoint_evaluator = eval_w(packer, l_map, args.eva_matrix) 161 | (dev_f1, dev_pre, dev_rec, dev_acc, msg) = checkpoint_evaluator.calc_score(ner_model, dev_dataset_loader)['total'] 162 | (test_f1, test_pre, test_rec, test_acc, msg) = checkpoint_evaluator.calc_score(ner_model, test_dataset_loader)['total'] 163 | print('(checkpoint: dev F1 = %.4f, dev acc = %.4f, F1 on test = %.4f, acc on test= %.4f)' % 164 | (dev_f1, dev_acc, 165 | test_f1, test_acc)) 166 | 167 | tot_length = sum(map(lambda t: len(t), dataset_loader)) 168 | best_f1 = float('-inf') 169 | best_acc = float('-inf') 170 | track_list = list() 171 | start_time = time.time() 172 | epoch_list = range(args.start_epoch, args.start_epoch + args.epoch) 173 | patience_count = 0 174 | 175 | evaluator = eval_w(packer, l_map, args.eva_matrix) 176 | 177 | for epoch_idx, args.start_epoch in enumerate(epoch_list): 178 | 179 | epoch_loss = 0 180 | ner_model.train() 181 | 182 | for feature, tg, mask in tqdm( 183 | itertools.chain.from_iterable(dataset_loader), mininterval=2, 184 | desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch), leave=False, file=sys.stdout): 185 | 186 | fea_v, tg_v, mask_v = packer.repack_vb(feature, tg, mask) 187 | ner_model.zero_grad() 188 | scores, hidden = ner_model.forward(fea_v) 189 | loss = crit.forward(scores, tg_v, mask_v) 190 | loss.backward() 191 | nn.utils.clip_grad_norm_(ner_model.parameters(), args.clip_grad) 192 | optimizer.step() 193 | epoch_loss += utils.to_scalar(loss) 194 | 195 | # update lr 196 | utils.adjust_learning_rate(optimizer, args.lr / (1 + (args.start_epoch + 1) * args.lr_decay)) 197 | 198 | # average 199 | epoch_loss /= tot_length 200 | 201 | # eval & save check_point 202 | 203 | if 'f' in args.eva_matrix: 204 | dev_result = evaluator.calc_score(ner_model, dev_dataset_loader) 205 | for label, (dev_f1, dev_pre, dev_rec, dev_acc, msg) in dev_result.items(): 206 | print('DEV : %s : dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f | %s\n' % (label, dev_f1, dev_rec, dev_pre, dev_acc, msg)) 207 | (dev_f1, dev_pre, dev_rec, dev_acc, msg) = dev_result['total'] 208 | 209 | if dev_f1 > best_f1: 210 | patience_count = 0 211 | best_f1 = dev_f1 212 | 213 | test_result = evaluator.calc_score(ner_model, test_dataset_loader) 214 | for label, (test_f1, test_pre, test_rec, test_acc, msg) in test_result.items(): 215 | print('TEST : %s : test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f | %s\n' % (label, test_f1, test_rec, test_pre, test_acc, msg)) 216 | (test_f1, test_pre, test_rec, test_acc, msg) = 
test_result['total'] 217 | 218 | track_list.append( 219 | {'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc, 'test_f1': test_f1, 220 | 'test_acc': test_acc}) 221 | 222 | print( 223 | '(loss: %.4f, epoch: %d, dev F1 = %.4f, dev acc = %.4f, F1 on test = %.4f, acc on test= %.4f), saving...' % 224 | (epoch_loss, 225 | args.start_epoch, 226 | dev_f1, 227 | dev_acc, 228 | test_f1, 229 | test_acc)) 230 | 231 | try: 232 | utils.save_checkpoint({ 233 | 'epoch': args.start_epoch, 234 | 'state_dict': ner_model.state_dict(), 235 | 'optimizer': optimizer.state_dict(), 236 | 'f_map': f_map, 237 | 'l_map': l_map, 238 | }, {'track_list': track_list, 239 | 'args': vars(args) 240 | }, args.checkpoint + 'lstm_crf') 241 | except Exception as inst: 242 | print(inst) 243 | 244 | else: 245 | patience_count += 1 246 | print('(loss: %.4f, epoch: %d, dev F1 = %.4f, dev acc = %.4f)' % 247 | (epoch_loss, 248 | args.start_epoch, 249 | dev_f1, 250 | dev_acc)) 251 | track_list.append({'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc}) 252 | 253 | else: 254 | 255 | dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader) 256 | 257 | if dev_acc > best_acc: 258 | patience_count = 0 259 | best_acc = dev_acc 260 | 261 | test_acc = evaluator.calc_score(ner_model, test_dataset_loader) 262 | 263 | track_list.append( 264 | {'loss': epoch_loss, 'dev_acc': dev_acc, 'test_acc': test_acc}) 265 | 266 | print( 267 | '(loss: %.4f, epoch: %d, dev acc = %.4f, acc on test= %.4f), saving...' % 268 | (epoch_loss, 269 | args.start_epoch, 270 | dev_acc, 271 | test_acc)) 272 | 273 | try: 274 | utils.save_checkpoint({ 275 | 'epoch': args.start_epoch, 276 | 'state_dict': ner_model.state_dict(), 277 | 'optimizer': optimizer.state_dict(), 278 | 'f_map': f_map, 279 | 'l_map': l_map, 280 | }, {'track_list': track_list, 281 | 'args': vars(args) 282 | }, args.checkpoint + 'lstm_crf') 283 | except Exception as inst: 284 | print(inst) 285 | 286 | else: 287 | patience_count += 1 288 | print('(loss: %.4f, epoch: %d, dev acc = %.4f)' % 289 | (epoch_loss, 290 | args.start_epoch, 291 | dev_acc)) 292 | track_list.append({'loss': epoch_loss, 'dev_acc': dev_acc}) 293 | 294 | print('epoch: ' + str(args.start_epoch) + '\t in ' + str(args.epoch) + ' take: ' + str( 295 | time.time() - start_time) + ' s') 296 | 297 | if patience_count >= args.patience and args.start_epoch >= args.least_iters: 298 | break 299 | 300 | #print best 301 | if 'f' in args.eva_matrix: 302 | eprint(args.checkpoint + ' dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (dev_f1, dev_rec, dev_pre, dev_acc, test_f1, test_rec, test_pre, test_acc)) 303 | else: 304 | eprint(args.checkpoint + ' dev_acc: %.4f test_acc: %.4f\n' % (dev_acc, test_acc)) 305 | 306 | # printing summary 307 | print('setting:') 308 | print(args) 309 | -------------------------------------------------------------------------------- /train_wc.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import datetime 3 | import time 4 | import torch 5 | import torch.autograd as autograd 6 | import torch.nn as nn 7 | import torch.optim as optim 8 | import codecs 9 | from model.crf import * 10 | from model.lm_lstm_crf import * 11 | import model.utils as utils 12 | from model.evaluator import eval_wc 13 | 14 | import argparse 15 | import json 16 | import os 17 | import sys 18 | from tqdm import tqdm 19 | import itertools 20 | import functools 21 | 22 | def eprint(*args, 
**kwargs): 23 | print(*args, file=sys.stderr, **kwargs) 24 | 25 | if __name__ == "__main__": 26 | parser = argparse.ArgumentParser(description='Learning with LM-LSTM-CRF together with Language Model') 27 | parser.add_argument('--rand_embedding', action='store_true', help='random initialize word embedding') 28 | parser.add_argument('--emb_file', default='./embedding/glove.6B.100d.txt', help='path to pre-trained embedding') 29 | parser.add_argument('--train_file', default='./data/ner/eng.train.iobes', help='path to training file') 30 | parser.add_argument('--dev_file', default='./data/ner/eng.testa.iobes', help='path to development file') 31 | parser.add_argument('--test_file', default='./data/ner/eng.testb.iobes', help='path to test file') 32 | parser.add_argument('--gpu', type=int, default=0, help='gpu id') 33 | parser.add_argument('--batch_size', type=int, default=10, help='batch_size') 34 | parser.add_argument('--unk', default='unk', help='unknow-token in pre-trained embedding') 35 | parser.add_argument('--char_hidden', type=int, default=300, help='dimension of char-level layers') 36 | parser.add_argument('--word_hidden', type=int, default=300, help='dimension of word-level layers') 37 | parser.add_argument('--drop_out', type=float, default=0.55, help='dropout ratio') 38 | parser.add_argument('--epoch', type=int, default=200, help='maximum epoch number') 39 | parser.add_argument('--start_epoch', type=int, default=0, help='start point of epoch') 40 | parser.add_argument('--checkpoint', default='./checkpoint/', help='checkpoint path') 41 | parser.add_argument('--caseless', action='store_true', help='caseless or not') 42 | parser.add_argument('--char_dim', type=int, default=30, help='dimension of char embedding') 43 | parser.add_argument('--word_dim', type=int, default=100, help='dimension of word embedding') 44 | parser.add_argument('--char_layers', type=int, default=1, help='number of char level layers') 45 | parser.add_argument('--word_layers', type=int, default=1, help='number of word level layers') 46 | parser.add_argument('--lr', type=float, default=0.015, help='initial learning rate') 47 | parser.add_argument('--lr_decay', type=float, default=0.05, help='decay ratio of learning rate') 48 | parser.add_argument('--fine_tune', action='store_false', help='fine tune the diction of word embedding or not') 49 | parser.add_argument('--load_check_point', default='', help='path previous checkpoint that want to be loaded') 50 | parser.add_argument('--load_opt', action='store_true', help='also load optimizer from the checkpoint') 51 | parser.add_argument('--update', choices=['sgd', 'adam'], default='sgd', help='optimizer choice') 52 | parser.add_argument('--momentum', type=float, default=0.9, help='momentum for sgd') 53 | parser.add_argument('--clip_grad', type=float, default=5.0, help='clip grad at') 54 | parser.add_argument('--small_crf', action='store_false', help='use small crf instead of large crf, refer model.crf module for more details') 55 | parser.add_argument('--mini_count', type=float, default=5, help='thresholds to replace rare words with ') 56 | parser.add_argument('--lambda0', type=float, default=1, help='lambda0') 57 | parser.add_argument('--co_train', action='store_true', help='cotrain language model') 58 | parser.add_argument('--patience', type=int, default=15, help='patience for early stop') 59 | parser.add_argument('--high_way', action='store_true', help='use highway layers') 60 | parser.add_argument('--highway_layers', type=int, default=1, help='number of highway layers') 61 
| parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or accuracy alone') 62 | parser.add_argument('--least_iters', type=int, default=50, help='at least train how many epochs before stop') 63 | parser.add_argument('--shrink_embedding', action='store_true', help='shrink the embedding dictionary to corpus (open this if pre-trained embedding dictionary is too large, but disable this may yield better results on external corpus)') 64 | args = parser.parse_args() 65 | 66 | if args.gpu >= 0: 67 | torch.cuda.set_device(args.gpu) 68 | 69 | print('setting:') 70 | print(args) 71 | 72 | # load corpus 73 | print('loading corpus') 74 | with codecs.open(args.train_file, 'r', 'utf-8') as f: 75 | lines = f.readlines() 76 | with codecs.open(args.dev_file, 'r', 'utf-8') as f: 77 | dev_lines = f.readlines() 78 | with codecs.open(args.test_file, 'r', 'utf-8') as f: 79 | test_lines = f.readlines() 80 | 81 | dev_features, dev_labels = utils.read_corpus(dev_lines) 82 | test_features, test_labels = utils.read_corpus(test_lines) 83 | 84 | if args.load_check_point: 85 | if os.path.isfile(args.load_check_point): 86 | print("loading checkpoint: '{}'".format(args.load_check_point)) 87 | checkpoint_file = torch.load(args.load_check_point) 88 | args.start_epoch = checkpoint_file['epoch'] 89 | f_map = checkpoint_file['f_map'] 90 | l_map = checkpoint_file['l_map'] 91 | c_map = checkpoint_file['c_map'] 92 | in_doc_words = checkpoint_file['in_doc_words'] 93 | train_features, train_labels = utils.read_corpus(lines) 94 | else: 95 | print("no checkpoint found at: '{}'".format(args.load_check_point)) 96 | else: 97 | print('constructing coding table') 98 | 99 | # converting format 100 | train_features, train_labels, f_map, l_map, c_map = utils.generate_corpus_char(lines, if_shrink_c_feature=True, c_thresholds=args.mini_count, if_shrink_w_feature=False) 101 | 102 | f_set = {v for v in f_map} 103 | f_map = utils.shrink_features(f_map, train_features, args.mini_count) 104 | 105 | if args.rand_embedding: 106 | print("embedding size: '{}'".format(len(f_map))) 107 | in_doc_words = len(f_map) 108 | else: 109 | dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_features), f_set) 110 | dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_features), dt_f_set) 111 | print("feature size: '{}'".format(len(f_map))) 112 | print('loading embedding') 113 | if args.fine_tune: # which means does not do fine-tune 114 | f_map = {'': 0} 115 | f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk, args.word_dim, shrink_to_corpus=args.shrink_embedding) 116 | print("embedding size: '{}'".format(len(f_map))) 117 | 118 | l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels)) 119 | l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set) 120 | for label in l_set: 121 | if label not in l_map: 122 | l_map[label] = len(l_map) 123 | 124 | print('constructing dataset') 125 | # construct dataset 126 | dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(train_features, train_labels, l_map, c_map, f_map, args.caseless) 127 | dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(dev_features, dev_labels, l_map, c_map, f_map, args.caseless) 128 | test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, args.caseless) 129 | 130 | dataset_loader = 
[torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset] 131 | dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset] 132 | test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset] 133 | 134 | # build model 135 | print('building model') 136 | ner_model = LM_LSTM_CRF(len(l_map), len(c_map), args.char_dim, args.char_hidden, args.char_layers, args.word_dim, args.word_hidden, args.word_layers, len(f_map), args.drop_out, large_CRF=args.small_crf, if_highway=args.high_way, in_doc_words=in_doc_words, highway_layers = args.highway_layers) 137 | 138 | if args.load_check_point: 139 | ner_model.load_state_dict(checkpoint_file['state_dict']) 140 | else: 141 | if not args.rand_embedding: 142 | ner_model.load_pretrained_word_embedding(embedding_tensor) 143 | ner_model.rand_init(init_word_embedding=args.rand_embedding) 144 | 145 | if args.update == 'sgd': 146 | optimizer = optim.SGD(ner_model.parameters(), lr=args.lr, momentum=args.momentum) 147 | elif args.update == 'adam': 148 | optimizer = optim.Adam(ner_model.parameters(), lr=args.lr) 149 | 150 | if args.load_check_point and args.load_opt: 151 | optimizer.load_state_dict(checkpoint_file['optimizer']) 152 | 153 | crit_lm = nn.CrossEntropyLoss() 154 | crit_ner = CRFLoss_vb(len(l_map), l_map[''], l_map['']) 155 | 156 | if args.gpu >= 0: 157 | if_cuda = True 158 | print('device: ' + str(args.gpu)) 159 | torch.cuda.set_device(args.gpu) 160 | crit_ner.cuda() 161 | crit_lm.cuda() 162 | ner_model.cuda() 163 | packer = CRFRepack_WC(len(l_map), True) 164 | else: 165 | if_cuda = False 166 | packer = CRFRepack_WC(len(l_map), False) 167 | 168 | tot_length = sum(map(lambda t: len(t), dataset_loader)) 169 | 170 | best_f1 = float('-inf') 171 | best_acc = float('-inf') 172 | track_list = list() 173 | start_time = time.time() 174 | epoch_list = range(args.start_epoch, args.start_epoch + args.epoch) 175 | patience_count = 0 176 | 177 | evaluator = eval_wc(packer, l_map, args.eva_matrix) 178 | 179 | for epoch_idx, args.start_epoch in enumerate(epoch_list): 180 | 181 | epoch_loss = 0 182 | ner_model.train() 183 | for f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v in tqdm( 184 | itertools.chain.from_iterable(dataset_loader), mininterval=2, 185 | desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch), leave=False, file=sys.stdout): 186 | f_f, f_p, b_f, b_p, w_f, tg_v, mask_v = packer.repack_vb(f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v) 187 | ner_model.zero_grad() 188 | scores = ner_model(f_f, f_p, b_f, b_p, w_f) 189 | loss = crit_ner(scores, tg_v, mask_v) 190 | epoch_loss += utils.to_scalar(loss) 191 | if args.co_train: 192 | cf_p = f_p[0:-1, :].contiguous() 193 | cb_p = b_p[1:, :].contiguous() 194 | cf_y = w_f[1:, :].contiguous() 195 | cb_y = w_f[0:-1, :].contiguous() 196 | cfs, _ = ner_model.word_pre_train_forward(f_f, cf_p) 197 | loss = loss + args.lambda0 * crit_lm(cfs, cf_y.view(-1)) 198 | cbs, _ = ner_model.word_pre_train_backward(b_f, cb_p) 199 | loss = loss + args.lambda0 * crit_lm(cbs, cb_y.view(-1)) 200 | loss.backward() 201 | nn.utils.clip_grad_norm_(ner_model.parameters(), args.clip_grad) 202 | optimizer.step() 203 | epoch_loss /= tot_length 204 | 205 | # update lr 206 | if args.update == 'sgd': 207 | utils.adjust_learning_rate(optimizer, args.lr / (1 + (args.start_epoch + 1) * args.lr_decay)) 208 | 209 | # eval & save check_point 210 | 211 | if 'f' in args.eva_matrix: 212 | dev_result = 
evaluator.calc_score(ner_model, dev_dataset_loader) 213 | for label, (dev_f1, dev_pre, dev_rec, dev_acc, msg) in dev_result.items(): 214 | print('DEV : %s : dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f | %s\n' % (label, dev_f1, dev_rec, dev_pre, dev_acc, msg)) 215 | (dev_f1, dev_pre, dev_rec, dev_acc, msg) = dev_result['total'] 216 | 217 | if dev_f1 > best_f1: 218 | patience_count = 0 219 | best_f1 = dev_f1 220 | 221 | test_result = evaluator.calc_score(ner_model, test_dataset_loader) 222 | for label, (test_f1, test_pre, test_rec, test_acc, msg) in test_result.items(): 223 | print('TEST : %s : test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f | %s\n' % (label, test_f1, test_rec, test_pre, test_acc, msg)) 224 | (test_f1, test_pre, test_rec, test_acc, msg) = test_result['total'] 225 | 226 | track_list.append( 227 | {'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc, 'test_f1': test_f1, 228 | 'test_acc': test_acc}) 229 | 230 | print( 231 | '(loss: %.4f, epoch: %d, dev F1 = %.4f, dev acc = %.4f, F1 on test = %.4f, acc on test= %.4f), saving...' % 232 | (epoch_loss, 233 | args.start_epoch, 234 | dev_f1, 235 | dev_acc, 236 | test_f1, 237 | test_acc)) 238 | 239 | try: 240 | utils.save_checkpoint({ 241 | 'epoch': args.start_epoch, 242 | 'state_dict': ner_model.state_dict(), 243 | 'optimizer': optimizer.state_dict(), 244 | 'f_map': f_map, 245 | 'l_map': l_map, 246 | 'c_map': c_map, 247 | 'in_doc_words': in_doc_words 248 | }, {'track_list': track_list, 249 | 'args': vars(args) 250 | }, args.checkpoint + 'cwlm_lstm_crf') 251 | except Exception as inst: 252 | print(inst) 253 | 254 | else: 255 | patience_count += 1 256 | print('(loss: %.4f, epoch: %d, dev F1 = %.4f, dev acc = %.4f)' % 257 | (epoch_loss, 258 | args.start_epoch, 259 | dev_f1, 260 | dev_acc)) 261 | track_list.append({'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc}) 262 | 263 | else: 264 | 265 | dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader) 266 | 267 | if dev_acc > best_acc: 268 | patience_count = 0 269 | best_acc = dev_acc 270 | 271 | test_acc = evaluator.calc_score(ner_model, test_dataset_loader) 272 | 273 | track_list.append( 274 | {'loss': epoch_loss, 'dev_acc': dev_acc, 'test_acc': test_acc}) 275 | 276 | print( 277 | '(loss: %.4f, epoch: %d, dev acc = %.4f, acc on test= %.4f), saving...' 
% 278 | (epoch_loss, 279 | args.start_epoch, 280 | dev_acc, 281 | test_acc)) 282 | 283 | try: 284 | utils.save_checkpoint({ 285 | 'epoch': args.start_epoch, 286 | 'state_dict': ner_model.state_dict(), 287 | 'optimizer': optimizer.state_dict(), 288 | 'f_map': f_map, 289 | 'l_map': l_map, 290 | 'c_map': c_map, 291 | 'in_doc_words': in_doc_words 292 | }, {'track_list': track_list, 293 | 'args': vars(args) 294 | }, args.checkpoint + 'cwlm_lstm_crf') 295 | except Exception as inst: 296 | print(inst) 297 | 298 | else: 299 | patience_count += 1 300 | print('(loss: %.4f, epoch: %d, dev acc = %.4f)' % 301 | (epoch_loss, 302 | args.start_epoch, 303 | dev_acc)) 304 | track_list.append({'loss': epoch_loss, 'dev_acc': dev_acc}) 305 | 306 | print('epoch: ' + str(args.start_epoch) + '\t in ' + str(args.epoch) + ' take: ' + str( 307 | time.time() - start_time) + ' s') 308 | 309 | if patience_count >= args.patience and args.start_epoch >= args.least_iters: 310 | break 311 | 312 | #print best 313 | if 'f' in args.eva_matrix: 314 | eprint(args.checkpoint + ' dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (dev_f1, dev_rec, dev_pre, dev_acc, test_f1, test_rec, test_pre, test_acc)) 315 | else: 316 | eprint(args.checkpoint + ' dev_acc: %.4f test_acc: %.4f\n' % (dev_acc, test_acc)) 317 | 318 | # printing summary 319 | print('setting:') 320 | print(args) 321 | --------------------------------------------------------------------------------
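A minimal sketch of the label encoding used by construct_bucket_vb and construct_bucket_vb_wc in model/utils.py above: each gold tag sequence (with a leading start tag) is packed into pairwise indices prev_tag * label_size + cur_tag so the CRF loss can look up transition scores with a single index, then padded with (pad, pad) pairs up to the bucket threshold. The helper names below (pack_label_pairs, unpack_label_pairs) are illustrative only and do not exist in the repository.

def pack_label_pairs(labels, label_size, pad_label, padded_len):
    # labels[0] is the start tag, labels[1:] are the gold tags, mirroring the
    # list comprehension in construct_bucket_vb.
    n = len(labels) - 1                                   # number of real tokens
    pairs = [labels[i] * label_size + labels[i + 1] for i in range(n)]
    pairs.append(labels[n] * label_size + pad_label)      # last tag -> pad transition
    pairs += [pad_label * label_size + pad_label] * (padded_len - n - 1)
    return pairs

def unpack_label_pairs(pairs, label_size):
    # Recover (previous_tag, current_tag) from each packed index.
    return [(p // label_size, p % label_size) for p in pairs]

# Example with a 4-tag map {<start>: 0, B-PER: 1, I-PER: 2, <pad>: 3}:
# pack_label_pairs([0, 1, 2], label_size=4, pad_label=3, padded_len=5)
# returns [1, 6, 11, 15, 15]; the matching mask built by construct_bucket_vb
# for this example would be [1, 1, 1, 0, 0].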