├── .gitignore
├── LICENSE
├── README.md
├── docs
    ├── Makefile
    ├── framework.png
    └── source
    │   ├── conf.py
    │   ├── index.rst
    │   ├── model.rst
    │   └── modules.rst
├── eval_w.py
├── eval_wc.py
├── model
    ├── __init__.py
    ├── crf.py
    ├── evaluator.py
    ├── highway.py
    ├── lm_lstm_crf.py
    ├── lstm_crf.py
    ├── ner_dataset.py
    ├── predictor.py
    └── utils.py
├── requirements.txt
├── seq_w.py
├── seq_wc.py
├── train_w.py
└── train_wc.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | .DS_Store
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # IDEA
107 | .idea/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {2017} {Liyuan Liu}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # LM-LSTM-CRF 
  2 | 
  3 | [![Documentation Status](https://readthedocs.org/projects/lm-lstm-crf/badge/?version=latest)](http://lm-lstm-crf.readthedocs.io/en/latest/?badge=latest)
  4 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
  5 | [![Insight.io](https://insight.io/repoBadge/github.com/LiyuanLucasLiu/LM-LSTM-CRF)](https://insight.io/github.com/LiyuanLucasLiu/LM-LSTM-CRF)
  6 | 
  7 | **Check Our New NER Toolkit🚀🚀🚀**
  8 | - **Inference**:
  9 |   - **[LightNER](https://github.com/LiyuanLucasLiu/LightNER)**: inference w. models pre-trained / trained w. *any* following tools, *efficiently*. 
 10 | - **Training**:
 11 |   - **[LD-Net](https://github.com/LiyuanLucasLiu/LD-Net)**: train NER models w. efficient contextualized representations.
 12 |   - **[VanillaNER](https://github.com/LiyuanLucasLiu/Vanilla_NER)**: train vanilla NER models w. pre-trained embedding.
 13 | - **Distant Training**:
 14 |   - **[AutoNER](https://shangjingbo1226.github.io/AutoNER/)**: train NER models w.o. line-by-line annotations and get competitive performance.
 15 | 
 16 | --------------------------------
 17 | 
 18 | This project provides high-performance character-aware sequence labeling tools, including [Training](#usage), [Evaluation](#evaluation) and [Prediction](#prediction). 
 19 | 
 20 | Details about LM-LSTM-CRF can be accessed [here](http://arxiv.org/abs/1709.04109), and the implementation is based on the PyTorch library. 
 21 | 
 22 | **Important:** A serious bug was found on the ```bioes_to_span``` function in the original implementation, please refer the numbers reported in the [Benchmarks](#benchmarks) section as the accurate performance.
 23 | 
 24 | The documents would be available [here](http://lm-lstm-crf.readthedocs.io/en/latest/).
 25 | 
 26 | ## Quick Links
 27 | 
 28 | - [Model](#model-notes)
 29 | - [Installation](#installation)
 30 | - [Data](#data)
 31 | - [Usage](#usage)
 32 | - [Benchmarks](#benchmarks)
 33 | - [Pretrained model](#pretrained-model)
 34 | 
 35 | ## Model Notes
 36 | 
 37 | <p align="center"><img width="100%" src="docs/framework.png"/></p>
 38 | 
 39 | As visualized above, we use conditional random field (CRF) to capture label dependencies, and adopt a hierarchical LSTM to leverage both char-level and word-level inputs. 
 40 | The char-level structure is further guided by a language model, while pre-trained word embeddings are leveraged in word-level.
 41 | The language model and the sequence labeling model are trained at the same time, and both make predictions at word-level.
 42 | [Highway networks]("https://arxiv.org/abs/1507.06228") are used to transform the output of char-level LSTM into different semantic spaces, and thus mediating these two tasks and allowing language model to empower sequence labeling.
 43 | 
 44 | ## Installation
 45 | 
 46 | For training, a GPU is strongly recommended for speed. CPU is supported but training could be extremely slow.
 47 | 
 48 | ### PyTorch
 49 | 
 50 | The code is based on PyTorch and **supports PyTorch 0.4 now** . You can find installation instructions [here](http://pytorch.org/).
 51 | 
 52 | ### Dependencies
 53 | 
 54 | The code is written in Python 3.6. Its dependencies are summarized in the file ```requirements.txt```. You can install these dependencies like this:
 55 | ```
 56 | pip3 install -r requirements.txt
 57 | ```
 58 | 
 59 | ## Data
 60 | 
 61 | We mainly focus on the CoNLL 2003 NER dataset, and the code takes its original format as input. 
 62 | However, due to the license issue, we are restricted to distribute this dataset.
 63 | You should be able to get it [here](http://aclweb.org/anthology/W03-0419).
 64 | You may also want to search online (e.g., Github), someone might release it accidentally.
 65 | 
 66 | ### Format
 67 | 
 68 | We assume the corpus is formatted as same as the CoNLL 2003 NER dataset.
 69 | More specifically, **empty lines** are used as separators between sentences, and the separator between documents is a special line as below.
 70 | ```
 71 | -DOCSTART- -X- -X- -X- O
 72 | ```
 73 | Other lines contains words, labels and other fields. **Word** must be the **first** field, **label** mush be the **last**, and these fields are **separated by space**.
 74 | For example, the first several lines in the WSJ portion of the PTB POS tagging corpus should be like the following snippet.
 75 | 
 76 | ```
 77 | -DOCSTART- -X- -X- -X- O
 78 | 
 79 | Pierre NNP
 80 | Vinken NNP
 81 | , ,
 82 | 61 CD
 83 | years NNS
 84 | old JJ
 85 | , ,
 86 | will MD
 87 | join VB
 88 | the DT
 89 | board NN
 90 | as IN
 91 | a DT
 92 | nonexecutive JJ
 93 | director NN
 94 | Nov. NNP
 95 | 29 CD
 96 | . .
 97 | 
 98 | 
 99 | ```
100 | 
101 | ## Usage
102 | 
103 | Here we provide implementations for two models, one is **LM-LSTM-CRF** and the other is its variant, **LSTM-CRF**, which only contains the word-level structure and CRF.
104 | ```train_wc.py``` and ```eval_wc.py``` are scripts for LM-LSTM-CRF, while ```train_w.py``` and ```eval_w.py``` are scripts for LSTM-CRF.
105 | The usages of these scripts can be accessed by the parameter ````-h````, i.e., 
106 | ```
107 | python train_wc.py -h
108 | python train_w.py -h
109 | python eval_wc.py -h
110 | python eval_w.py -h
111 | ```
112 | 
113 | The default running commands for NER and POS tagging, and NP Chunking are:
114 | 
115 | - Named Entity Recognition (NER):
116 | ```
117 | python train_wc.py --train_file ./data/ner/train.txt --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt --checkpoint ./checkpoint/ner_ --caseless --fine_tune --high_way --co_train --least_iters 100
118 | ```
119 | 
120 | - Part-of-Speech (POS) Tagging:
121 | ```
122 | python train_wc.py --train_file ./data/pos/train.txt --dev_file ./data/pos/testa.txt --test_file ./data/pos/testb.txt --eva_matrix a --checkpoint ./checkpoint/pos_ --caseless --fine_tune --high_way --co_train
123 | ```
124 | 
125 | - Noun Phrase (NP) Chunking:
126 | ```
127 | python train_wc.py --train_file ./data/np/train.txt.iobes --dev_file ./data/np/testa.txt.iobes --test_file ./data/np/testb.txt.iobes --checkpoint ./checkpoint/np_ --caseless --fine_tune --high_way --co_train --least_iters 100
128 | ```
129 | 
130 | For other datasets or tasks, you may wanna try different stopping parameters, especially, for smaller dataset, you may want to set ```least_iters``` to a larger value; and for some tasks, if the speed of loss decreasing is too slow, you may want to increase ```lr```.
131 | 
132 | ## Benchmarks
133 | 
134 | Here we compare LM-LSTM-CRF with recent state-of-the-art models on the CoNLL 2000 Chunking dataset, the CoNLL 2003 NER dataset, and the WSJ portion of the PTB POS Tagging dataset. All experiments are conducted on a GTX 1080 GPU.
135 | 
136 | A serious bug was found on the ```bioes_to_span``` function in the original implementation, please refer the following numbers as the accurate performance. 
137 | 
138 | ### NER
139 | 
140 | When models are only trained on the WSJ portion of the PTB POS Tagging dataset, the results are summarized as below.
141 | 
142 | |Model | Max(Acc) | Mean(Acc) | Std(Acc) | Time(h) |
143 | | ------------- |-------------| -----| -----| ---- |
144 | | LM-LSTM-CRF | **91.35** | **91.24** | 0.12 | 4 |
145 | | -- HighWay | 90.87 | 90.79 | 0.07 | 4 |
146 | | -- Co-Train | 91.23 | 90.95 | 0.34 | 2 |
147 | 
148 | ### POS
149 | 
150 | When models are only trained on the WSJ portion of the PTB POS Tagging dataset, the results are summarized as below.
151 | 
152 | |Model | Max(Acc) | Mean(Acc) | Std(Acc) | Reported(Acc) | Time(h) |
153 | | ------------- |-------------| -----| -----| -----| ---- |
154 | | [Lample et al. 2016](https://github.com/glample/tagger) | 97.51 | 97.35 | 0.09 | N/A | 37 |
155 | | [Ma et al. 2016](https://github.com/XuezheMax/LasagneNLP) | 97.46 | 97.42 | 0.04 | 97.55 | 21 |
156 | | LM-LSTM-CRF | **97.59** | **97.53** | 0.03 | | 16 |
157 | 
158 | ## Pretrained Model
159 | 
160 | ### Evaluation
161 | 
162 | We released pre-trained models on these three tasks. The checkpoint file can be downloaded at the following links. Notice that the NER model and Chunking model (coming soon) are trained on both the training set and the development set:
163 | 
164 | | WSJ-PTB POS Tagging |  CoNLL03 NER |
165 | | ------------------- | ------------------- |
166 | | [Args](https://drive.google.com/file/d/0B587SdKqutQmYmpiNFp6b1hKWEE/view?usp=sharing) | [Args](https://drive.google.com/file/d/1tGAQ0hu9AsIBdrqFn5fmDQ72Pk1I-o74/view?usp=sharing) | 
167 | | [Model](https://drive.google.com/file/d/0B587SdKqutQmNnR3Nnk1WHdIMG8/view?usp=sharing) | [Model](https://drive.google.com/file/d/1o9kjZV5EcHAhys3GPgl7EPGE5fuXyYjr/view?usp=sharing) | 
168 | 
169 | Also, ```eval_wc.py``` is provided to load and run these checkpoints. Its usage can be accessed by command ````python eval_wc.py -h````, and a running command example is provided below:
170 | ```
171 | python eval_wc.py --load_arg checkpoint/ner/ner_4_cwlm_lstm_crf.json --load_check_point checkpoint/ner_ner_4_cwlm_lstm_crf.model --gpu 0 --dev_file ./data/ner/testa.txt --test_file ./data/ner/testb.txt
172 | ```
173 | 
174 | ### Prediction
175 | 
176 | To annotated raw text, ```seq_wc.py``` is provided to annotate un-annotated text. Its usage can be accessed by command ````python seq_wc.py -h````, and a running command example is provided below:
177 | ```
178 | python seq_wc.py --load_arg checkpoint/ner/ner_4_cwlm_lstm_crf.json --load_check_point checkpoint/ner_ner_4_cwlm_lstm_crf.model --gpu 0 --input_file ./data/ner2003/test.txt --output_file output.txt
179 | ```
180 | 
181 | The input format is similar to CoNLL, but each line is required to only contain one field, token. For example, an input file could be:
182 | 
183 | ```
184 | -DOCSTART-
185 | 
186 | But
187 | China
188 | saw
189 | their
190 | luck
191 | desert
192 | them
193 | in
194 | the
195 | second
196 | match
197 | of
198 | the
199 | group
200 | ,
201 | crashing
202 | to
203 | a
204 | surprise
205 | 2-0
206 | defeat
207 | to
208 | newcomers
209 | Uzbekistan
210 | .
211 | ```
212 | and the corresponding output is:
213 | 
214 | ```
215 | -DOCSTART- -DOCSTART- -DOCSTART-
216 | 
217 | But <LOC> China </LOC> saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers <LOC> Uzbekistan </LOC> . 
218 | 
219 | ```
220 | 
221 | ## Reference
222 | 
223 | ```
224 | @inproceedings{2017arXiv170904109L,
225 |   title = "{Empower Sequence Labeling with Task-Aware Neural Language Model}", 
226 |   author = {{Liu}, L. and {Shang}, J. and {Xu}, F. and {Ren}, X. and {Gui}, H. and {Peng}, J. and {Han}, J.}, 
227 |   booktitle={AAAI},
228 |   year = 2018, 
229 | }
230 | ```
231 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python -msphinx
 7 | SPHINXPROJ    = LM-LSTM-CRF
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/docs/framework.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LiyuanLucasLiu/LM-LSTM-CRF/b03ecf37799dee8f899783e7c475698d29288bc6/docs/framework.png


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # LM-LSTM-CRF documentation build configuration file, created by
  5 | # sphinx-quickstart on Thu Sep 14 03:49:01 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | 
 20 | import os
 21 | import sys
 22 | 
 23 | sys.path.insert(0, os.path.abspath('../..'))
 24 | 
 25 | # -- General configuration ------------------------------------------------
 26 | 
 27 | # If your documentation needs a minimal Sphinx version, state it here.
 28 | #
 29 | # needs_sphinx = '1.0'
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be
 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 33 | # ones.
 34 | extensions = [
 35 |     'sphinx.ext.autodoc',
 36 |     'sphinx.ext.autosummary',
 37 |     'sphinx.ext.doctest',
 38 |     'sphinx.ext.intersphinx',
 39 |     'sphinx.ext.todo',
 40 |     'sphinx.ext.coverage',
 41 |     'sphinx.ext.mathjax',
 42 |     'sphinx.ext.napoleon',
 43 |     'sphinx.ext.viewcode',
 44 | ]
 45 | 
 46 | napoleon_use_ivar = True
 47 | 
 48 | # Add any paths that contain templates here, relative to this directory.
 49 | templates_path = ['_templates']
 50 | 
 51 | # The suffix(es) of source filenames.
 52 | # You can specify multiple suffix as a list of string:
 53 | #
 54 | # source_suffix = ['.rst', '.md']
 55 | source_suffix = '.rst'
 56 | 
 57 | # The master toctree document.
 58 | master_doc = 'index'
 59 | 
 60 | # General information about the project.
 61 | project = 'LM-LSTM-CRF'
 62 | copyright = '2017, Liyuan Liu, Frank Xu, Jingbo Shang'
 63 | author = 'Liyuan Liu, Frank Xu, Jingbo Shang'
 64 | 
 65 | # The version info for the project you're documenting, acts as replacement for
 66 | # |version| and |release|, also used in various other places throughout the
 67 | # built documents.
 68 | #
 69 | # The short X.Y version.
 70 | version = ''
 71 | # The full version, including alpha/beta/rc tags.
 72 | release = ''
 73 | 
 74 | # The language for content autogenerated by Sphinx. Refer to documentation
 75 | # for a list of supported languages.
 76 | #
 77 | # This is also used if you do content translation via gettext catalogs.
 78 | # Usually you set "language" from the command line for these cases.
 79 | language = None
 80 | 
 81 | # List of patterns, relative to source directory, that match files and
 82 | # directories to ignore when looking for source files.
 83 | # This patterns also effect to html_static_path and html_extra_path
 84 | exclude_patterns = []
 85 | 
 86 | # The name of the Pygments (syntax highlighting) style to use.
 87 | pygments_style = 'sphinx'
 88 | 
 89 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 90 | todo_include_todos = False
 91 | 
 92 | # -- Options for HTML output ----------------------------------------------
 93 | 
 94 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 95 | # a list of builtin themes.
 96 | #
 97 | html_theme = 'sphinx_rtd_theme'
 98 | 
 99 | # Theme options are theme-specific and customize the look and feel of a theme
100 | # further.  For a list of options available for each theme, see the
101 | # documentation.
102 | #
103 | # html_theme_options = {}
104 | html_theme_options = {
105 |     'collapse_navigation': False,
106 |     'display_version': True,
107 | }
108 | 
109 | # Add any paths that contain custom static files (such as style sheets) here,
110 | # relative to this directory. They are copied after the builtin static files,
111 | # so a file named "default.css" will overwrite the builtin "default.css".
112 | html_static_path = ['_static']
113 | 
114 | # Custom sidebar templates, must be a dictionary that maps document names
115 | # to template names.
116 | #
117 | # This is required for the alabaster theme
118 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
119 | html_sidebars = {
120 |     '**': [
121 |         'about.html',
122 |         'navigation.html',
123 |         'relations.html',  # needs 'show_related': True theme option to display
124 |         'searchbox.html',
125 |         'donate.html',
126 |     ]
127 | }
128 | 
129 | # -- Options for HTMLHelp output ------------------------------------------
130 | 
131 | # Output file base name for HTML help builder.
132 | htmlhelp_basename = 'LM-LSTM-CRFdoc'
133 | 
134 | # -- Options for LaTeX output ---------------------------------------------
135 | 
136 | latex_elements = {
137 |     # The paper size ('letterpaper' or 'a4paper').
138 |     #
139 |     # 'papersize': 'letterpaper',
140 | 
141 |     # The font size ('10pt', '11pt' or '12pt').
142 |     #
143 |     # 'pointsize': '10pt',
144 | 
145 |     # Additional stuff for the LaTeX preamble.
146 |     #
147 |     # 'preamble': '',
148 | 
149 |     # Latex figure (float) alignment
150 |     #
151 |     # 'figure_align': 'htbp',
152 | }
153 | 
154 | # Grouping the document tree into LaTeX files. List of tuples
155 | # (source start file, target name, title,
156 | #  author, documentclass [howto, manual, or own class]).
157 | latex_documents = [
158 |     (master_doc, 'LM-LSTM-CRF.tex', 'LM-LSTM-CRF Documentation',
159 |      'Liyuan Liu, Frank Xu, Jingbo Shang', 'manual'),
160 | ]
161 | 
162 | # -- Options for manual page output ---------------------------------------
163 | 
164 | # One entry per manual page. List of tuples
165 | # (source start file, name, description, authors, manual section).
166 | man_pages = [
167 |     (master_doc, 'lm-lstm-crf', 'LM-LSTM-CRF Documentation',
168 |      [author], 1)
169 | ]
170 | 
171 | # -- Options for Texinfo output -------------------------------------------
172 | 
173 | # Grouping the document tree into Texinfo files. List of tuples
174 | # (source start file, target name, title, author,
175 | #  dir menu entry, description, category)
176 | texinfo_documents = [
177 |     (master_doc, 'LM-LSTM-CRF', 'LM-LSTM-CRF Documentation',
178 |      author, 'LM-LSTM-CRF', 'One line description of project.',
179 |      'Miscellaneous'),
180 | ]
181 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. LM-LSTM-CRF documentation master file, created by
 2 |    sphinx-quickstart on Thu Sep 14 03:49:01 2017.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | :github_url: https://github.com/LiyuanLucasLiu/LM-LSTM-CRF
 7 | 
 8 | LM-LSTM-CRF documentation
 9 | =========================
10 | 
11 | **Check Our New NER Toolkit🚀🚀🚀**
12 | 
13 | - **Inference**:
14 | 
15 |   - `LightNER <https://github.com/LiyuanLucasLiu/LightNER>`_: inference w. models pre-trained / trained w. *any* following tools, *efficiently*. 
16 | 
17 | - **Training**:
18 | 
19 |   - `LD-Net <https://github.com/LiyuanLucasLiu/LD-Net>`_: train NER models w. efficient contextualized representations.
20 |   - `VanillaNER <https://github.com/LiyuanLucasLiu/Vanilla_NER>`_: train vanilla NER models w. pre-trained embedding.
21 | 
22 | - **Distant Training**:
23 | 
24 |   - `AutoNER <https://shangjingbo1226.github.io/AutoNER/>`_: train NER models w.o. line-by-line annotations and get competitive performance.
25 | 
26 | --------------------------
27 | 
28 | This project provides high-performance character-aware sequence labeling tools, including [Training](#usage), [Evaluation](#evaluation) and [Prediction](#prediction). 
29 | 
30 | Details about LM-LSTM-CRF can be accessed `here <http://arxiv.org/abs/1709.04109>`_, and the implementation is based on the PyTorch library. 
31 | 
32 | .. toctree::
33 |    :glob:
34 |    :maxdepth: 1
35 |    :caption: Notes
36 | 
37 |    notes/*
38 | 
39 | .. toctree::
40 |    :maxdepth: 4
41 |    :caption: Package Reference
42 | 
43 |    model
44 | 
45 | 
46 | Indices and tables
47 | ==================
48 | 
49 | * :ref:`genindex`
50 | * :ref:`modindex`
51 | * :ref:`search`
52 | 


--------------------------------------------------------------------------------
/docs/source/model.rst:
--------------------------------------------------------------------------------
 1 | model package
 2 | =============
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | model\.crf module
 8 | -----------------
 9 | 
10 | .. automodule:: model.crf
11 |     :members:
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 
15 | model\.evaluator module
16 | -----------------------
17 | 
18 | .. automodule:: model.evaluator
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 | 
23 | model\.highway module
24 | ---------------------
25 | 
26 | .. automodule:: model.highway
27 |     :members:
28 |     :undoc-members:
29 |     :show-inheritance:
30 | 
31 | model\.lm\_lstm\_crf module
32 | ---------------------------
33 | 
34 | .. automodule:: model.lm_lstm_crf
35 |     :members:
36 |     :undoc-members:
37 |     :show-inheritance:
38 | 
39 | model\.lstm\_crf module
40 | -----------------------
41 | 
42 | .. automodule:: model.lstm_crf
43 |     :members:
44 |     :undoc-members:
45 |     :show-inheritance:
46 | 
47 | model\.ner\_dataset module
48 | --------------------------
49 | 
50 | .. automodule:: model.ner_dataset
51 |     :members:
52 |     :undoc-members:
53 |     :show-inheritance:
54 | 
55 | model\.utils module
56 | -------------------
57 | 
58 | .. automodule:: model.utils
59 |     :members:
60 |     :undoc-members:
61 |     :show-inheritance:
62 | 
63 | 
64 | Module contents
65 | ---------------
66 | 
67 | .. automodule:: model
68 |     :members:
69 |     :undoc-members:
70 |     :show-inheritance:
71 | 


--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | model
2 | =====
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    model
8 | 


--------------------------------------------------------------------------------
/eval_w.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from __future__ import print_function
 3 | import datetime
 4 | import time
 5 | import torch
 6 | import torch.autograd as autograd
 7 | import torch.nn as nn
 8 | import torch.optim as optim
 9 | import codecs
10 | from model.crf import *
11 | from model.lstm_crf import *
12 | import model.utils as utils
13 | from model.evaluator import eval_w
14 | 
15 | import argparse
16 | import json
17 | import os
18 | import sys
19 | from tqdm import tqdm
20 | import itertools
21 | import functools
22 | 
23 | if __name__ == "__main__":
24 |     parser = argparse.ArgumentParser(description='Evaluating BLSTM-CRF')
25 |     parser.add_argument('--load_arg', default='./checkpoint/soa/check_wc_p_char_lstm_crf.json', help='arg json file path')
26 |     parser.add_argument('--load_check_point', default='./checkpoint/soa/check_wc_p_char_lstm_crf.model', help='checkpoint path')
27 |     parser.add_argument('--gpu',type=int, default=0, help='gpu id')
28 |     parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or accuracy alone')
29 |     parser.add_argument('--test_file', default='', help='path to test file, if set to none, would use test_file path in the checkpoint file')
30 |     args = parser.parse_args()
31 | 
32 |     with open(args.load_arg, 'r') as f:
33 |         jd = json.load(f)
34 |     jd = jd['args']
35 | 
36 |     checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage)
37 |     f_map = checkpoint_file['f_map']
38 |     l_map = checkpoint_file['l_map']
39 |     if args.gpu >= 0:
40 |         torch.cuda.set_device(args.gpu)
41 | 
42 | 
43 |     # load corpus
44 | 
45 |     if args.test_file:
46 |         with codecs.open(args.test_file, 'r', 'utf-8') as f:
47 |             test_lines = f.readlines()
48 |     else:
49 |         with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
50 |             test_lines = f.readlines()
51 | 
52 |     # converting format
53 | 
54 |     test_features, test_labels = utils.read_corpus(test_lines)
55 | 
56 |     # construct dataset
57 |     test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, jd['caseless'])
58 |     
59 |     test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]
60 | 
61 |     # build model
62 |     ner_model = LSTM_CRF(len(f_map), len(l_map), jd['embedding_dim'], jd['hidden'], jd['layers'], jd['drop_out'], large_CRF=jd['small_crf'])
63 | 
64 |     ner_model.load_state_dict(checkpoint_file['state_dict'])
65 | 
66 |     if args.gpu >= 0:
67 |         if_cuda = True
68 |         torch.cuda.set_device(args.gpu)
69 |         ner_model.cuda()
70 |         packer = CRFRepack(len(l_map), True)
71 |     else:
72 |         if_cuda = False
73 |         packer = CRFRepack(len(l_map), False)
74 | 
75 |     evaluator = eval_w(packer, l_map, args.eva_matrix)
76 | 
77 |     if 'f' in args.eva_matrix:
78 | 
79 |         test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(ner_model, test_dataset_loader)
80 | 
81 |         print(jd['checkpoint'] + ' test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (test_f1, test_rec, test_pre, test_acc))
82 | 
83 |     else:
84 | 
85 |         test_acc = evaluator.calc_score(ner_model, test_dataset_loader)
86 | 
87 |         print(jd['checkpoint'] + ' test_acc: %.4f\n' % (test_acc))
88 | 


--------------------------------------------------------------------------------
/eval_wc.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from __future__ import print_function
 3 | import datetime
 4 | import time
 5 | import torch
 6 | import torch.autograd as autograd
 7 | import torch.nn as nn
 8 | import torch.optim as optim
 9 | import codecs
10 | from model.crf import *
11 | from model.lm_lstm_crf import *
12 | import model.utils as utils
13 | from model.evaluator import eval_wc
14 | 
15 | import argparse
16 | import json
17 | import os
18 | import sys
19 | from tqdm import tqdm
20 | import itertools
21 | import functools
22 | 
23 | if __name__ == "__main__":
24 |     parser = argparse.ArgumentParser(description='Evaluating LM-BLSTM-CRF')
25 |     parser.add_argument('--load_arg', default='./checkpoint/soa/check_wc_p_char_lstm_crf.json', help='path to arg json')
26 |     parser.add_argument('--load_check_point', default='./checkpoint/soa/check_wc_p_char_lstm_crf.model', help='path to model checkpoint file')
27 |     parser.add_argument('--gpu',type=int, default=0, help='gpu id')
28 |     parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or f1 alone')
29 |     parser.add_argument('--test_file', default='', help='path to test file, if set to none, would use test_file path in the checkpoint file')
30 |     args = parser.parse_args()
31 | 
32 |     with open(args.load_arg, 'r') as f:
33 |         jd = json.load(f)
34 |     jd = jd['args']
35 | 
36 |     checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage)
37 |     f_map = checkpoint_file['f_map']
38 |     l_map = checkpoint_file['l_map']
39 |     c_map = checkpoint_file['c_map']
40 |     in_doc_words = checkpoint_file['in_doc_words']
41 |     if args.gpu >= 0:
42 |         torch.cuda.set_device(args.gpu)
43 | 
44 | 
45 |     # load corpus
46 |     if args.test_file:
47 |         with codecs.open(args.test_file, 'r', 'utf-8') as f:
48 |             test_lines = f.readlines()
49 |     else:
50 |         with codecs.open(jd['test_file'], 'r', 'utf-8') as f:
51 |             test_lines = f.readlines()
52 | 
53 |     # converting format
54 | 
55 |     test_features, test_labels = utils.read_corpus(test_lines)
56 | 
57 |     # construct dataset
58 |     test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, jd['caseless'])
59 |     
60 |     test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]
61 | 
62 |     # build model
63 |     ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'], jd['char_hidden'], jd['char_layers'], jd['word_dim'], jd['word_hidden'], jd['word_layers'], len(f_map), jd['drop_out'], large_CRF=jd['small_crf'], if_highway=jd['high_way'], in_doc_words=in_doc_words, highway_layers = jd['highway_layers'])
64 | 
65 |     ner_model.load_state_dict(checkpoint_file['state_dict'])
66 | 
67 |     if args.gpu >= 0:
68 |         if_cuda = True
69 |         torch.cuda.set_device(args.gpu)
70 |         ner_model.cuda()
71 |         packer = CRFRepack_WC(len(l_map), True)
72 |     else:
73 |         if_cuda = False
74 |         packer = CRFRepack_WC(len(l_map), False)
75 | 
76 |     evaluator = eval_wc(packer, l_map, args.eva_matrix)
77 | 
78 |     print('start')
79 |     if 'f' in args.eva_matrix:
80 | 
81 |         result = evaluator.calc_score(ner_model, test_dataset_loader)
82 |         for label, (test_f1, test_pre, test_rec, test_acc, msg) in result.items():
83 |             print(jd['checkpoint'] +' : %s : test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f | %s\n' % (label, test_f1, test_rec, test_pre, test_acc, msg))
84 | 
85 |     else:
86 | 
87 |         test_acc = evaluator.calc_score(ner_model, test_dataset_loader)
88 | 
89 |         print(jd['checkpoint'] + ' test_acc: %.4f\n' % (test_acc))
90 |     print('end')
91 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = "Liyuan Liu and Frank Xu"
2 | __credits__ = ["Liyuan Liu", "Frank Xu", "Jingbo Shang"]
3 | 
4 | __license__ = "Apache License 2.0"
5 | __maintainer__ = "Liyuan Liu"
6 | __email__ = "llychinalz@gmail.com"


--------------------------------------------------------------------------------
/model/crf.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. module:: crf
  3 |     :synopsis: conditional random field
  4 | 
  5 | .. moduleauthor:: Liyuan Liu
  6 | """
  7 | 
  8 | import torch
  9 | import torch.autograd as autograd
 10 | import torch.nn as nn
 11 | import torch.optim as optim
 12 | import torch.sparse as sparse
 13 | import model.utils as utils
 14 | 
 15 | 
 16 | class CRF_L(nn.Module):
 17 |     """Conditional Random Field (CRF) layer. This version is used in Ma et al. 2016, has more parameters than CRF_S
 18 | 
 19 |     args:
 20 |         hidden_dim : input dim size
 21 |         tagset_size: target_set_size
 22 |         if_biase: whether allow bias in linear trans
 23 |     """
 24 | 
 25 | 
 26 |     def __init__(self, hidden_dim, tagset_size, if_bias=True):
 27 |         super(CRF_L, self).__init__()
 28 |         self.tagset_size = tagset_size
 29 |         self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size * self.tagset_size, bias=if_bias)
 30 | 
 31 |     def rand_init(self):
 32 |         """random initialization
 33 |         """
 34 |         utils.init_linear(self.hidden2tag)
 35 | 
 36 |     def forward(self, feats):
 37 |         """
 38 |         args:
 39 |             feats (batch_size, seq_len, hidden_dim) : input score from previous layers
 40 |         return:
 41 |             output from crf layer (batch_size, seq_len, tag_size, tag_size)
 42 |         """
 43 |         return self.hidden2tag(feats).view(-1, self.tagset_size, self.tagset_size)
 44 | 
 45 | 
 46 | class CRF_S(nn.Module):
 47 |     """Conditional Random Field (CRF) layer. This version is used in Lample et al. 2016, has less parameters than CRF_L.
 48 | 
 49 |     args:
 50 |         hidden_dim: input dim size
 51 |         tagset_size: target_set_size
 52 |         if_biase: whether allow bias in linear trans
 53 | 
 54 |     """
 55 | 
 56 |     def __init__(self, hidden_dim, tagset_size, if_bias=True):
 57 |         super(CRF_S, self).__init__()
 58 |         self.tagset_size = tagset_size
 59 |         self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size, bias=if_bias)
 60 |         self.transitions = nn.Parameter(torch.Tensor(self.tagset_size, self.tagset_size))
 61 | 
 62 |     def rand_init(self):
 63 |         """random initialization
 64 |         """
 65 |         utils.init_linear(self.hidden2tag)
 66 |         self.transitions.data.zero_()
 67 | 
 68 |     def forward(self, feats):
 69 |         """
 70 |         args:
 71 |             feats (batch_size, seq_len, hidden_dim) : input score from previous layers
 72 |         return:
 73 |             output from crf layer ( (batch_size * seq_len), tag_size, tag_size)
 74 |         """
 75 | 
 76 |         scores = self.hidden2tag(feats).view(-1, self.tagset_size, 1)
 77 |         ins_num = scores.size(0)
 78 |         crf_scores = scores.expand(ins_num, self.tagset_size, self.tagset_size) + self.transitions.view(1, self.tagset_size, self.tagset_size).expand(ins_num, self.tagset_size, self.tagset_size)
 79 | 
 80 |         return crf_scores
 81 | 
 82 | class CRFRepack:
 83 |     """Packer for word level model
 84 | 
 85 |     args:
 86 |         tagset_size: target_set_size
 87 |         if_cuda: whether use GPU
 88 |     """
 89 | 
 90 |     def __init__(self, tagset_size, if_cuda):
 91 | 
 92 |         self.tagset_size = tagset_size
 93 |         self.if_cuda = if_cuda
 94 | 
 95 |     def repack_vb(self, feature, target, mask):
 96 |         """packer for viterbi loss
 97 | 
 98 |         args:
 99 |             feature (Seq_len, Batch_size): input feature
100 |             target (Seq_len, Batch_size): output target
101 |             mask (Seq_len, Batch_size): padding mask
102 |         return:
103 |             feature (Seq_len, Batch_size), target (Seq_len, Batch_size), mask (Seq_len, Batch_size)
104 |         """
105 | 
106 |         if self.if_cuda:
107 |             fea_v = feature.transpose(0, 1).cuda()
108 |             tg_v = target.transpose(0, 1).unsqueeze(2).cuda()
109 |             mask_v = mask.transpose(0, 1).cuda()
110 |         else:
111 |             fea_v = feature.transpose(0, 1)
112 |             tg_v = target.transpose(0, 1).contiguous().unsqueeze(2)
113 |             mask_v = mask.transpose(0, 1).contiguous()
114 |         return fea_v, tg_v, mask_v
115 | 
116 |     def repack_gd(self, feature, target, current):
117 |         """packer for greedy loss
118 | 
119 |         args:
120 |             feature (Seq_len, Batch_size): input feature
121 |             target (Seq_len, Batch_size): output target
122 |             current (Seq_len, Batch_size): current state
123 |         return:
124 |             feature (Seq_len, Batch_size), target (Seq_len * Batch_size), current (Seq_len * Batch_size, 1, 1)
125 |         """
126 |         if self.if_cuda:
127 |             fea_v = feature.transpose(0, 1).cuda()
128 |             ts_v = target.transpose(0, 1).cuda().view(-1)
129 |             cs_v = current.transpose(0, 1).cuda().view(-1, 1, 1)
130 |         else:
131 |             fea_v = feature.transpose(0, 1)
132 |             ts_v = target.transpose(0, 1).contiguous().view(-1)
133 |             cs_v = current.transpose(0, 1).contiguous().view(-1, 1, 1)
134 |         return fea_v, ts_v, cs_v
135 | 
136 |     def convert_for_eval(self, target):
137 |         """convert target to original decoding
138 | 
139 |         args:
140 |             target: input labels used in training
141 |         return:
142 |             output labels used in test
143 |         """
144 |         return target % self.tagset_size
145 | 
146 | 
147 | class CRFRepack_WC:
148 |     """Packer for model with char-level and word-level
149 | 
150 |     args:
151 |         tagset_size: target_set_size
152 |         if_cuda: whether use GPU
153 | 
154 |     """
155 | 
156 |     def __init__(self, tagset_size, if_cuda):
157 | 
158 |         self.tagset_size = tagset_size
159 |         self.if_cuda = if_cuda
160 | 
161 |     def repack_vb(self, fc_feature, fc_position, bc_feature, bc_position, word_feature, target, mask, batch_len):
162 |         """packer for viterbi loss
163 | 
164 |         args:
165 |             fc_feature (Char_Seq_len, Batch_size) : forward_char input feature
166 |             fc_position (Word_Seq_len, Batch_size) : forward_char input position
167 |             bc_feature (Char_Seq_len, Batch_size) : backward_char input feature
168 |             bc_position (Word_Seq_len, Batch_size) : backward_char input position
169 |             word_feature (Word_Seq_len, Batch_size) : input word feature
170 |             target (Seq_len, Batch_size) : output target
171 |             mask (Word_Seq_len, Batch_size) : padding mask
172 |             batch_len (Batch_size, 2) : length of instances in one batch
173 |         return:
174 |             f_f (Char_Reduced_Seq_len, Batch_size), f_p (Word_Reduced_Seq_len, Batch_size), b_f (Char_Reduced_Seq_len, Batch_size), b_p (Word_Reduced_Seq_len, Batch_size), w_f (size Word_Seq_Len, Batch_size), target (Reduced_Seq_len, Batch_size), mask  (Word_Reduced_Seq_len, Batch_size)
175 | 
176 |         """
177 |         mlen, _ = batch_len.max(0)
178 |         mlen = mlen.squeeze()
179 |         ocl = bc_feature.size(1)
180 |         if self.if_cuda:
181 |             fc_feature = fc_feature[:, 0:mlen[0]].transpose(0, 1).cuda()
182 |             fc_position = fc_position[:, 0:mlen[1]].transpose(0, 1).cuda()
183 |             bc_feature = bc_feature[:, -mlen[0]:].transpose(0, 1).cuda()
184 |             bc_position = (bc_position[:, 0:mlen[1]] - ocl + mlen[0]).transpose(0, 1).cuda()
185 |             word_feature = word_feature[:, 0:mlen[1]].transpose(0, 1).cuda()
186 |             tg_v = target[:, 0:mlen[1]].transpose(0, 1).unsqueeze(2).cuda()
187 |             mask_v = mask[:, 0:mlen[1]].transpose(0, 1).cuda()
188 |         else:
189 |             fc_feature = fc_feature[:, 0:mlen[0]].transpose(0, 1)
190 |             fc_position = fc_position[:, 0:mlen[1]].transpose(0, 1)
191 |             bc_feature = bc_feature[:, -mlen[0]:].transpose(0, 1)
192 |             bc_position = (bc_position[:, 0:mlen[1]] - ocl + mlen[0]).transpose(0, 1)
193 |             word_feature = word_feature[:, 0:mlen[1]].transpose(0, 1)
194 |             tg_v = target[:, 0:mlen[1]].transpose(0, 1).unsqueeze(2)
195 |             mask_v = mask[:, 0:mlen[1]].transpose(0, 1).contiguous()
196 |         return fc_feature, fc_position, bc_feature, bc_position, word_feature, tg_v, mask_v
197 | 
198 |     def convert_for_eval(self, target):
199 |         """convert for eval
200 | 
201 |         args:
202 |             target: input labels used in training
203 |         return:
204 |             output labels used in test
205 |         """
206 |         return target % self.tagset_size
207 | 
208 | 
209 | class CRFLoss_gd(nn.Module):
210 |     """loss for greedy decode loss, i.e., although its for CRF Layer, we calculate the loss as
211 | 
212 |     .. math::
213 |         \sum_{j=1}^n \log (p(\hat{y}_{j+1}|z_{j+1}, \hat{y}_{j}))
214 | 
215 |     instead of
216 | 
217 |     .. math::
218 |         \sum_{j=1}^n \log (\phi(\hat{y}_{j-1}, \hat{y}_j, \mathbf{z}_j)) - \log (\sum_{\mathbf{y}' \in \mathbf{Y}(\mathbf{Z})} \prod_{j=1}^n \phi(y'_{j-1}, y'_j, \mathbf{z}_j) )
219 | 
220 |     args:
221 |         tagset_size: target_set_size
222 |         start_tag: ind for <start>
223 |         end_tag: ind for <pad>
224 |         average_batch: whether average the loss among batch
225 | 
226 |     """
227 | 
228 |     def __init__(self, tagset_size, start_tag, end_tag, average_batch=True):
229 |         super(CRFLoss_gd, self).__init__()
230 |         self.tagset_size = tagset_size
231 |         self.average_batch = average_batch
232 |         self.crit = nn.CrossEntropyLoss(size_average=self.average_batch)
233 | 
234 |     def forward(self, scores, target, current):
235 |         """
236 |         args:
237 |             scores (Word_Seq_len, Batch_size, target_size_from, target_size_to): crf scores
238 |             target (Word_Seq_len, Batch_size): golden list
239 |             current (Word_Seq_len, Batch_size): current state
240 |         return:
241 |             crf greedy loss
242 |         """
243 |         ins_num = current.size(0)
244 |         current = current.expand(ins_num, 1, self.tagset_size)
245 |         scores = scores.view(ins_num, self.tagset_size, self.tagset_size)
246 |         current_score = torch.gather(scores, 1, current).squeeze()
247 |         return self.crit(current_score, target)
248 | 
249 | 
250 | class CRFLoss_vb(nn.Module):
251 |     """loss for viterbi decode
252 | 
253 |     .. math::
254 |         \sum_{j=1}^n \log (\phi(\hat{y}_{j-1}, \hat{y}_j, \mathbf{z}_j)) - \log (\sum_{\mathbf{y}' \in \mathbf{Y}(\mathbf{Z})} \prod_{j=1}^n \phi(y'_{j-1}, y'_j, \mathbf{z}_j) )
255 | 
256 |     args:
257 |         tagset_size: target_set_size
258 |         start_tag: ind for <start>
259 |         end_tag: ind for <pad>
260 |         average_batch: whether average the loss among batch
261 | 
262 |     """
263 | 
264 |     def __init__(self, tagset_size, start_tag, end_tag, average_batch=True):
265 |         super(CRFLoss_vb, self).__init__()
266 |         self.tagset_size = tagset_size
267 |         self.start_tag = start_tag
268 |         self.end_tag = end_tag
269 |         self.average_batch = average_batch
270 | 
271 |     def forward(self, scores, target, mask):
272 |         """
273 |         args:
274 |             scores (seq_len, bat_size, target_size_from, target_size_to) : crf scores
275 |             target (seq_len, bat_size, 1) : golden state
276 |             mask (size seq_len, bat_size) : mask for padding
277 |         return:
278 |             loss
279 |         """
280 | 
281 |         # calculate batch size and seq len
282 |         seq_len = scores.size(0)
283 |         bat_size = scores.size(1)
284 | 
285 |         # calculate sentence score
286 |         tg_energy = torch.gather(scores.view(seq_len, bat_size, -1), 2, target).view(seq_len, bat_size)  # seq_len * bat_size
287 |         tg_energy = tg_energy.masked_select(mask).sum()
288 | 
289 |         # calculate forward partition score
290 | 
291 |         # build iter
292 |         seq_iter = enumerate(scores)
293 |         # the first score should start with <start>
294 |         _, inivalues = seq_iter.__next__()  # bat_size * from_target_size * to_target_size
295 |         # only need start from start_tag
296 |         partition = inivalues[:, self.start_tag, :].clone()  # bat_size * to_target_size
297 |         # iter over last scores
298 |         for idx, cur_values in seq_iter:
299 |             # previous to_target is current from_target
300 |             # partition: previous results log(exp(from_target)), #(batch_size * from_target)
301 |             # cur_values: bat_size * from_target * to_target
302 |             cur_values = cur_values + partition.contiguous().view(bat_size, self.tagset_size, 1).expand(bat_size, self.tagset_size, self.tagset_size)
303 |             cur_partition = utils.log_sum_exp(cur_values, self.tagset_size)
304 |             # (bat_size * from_target * to_target) -> (bat_size * to_target)
305 |             # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1)
306 |             mask_idx = mask[idx, :].view(bat_size, 1).expand(bat_size, self.tagset_size)
307 |             partition.masked_scatter_(mask_idx, cur_partition.masked_select(mask_idx))  #0 for partition, 1 for cur_partition
308 | 
309 |         #only need end at end_tag
310 |         partition = partition[:, self.end_tag].sum()
311 |         # average = mask.sum()
312 | 
313 |         # average_batch
314 |         if self.average_batch:
315 |             loss = (partition - tg_energy) / bat_size
316 |         else:
317 |             loss = (partition - tg_energy)
318 | 
319 |         return loss
320 | 
321 | class CRFDecode_vb():
322 |     """Batch-mode viterbi decode
323 | 
324 |     args:
325 |         tagset_size: target_set_size
326 |         start_tag: ind for <start>
327 |         end_tag: ind for <pad>
328 |         average_batch: whether average the loss among batch
329 | 
330 |     """
331 | 
332 |     def __init__(self, tagset_size, start_tag, end_tag, average_batch=True):
333 |         self.tagset_size = tagset_size
334 |         self.start_tag = start_tag
335 |         self.end_tag = end_tag
336 |         self.average_batch = average_batch
337 | 
338 |     def decode(self, scores, mask):
339 |         """Find the optimal path with viterbe decode
340 | 
341 |         args:
342 |             scores (size seq_len, bat_size, target_size_from, target_size_to) : crf scores
343 |             mask (seq_len, bat_size) : mask for padding
344 |         return:
345 |             decoded sequence (size seq_len, bat_size)
346 |         """
347 |         # calculate batch size and seq len
348 | 
349 |         seq_len = scores.size(0)
350 |         bat_size = scores.size(1)
351 | 
352 |         mask = 1 - mask
353 |         decode_idx = torch.LongTensor(seq_len-1, bat_size)
354 | 
355 |         # calculate forward score and checkpoint
356 | 
357 |         # build iter
358 |         seq_iter = enumerate(scores)
359 |         # the first score should start with <start>
360 |         _, inivalues = seq_iter.__next__()  # bat_size * from_target_size * to_target_size
361 |         # only need start from start_tag
362 |         forscores = inivalues[:, self.start_tag, :]  # bat_size * to_target_size
363 |         back_points = list()
364 |         # iter over last scores
365 |         for idx, cur_values in seq_iter:
366 |             # previous to_target is current from_target
367 |             # partition: previous results log(exp(from_target)), #(batch_size * from_target)
368 |             # cur_values: bat_size * from_target * to_target
369 |             cur_values = cur_values + forscores.contiguous().view(bat_size, self.tagset_size, 1).expand(bat_size, self.tagset_size, self.tagset_size)
370 | 
371 |             forscores, cur_bp = torch.max(cur_values, 1)
372 |             cur_bp.masked_fill_(mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size), self.end_tag)
373 |             back_points.append(cur_bp)
374 | 
375 |         pointer = back_points[-1][:, self.end_tag]
376 |         decode_idx[-1] = pointer
377 |         for idx in range(len(back_points)-2, -1, -1):
378 |             back_point = back_points[idx]
379 |             index = pointer.contiguous().view(-1,1)
380 |             pointer = torch.gather(back_point, 1, index).view(-1)
381 |             decode_idx[idx] = pointer
382 |         return decode_idx
383 | 


--------------------------------------------------------------------------------
/model/evaluator.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. module:: evaluator
  3 |     :synopsis: evaluation method (f1 score and accuracy)
  4 | 
  5 | .. moduleauthor:: Liyuan Liu, Frank Xu
  6 | """
  7 | 
  8 | 
  9 | import torch
 10 | import numpy as np
 11 | import itertools
 12 | 
 13 | import model.utils as utils
 14 | from torch.autograd import Variable
 15 | 
 16 | from model.crf import CRFDecode_vb
 17 | 
 18 | class eval_batch:
 19 |     """Base class for evaluation, provide method to calculate f1 score and accuracy
 20 | 
 21 |     args:
 22 |         packer: provide method to convert target into original space [TODO: need to improve]
 23 |         l_map: dictionary for labels
 24 |     """
 25 | 
 26 | 
 27 |     def __init__(self, packer, l_map):
 28 |         self.packer = packer
 29 |         self.l_map = l_map
 30 |         self.r_l_map = utils.revlut(l_map)
 31 |         self.totalp_counts={}
 32 |         self.truep_counts={}
 33 |         self.fn_counts={}
 34 |         self.fp_counts={}
 35 |         self.f1={}
 36 | 
 37 |     def reset(self):
 38 |         """
 39 |         re-set all states
 40 |         """
 41 |         self.correct_labels = 0
 42 |         self.total_labels = 0
 43 |         self.gold_count = 0
 44 |         self.guess_count = 0
 45 |         self.overlap_count = 0
 46 |         self.totalp_counts={}
 47 |         self.truep_counts={}
 48 |         self.fn_counts={}
 49 |         self.fp_counts={}
 50 |         self.f1={}
 51 | 
 52 |     def calc_f1_batch(self, decoded_data, target_data):
 53 |         """
 54 |         update statics for f1 score
 55 | 
 56 |         args:
 57 |             decoded_data (batch_size, seq_len): prediction sequence
 58 |             target_data (batch_size, seq_len): ground-truth
 59 |         """
 60 |         batch_decoded = torch.unbind(decoded_data, 1)
 61 |         batch_targets = torch.unbind(target_data, 0)
 62 | 
 63 |         for decoded, target in zip(batch_decoded, batch_targets):
 64 |             gold = self.packer.convert_for_eval(target)
 65 |             # remove padding
 66 |             length = utils.find_length_from_labels(gold, self.l_map)
 67 |             gold = gold[:length]
 68 |             best_path = decoded[:length]
 69 | 
 70 |             correct_labels_i, total_labels_i, gold_count_i, guess_count_i, overlap_count_i = self.eval_instance(best_path.numpy(), gold.numpy())
 71 |             self.correct_labels += correct_labels_i
 72 |             self.total_labels += total_labels_i
 73 |             self.gold_count += gold_count_i
 74 |             self.guess_count += guess_count_i
 75 |             self.overlap_count += overlap_count_i
 76 | 
 77 |     def calc_acc_batch(self, decoded_data, target_data):
 78 |         """
 79 |         update statics for accuracy
 80 | 
 81 |         args:
 82 |             decoded_data (batch_size, seq_len): prediction sequence
 83 |             target_data (batch_size, seq_len): ground-truth
 84 |         """
 85 |         batch_decoded = torch.unbind(decoded_data, 1)
 86 |         batch_targets = torch.unbind(target_data, 0)
 87 | 
 88 |         for decoded, target in zip(batch_decoded, batch_targets):
 89 |             gold = self.packer.convert_for_eval(target)
 90 |             # remove padding
 91 |             length = utils.find_length_from_labels(gold, self.l_map)
 92 |             gold = gold[:length].numpy()
 93 |             best_path = decoded[:length].numpy()
 94 | 
 95 |             self.total_labels += length
 96 |             self.correct_labels += np.sum(np.equal(best_path, gold))
 97 | 
 98 |     def f1_score(self):
 99 |         """
100 |         calculate f1 score based on statics
101 |         """
102 |         if self.guess_count == 0:
103 |             return {'total': (0.0, 0.0, 0.0, 0.0, '')}
104 |         precision = self.overlap_count / float(self.guess_count)
105 |         recall = self.overlap_count / float(self.gold_count)
106 |         if precision == 0.0 or recall == 0.0:
107 |             return {'total': (0.0, 0.0, 0.0, 0.0, '')}
108 |         f = 2 * (precision * recall) / (precision + recall)
109 |         accuracy = float(self.correct_labels) / self.total_labels
110 |         message=""
111 |         self.f1['total'] = (f, precision, recall, accuracy, message)
112 |         for label in self.totalp_counts:
113 |             tp = self.truep_counts.get(label,1)
114 |             fn = sum(self.fn_counts.get(label,{}).values())
115 |             fp = sum(self.fp_counts.get(label,{}).values())
116 |             # print(label, str(tp), str(fp), str(fn), str(self.totalp_counts.get(label,0)))
117 |             precision = tp / float(tp+fp+1e-9)
118 |             recall = tp / float(tp+fn+1e-9)
119 |             f = 2 * (precision * recall) / (precision + recall+1e-9)
120 |             message = str(self.fn_counts.get(label, {}))
121 |             self.f1[label] = (f, precision, recall, 0, message)
122 |         return self.f1
123 | 
124 |     def acc_score(self):
125 |         """
126 |         calculate accuracy score based on statics
127 |         """
128 |         if 0 == self.total_labels:
129 |             return 0.0
130 |         accuracy = float(self.correct_labels) / self.total_labels
131 |         return accuracy
132 | 
133 |     def eval_instance(self, best_path, gold):
134 |         """
135 |         update statics for one instance
136 | 
137 |         args:
138 |             best_path (seq_len): predicted
139 |             gold (seq_len): ground-truth
140 |         """
141 | 
142 |         total_labels = len(best_path)
143 |         correct_labels = np.sum(np.equal(best_path, gold))
144 |         for i in range(total_labels):
145 |             gold_label = self.r_l_map[gold[i]]
146 |             guessed_label = self.r_l_map[best_path[i]]
147 |             self.totalp_counts[gold_label] = 1 + self.totalp_counts.get(gold_label,0)
148 |             if gold_label == guessed_label:
149 |                 self.truep_counts[gold_label] = 1 + self.truep_counts.get(gold_label,0)
150 |             else:
151 |                 val = self.fn_counts.get(gold_label,{})
152 |                 val[guessed_label] = 1+ val.get(guessed_label,0)
153 |                 self.fn_counts[gold_label]=val
154 | 
155 |                 val2 = self.fp_counts.get(guessed_label,{})
156 |                 val2[gold_label] = 1+ val2.get(gold_label,0)
157 |                 self.fp_counts[guessed_label] = val2
158 | 
159 |         gold_chunks = utils.iobes_to_spans(gold, self.r_l_map)
160 |         gold_count = len(gold_chunks)
161 | 
162 |         guess_chunks = utils.iobes_to_spans(best_path, self.r_l_map)
163 |         guess_count = len(guess_chunks)
164 | 
165 |         overlap_chunks = gold_chunks & guess_chunks
166 |         overlap_count = len(overlap_chunks)
167 | 
168 |         return correct_labels, total_labels, gold_count, guess_count, overlap_count
169 | 
170 | class eval_w(eval_batch):
171 |     """evaluation class for word level model (LSTM-CRF)
172 | 
173 |     args:
174 |         packer: provide method to convert target into original space [TODO: need to improve]
175 |         l_map: dictionary for labels
176 |         score_type: use f1score with using 'f'
177 | 
178 |     """
179 | 
180 |     def __init__(self, packer, l_map, score_type):
181 |         eval_batch.__init__(self, packer, l_map)
182 | 
183 |         self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'], l_map['<pad>'])
184 | 
185 |         if 'f' in score_type:
186 |             self.eval_b = self.calc_f1_batch
187 |             self.calc_s = self.f1_score
188 |         else:
189 |             self.eval_b = self.calc_acc_batch
190 |             self.calc_s = self.acc_score
191 | 
192 |     def calc_score(self, ner_model, dataset_loader):
193 |         """
194 |         calculate score for pre-selected metrics
195 | 
196 |         args:
197 |             ner_model: LSTM-CRF model
198 |             dataset_loader: loader class for test set
199 |         """
200 |         ner_model.eval()
201 |         self.reset()
202 | 
203 |         for feature, tg, mask in itertools.chain.from_iterable(dataset_loader):
204 |             fea_v, _, mask_v = self.packer.repack_vb(feature, tg, mask)
205 |             scores, _ = ner_model(fea_v)
206 |             decoded = self.decoder.decode(scores.data, mask_v.data)
207 |             self.eval_b(decoded, tg)
208 | 
209 |         return self.calc_s()
210 | 
211 | class eval_wc(eval_batch):
212 |     """evaluation class for LM-LSTM-CRF
213 | 
214 |     args:
215 |         packer: provide method to convert target into original space [TODO: need to improve]
216 |         l_map: dictionary for labels
217 |         score_type: use f1score with using 'f'
218 | 
219 |     """
220 | 
221 |     def __init__(self, packer, l_map, score_type):
222 |         eval_batch.__init__(self, packer, l_map)
223 | 
224 |         self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'], l_map['<pad>'])
225 | 
226 |         if 'f' in score_type:
227 |             self.eval_b = self.calc_f1_batch
228 |             self.calc_s = self.f1_score
229 |         else:
230 |             self.eval_b = self.calc_acc_batch
231 |             self.calc_s = self.acc_score
232 | 
233 |     def calc_score(self, ner_model, dataset_loader):
234 |         """
235 |         calculate score for pre-selected metrics
236 | 
237 |         args:
238 |             ner_model: LM-LSTM-CRF model
239 |             dataset_loader: loader class for test set
240 |         """
241 |         ner_model.eval()
242 |         self.reset()
243 | 
244 |         for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v in itertools.chain.from_iterable(dataset_loader):
245 |             f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v)
246 |             scores = ner_model(f_f, f_p, b_f, b_p, w_f)
247 |             decoded = self.decoder.decode(scores.data, mask_v.data)
248 |             self.eval_b(decoded, tg)
249 | 
250 |         return self.calc_s()
251 | 


--------------------------------------------------------------------------------
/model/highway.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. module:: highway
 3 |     :synopsis: highway network
 4 |  
 5 | .. moduleauthor:: Liyuan Liu
 6 | """
 7 | 
 8 | import torch
 9 | import torch.nn as nn
10 | import model.utils as utils
11 | 
12 | class hw(nn.Module):
13 |     """Highway layers
14 | 
15 |     args: 
16 |         size: input and output dimension
17 |         dropout_ratio: dropout ratio
18 |     """
19 |    
20 |     def __init__(self, size, num_layers = 1, dropout_ratio = 0.5):
21 |         super(hw, self).__init__()
22 |         self.size = size
23 |         self.num_layers = num_layers
24 |         self.trans = nn.ModuleList()
25 |         self.gate = nn.ModuleList()
26 |         self.dropout = nn.Dropout(p=dropout_ratio)
27 | 
28 |         for i in range(num_layers):
29 |             tmptrans = nn.Linear(size, size)
30 |             tmpgate = nn.Linear(size, size)
31 |             self.trans.append(tmptrans)
32 |             self.gate.append(tmpgate)
33 | 
34 |     def rand_init(self):
35 |         """
36 |         random initialization
37 |         """
38 |         for i in range(self.num_layers):
39 |             utils.init_linear(self.trans[i])
40 |             utils.init_linear(self.gate[i])
41 | 
42 |     def forward(self, x):
43 |         """
44 |         update statics for f1 score
45 | 
46 |         args: 
47 |             x (ins_num, hidden_dim): input tensor
48 |         return:
49 |             output tensor (ins_num, hidden_dim)
50 |         """
51 |         
52 |         
53 |         g = nn.functional.sigmoid(self.gate[0](x))
54 |         h = nn.functional.relu(self.trans[0](x))
55 |         x = g * h + (1 - g) * x
56 | 
57 |         for i in range(1, self.num_layers):
58 |             x = self.dropout(x)
59 |             g = nn.functional.sigmoid(self.gate[i](x))
60 |             h = nn.functional.relu(self.trans[i](x))
61 |             x = g * h + (1 - g) * x
62 | 
63 |         return x


--------------------------------------------------------------------------------
/model/lm_lstm_crf.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. module:: lm_lstm_crf
  3 |     :synopsis: lm_lstm_crf
  4 | 
  5 | .. moduleauthor:: Liyuan Liu
  6 | """
  7 | 
  8 | import torch
  9 | import torch.autograd as autograd
 10 | import torch.nn as nn
 11 | import torch.optim as optim
 12 | import numpy as np
 13 | import model.crf as crf
 14 | import model.utils as utils
 15 | import model.highway as highway
 16 | 
 17 | class LM_LSTM_CRF(nn.Module):
 18 |     """LM_LSTM_CRF model
 19 | 
 20 |     args:
 21 |         tagset_size: size of label set
 22 |         char_size: size of char dictionary
 23 |         char_dim: size of char embedding
 24 |         char_hidden_dim: size of char-level lstm hidden dim
 25 |         char_rnn_layers: number of char-level lstm layers
 26 |         embedding_dim: size of word embedding
 27 |         word_hidden_dim: size of word-level blstm hidden dim
 28 |         word_rnn_layers: number of word-level lstm layers
 29 |         vocab_size: size of word dictionary
 30 |         dropout_ratio: dropout ratio
 31 |         large_CRF: use CRF_L or not, refer model.crf.CRF_L and model.crf.CRF_S for more details
 32 |         if_highway: use highway layers or not
 33 |         in_doc_words: number of words that occurred in the corpus (used for language model prediction)
 34 |         highway_layers: number of highway layers
 35 |     """
 36 | 
 37 |     def __init__(self, tagset_size, char_size, char_dim, char_hidden_dim, char_rnn_layers, embedding_dim, word_hidden_dim, word_rnn_layers, vocab_size, dropout_ratio, large_CRF=True, if_highway = False, in_doc_words = 2, highway_layers = 1):
 38 | 
 39 |         super(LM_LSTM_CRF, self).__init__()
 40 |         self.char_dim = char_dim
 41 |         self.char_hidden_dim = char_hidden_dim
 42 |         self.char_size = char_size
 43 |         self.word_dim = embedding_dim
 44 |         self.word_hidden_dim = word_hidden_dim
 45 |         self.word_size = vocab_size
 46 |         self.if_highway = if_highway
 47 | 
 48 |         self.char_embeds = nn.Embedding(char_size, char_dim)
 49 |         self.forw_char_lstm = nn.LSTM(char_dim, char_hidden_dim, num_layers=char_rnn_layers, bidirectional=False, dropout=dropout_ratio)
 50 |         self.back_char_lstm = nn.LSTM(char_dim, char_hidden_dim, num_layers=char_rnn_layers, bidirectional=False, dropout=dropout_ratio)
 51 |         self.char_rnn_layers = char_rnn_layers
 52 | 
 53 |         self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
 54 | 
 55 |         self.word_lstm = nn.LSTM(embedding_dim + char_hidden_dim * 2, word_hidden_dim // 2, num_layers=word_rnn_layers, bidirectional=True, dropout=dropout_ratio)
 56 | 
 57 |         self.word_rnn_layers = word_rnn_layers
 58 | 
 59 |         self.dropout = nn.Dropout(p=dropout_ratio)
 60 | 
 61 |         self.tagset_size = tagset_size
 62 |         if large_CRF:
 63 |             self.crf = crf.CRF_L(word_hidden_dim, tagset_size)
 64 |         else:
 65 |             self.crf = crf.CRF_S(word_hidden_dim, tagset_size)
 66 | 
 67 |         if if_highway:
 68 |             self.forw2char = highway.hw(char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio)
 69 |             self.back2char = highway.hw(char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio)
 70 |             self.forw2word = highway.hw(char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio)
 71 |             self.back2word = highway.hw(char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio)
 72 |             self.fb2char = highway.hw(2 * char_hidden_dim, num_layers=highway_layers, dropout_ratio=dropout_ratio)
 73 | 
 74 |         self.char_pre_train_out = nn.Linear(char_hidden_dim, char_size)
 75 |         self.word_pre_train_out = nn.Linear(char_hidden_dim, in_doc_words)
 76 | 
 77 |         self.batch_size = 1
 78 |         self.word_seq_length = 1
 79 | 
 80 |     def set_batch_size(self, bsize):
 81 |         """
 82 |         set batch size
 83 |         """
 84 |         self.batch_size = bsize
 85 | 
 86 |     def set_batch_seq_size(self, sentence):
 87 |         """
 88 |         set batch size and sequence length
 89 |         """
 90 |         tmp = sentence.size()
 91 |         self.word_seq_length = tmp[0]
 92 |         self.batch_size = tmp[1]
 93 | 
 94 |     def rand_init_embedding(self):
 95 |         """
 96 |         random initialize char-level embedding
 97 |         """
 98 |         utils.init_embedding(self.char_embeds.weight)
 99 | 
100 |     def load_pretrained_word_embedding(self, pre_word_embeddings):
101 |         """
102 |         load pre-trained word embedding
103 | 
104 |         args:
105 |             pre_word_embeddings (self.word_size, self.word_dim) : pre-trained embedding
106 |         """
107 |         assert (pre_word_embeddings.size()[1] == self.word_dim)
108 |         self.word_embeds.weight = nn.Parameter(pre_word_embeddings)
109 | 
110 |     def rand_init(self, init_char_embedding=True, init_word_embedding=False):
111 |         """
112 |         random initialization
113 | 
114 |         args:
115 |             init_char_embedding: random initialize char embedding or not
116 |             init_word_embedding: random initialize word embedding or not
117 |         """
118 | 
119 |         if init_char_embedding:
120 |             utils.init_embedding(self.char_embeds.weight)
121 |         if init_word_embedding:
122 |             utils.init_embedding(self.word_embeds.weight)
123 |         if self.if_highway:
124 |             self.forw2char.rand_init()
125 |             self.back2char.rand_init()
126 |             self.forw2word.rand_init()
127 |             self.back2word.rand_init()
128 |             self.fb2char.rand_init()
129 |         utils.init_lstm(self.forw_char_lstm)
130 |         utils.init_lstm(self.back_char_lstm)
131 |         utils.init_lstm(self.word_lstm)
132 |         utils.init_linear(self.char_pre_train_out)
133 |         utils.init_linear(self.word_pre_train_out)
134 |         self.crf.rand_init()
135 | 
136 |     def word_pre_train_forward(self, sentence, position, hidden=None):
137 |         """
138 |         output of forward language model
139 | 
140 |         args:
141 |             sentence (char_seq_len, batch_size): char-level representation of sentence
142 |             position (word_seq_len, batch_size): position of blank space in char-level representation of sentence
143 |             hidden: initial hidden state
144 | 
145 |         return:
146 |             language model output (word_seq_len, in_doc_word), hidden
147 |         """
148 | 
149 |         embeds = self.char_embeds(sentence)
150 |         d_embeds = self.dropout(embeds)
151 |         lstm_out, hidden = self.forw_char_lstm(d_embeds)
152 | 
153 |         tmpsize = position.size()
154 |         position = position.unsqueeze(2).expand(tmpsize[0], tmpsize[1], self.char_hidden_dim)
155 |         select_lstm_out = torch.gather(lstm_out, 0, position)
156 |         d_lstm_out = self.dropout(select_lstm_out).view(-1, self.char_hidden_dim)
157 | 
158 |         if self.if_highway:
159 |             char_out = self.forw2word(d_lstm_out)
160 |             d_char_out = self.dropout(char_out)
161 |         else:
162 |             d_char_out = d_lstm_out
163 | 
164 |         pre_score = self.word_pre_train_out(d_char_out)
165 |         return pre_score, hidden
166 | 
167 |     def word_pre_train_backward(self, sentence, position, hidden=None):
168 |         """
169 |         output of backward language model
170 | 
171 |         args:
172 |             sentence (char_seq_len, batch_size): char-level representation of sentence (inverse order)
173 |             position (word_seq_len, batch_size): position of blank space in inversed char-level representation of sentence
174 |             hidden: initial hidden state
175 | 
176 |         return:
177 |             language model output (word_seq_len, in_doc_word), hidden
178 |         """
179 |         embeds = self.char_embeds(sentence)
180 |         d_embeds = self.dropout(embeds)
181 |         lstm_out, hidden = self.back_char_lstm(d_embeds)
182 | 
183 |         tmpsize = position.size()
184 |         position = position.unsqueeze(2).expand(tmpsize[0], tmpsize[1], self.char_hidden_dim)
185 |         select_lstm_out = torch.gather(lstm_out, 0, position)
186 |         d_lstm_out = self.dropout(select_lstm_out).view(-1, self.char_hidden_dim)
187 | 
188 |         if self.if_highway:
189 |             char_out = self.back2word(d_lstm_out)
190 |             d_char_out = self.dropout(char_out)
191 |         else:
192 |             d_char_out = d_lstm_out
193 | 
194 |         pre_score = self.word_pre_train_out(d_char_out)
195 |         return pre_score, hidden
196 | 
197 |     def forward(self, forw_sentence, forw_position, back_sentence, back_position, word_seq, hidden=None):
198 |         '''
199 |         args:
200 |             forw_sentence (char_seq_len, batch_size) : char-level representation of sentence
201 |             forw_position (word_seq_len, batch_size) : position of blank space in char-level representation of sentence
202 |             back_sentence (char_seq_len, batch_size) : char-level representation of sentence (inverse order)
203 |             back_position (word_seq_len, batch_size) : position of blank space in inversed char-level representation of sentence
204 |             word_seq (word_seq_len, batch_size) : word-level representation of sentence
205 |             hidden: initial hidden state
206 | 
207 |         return:
208 |             crf output (word_seq_len, batch_size, tag_size, tag_size), hidden
209 |         '''
210 | 
211 |         self.set_batch_seq_size(forw_position)
212 | 
213 |         #embedding layer
214 |         forw_emb = self.char_embeds(forw_sentence)
215 |         back_emb = self.char_embeds(back_sentence)
216 | 
217 |         #dropout
218 |         d_f_emb = self.dropout(forw_emb)
219 |         d_b_emb = self.dropout(back_emb)
220 | 
221 |         #forward the whole sequence
222 |         forw_lstm_out, _ = self.forw_char_lstm(d_f_emb)#seq_len_char * batch * char_hidden_dim
223 | 
224 |         back_lstm_out, _ = self.back_char_lstm(d_b_emb)#seq_len_char * batch * char_hidden_dim
225 | 
226 |         #select predict point
227 |         forw_position = forw_position.unsqueeze(2).expand(self.word_seq_length, self.batch_size, self.char_hidden_dim)
228 |         select_forw_lstm_out = torch.gather(forw_lstm_out, 0, forw_position)
229 | 
230 |         back_position = back_position.unsqueeze(2).expand(self.word_seq_length, self.batch_size, self.char_hidden_dim)
231 |         select_back_lstm_out = torch.gather(back_lstm_out, 0, back_position)
232 | 
233 |         fb_lstm_out = self.dropout(torch.cat((select_forw_lstm_out, select_back_lstm_out), dim=2))
234 |         if self.if_highway:
235 |             char_out = self.fb2char(fb_lstm_out)
236 |             d_char_out = self.dropout(char_out)
237 |         else:
238 |             d_char_out = fb_lstm_out
239 | 
240 |         #word
241 |         word_emb = self.word_embeds(word_seq)
242 |         d_word_emb = self.dropout(word_emb)
243 | 
244 |         #combine
245 |         word_input = torch.cat((d_word_emb, d_char_out), dim = 2)
246 | 
247 |         #word level lstm
248 |         lstm_out, _ = self.word_lstm(word_input)
249 |         d_lstm_out = self.dropout(lstm_out)
250 | 
251 |         #convert to crf
252 |         crf_out = self.crf(d_lstm_out)
253 |         crf_out = crf_out.view(self.word_seq_length, self.batch_size, self.tagset_size, self.tagset_size)
254 | 
255 |         return crf_out


--------------------------------------------------------------------------------
/model/lstm_crf.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. module:: lstm_crf
  3 |     :synopsis: lstm_crf
  4 |  
  5 | .. moduleauthor:: Liyuan Liu
  6 | """
  7 | 
  8 | import torch
  9 | import torch.autograd as autograd
 10 | import torch.nn as nn
 11 | import model.crf as crf
 12 | import model.utils as utils
 13 | 
 14 | 
 15 | class LSTM_CRF(nn.Module):
 16 |     """LSTM_CRF model
 17 | 
 18 |     args: 
 19 |         vocab_size: size of word dictionary
 20 |         tagset_size: size of label set
 21 |         embedding_dim: size of word embedding
 22 |         hidden_dim: size of word-level blstm hidden dim
 23 |         rnn_layers: number of word-level lstm layers
 24 |         dropout_ratio: dropout ratio
 25 |         large_CRF: use CRF_L or not, refer model.crf.CRF_L and model.crf.CRF_S for more details
 26 |     """
 27 |     
 28 |     def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, rnn_layers, dropout_ratio, large_CRF=True):
 29 |         super(LSTM_CRF, self).__init__()
 30 |         self.embedding_dim = embedding_dim
 31 |         self.hidden_dim = hidden_dim
 32 |         self.vocab_size = vocab_size
 33 | 
 34 |         self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
 35 |         self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
 36 |                             num_layers=rnn_layers, bidirectional=True, dropout=dropout_ratio)
 37 |         self.rnn_layers = rnn_layers
 38 |     
 39 |         self.dropout1 = nn.Dropout(p=dropout_ratio)
 40 |         self.dropout2 = nn.Dropout(p=dropout_ratio)
 41 | 
 42 |         self.tagset_size = tagset_size
 43 |         if large_CRF:
 44 |             self.crf = crf.CRF_L(hidden_dim, tagset_size)
 45 |         else:
 46 |             self.crf = crf.CRF_S(hidden_dim, tagset_size)
 47 | 
 48 |         self.batch_size = 1
 49 |         self.seq_length = 1
 50 | 
 51 |     def rand_init_hidden(self):
 52 |         """
 53 |         random initialize hidden variable
 54 |         """
 55 |         return autograd.Variable(
 56 |             torch.randn(2 * self.rnn_layers, self.batch_size, self.hidden_dim // 2)), autograd.Variable(
 57 |             torch.randn(2 * self.rnn_layers, self.batch_size, self.hidden_dim // 2))
 58 | 
 59 |     def set_batch_size(self, bsize):
 60 |         """
 61 |         set batch size
 62 |         """
 63 |         self.batch_size = bsize
 64 | 
 65 |     def set_batch_seq_size(self, sentence):
 66 |         """
 67 |         set batch size and sequence length
 68 |         """        
 69 |         tmp = sentence.size()
 70 |         self.seq_length = tmp[0]
 71 |         self.batch_size = tmp[1]
 72 | 
 73 |     def load_pretrained_embedding(self, pre_embeddings):
 74 |         """
 75 |         load pre-trained word embedding
 76 | 
 77 |         args:
 78 |             pre_word_embeddings (self.word_size, self.word_dim) : pre-trained embedding
 79 |         """
 80 |         assert (pre_embeddings.size()[1] == self.embedding_dim)
 81 |         self.word_embeds.weight = nn.Parameter(pre_embeddings)
 82 | 
 83 |     def rand_init_embedding(self):
 84 |         utils.init_embedding(self.word_embeds.weight)
 85 | 
 86 |     def rand_init(self, init_embedding=False):
 87 |         """
 88 |         random initialization
 89 | 
 90 |         args:
 91 |             init_embedding: random initialize embedding or not
 92 |         """
 93 |         if init_embedding:
 94 |             utils.init_embedding(self.word_embeds.weight)
 95 |         utils.init_lstm(self.lstm)
 96 |         self.crf.rand_init()
 97 | 
 98 |     def forward(self, sentence, hidden=None):
 99 |         '''
100 |         args:
101 |             sentence (word_seq_len, batch_size) : word-level representation of sentence
102 |             hidden: initial hidden state
103 | 
104 |         return:
105 |             crf output (word_seq_len, batch_size, tag_size, tag_size), hidden
106 |         '''
107 |         self.set_batch_seq_size(sentence)
108 | 
109 |         embeds = self.word_embeds(sentence)
110 |         d_embeds = self.dropout1(embeds)
111 | 
112 |         lstm_out, hidden = self.lstm(d_embeds, hidden)
113 |         lstm_out = lstm_out.view(-1, self.hidden_dim)
114 | 
115 |         d_lstm_out = self.dropout2(lstm_out)
116 | 
117 |         crf_out = self.crf(d_lstm_out)
118 |         crf_out = crf_out.view(self.seq_length, self.batch_size, self.tagset_size, self.tagset_size)
119 |         return crf_out, hidden


--------------------------------------------------------------------------------
/model/ner_dataset.py:
--------------------------------------------------------------------------------
 1 | """
 2 | .. module:: datasets
 3 |     :synopsis: datasets
 4 |  
 5 | .. moduleauthor:: Liyuan Liu
 6 | """
 7 | 
 8 | from torch.utils.data import Dataset
 9 | 
10 | 
11 | class CRFDataset(Dataset):
12 |     """Dataset Class for word-level model 
13 | 
14 |     args: 
15 |         data_tensor (ins_num, seq_length): words 
16 |         label_tensor (ins_num, seq_length): labels
17 |         mask_tensor (ins_num, seq_length): padding masks
18 |     """
19 |     def __init__(self, data_tensor, label_tensor, mask_tensor):
20 |         assert data_tensor.size(0) == label_tensor.size(0)
21 |         assert data_tensor.size(0) == mask_tensor.size(0)
22 |         self.data_tensor = data_tensor
23 |         self.label_tensor = label_tensor
24 |         self.mask_tensor = mask_tensor
25 | 
26 |     def __getitem__(self, index):
27 |         return self.data_tensor[index], self.label_tensor[index], self.mask_tensor[index]
28 | 
29 |     def __len__(self):
30 |         return self.data_tensor.size(0)
31 | 
32 | class CRFDataset_WC(Dataset):
33 |     """Dataset Class for char-aware model 
34 | 
35 |     args: 
36 |         forw_tensor (ins_num, seq_length): forward chars
37 |         forw_index (ins_num, seq_length): index of forward chars
38 |         back_tensor (ins_num, seq_length): backward chars
39 |         back_index (ins_num, seq_length): index of backward chars
40 |         word_tensor (ins_num, seq_length): words
41 |         label_tensor (ins_num, seq_length): labels:
42 |         mask_tensor (ins_num, seq_length): padding masks
43 |         len_tensor (ins_num, 2): length of chars (dim0) and words (dim1)
44 |     """
45 |     def __init__(self, forw_tensor, forw_index, back_tensor, back_index, word_tensor, label_tensor, mask_tensor, len_tensor):
46 |         assert forw_tensor.size(0) == label_tensor.size(0)
47 |         assert forw_tensor.size(0) == mask_tensor.size(0)
48 |         assert forw_tensor.size(0) == forw_index.size(0)
49 |         assert forw_tensor.size(0) == back_tensor.size(0)
50 |         assert forw_tensor.size(0) == back_index.size(0)
51 |         assert forw_tensor.size(0) == word_tensor.size(0)
52 |         assert forw_tensor.size(0) == len_tensor.size(0)
53 |         self.forw_tensor = forw_tensor
54 |         self.forw_index = forw_index
55 |         self.back_tensor = back_tensor
56 |         self.back_index = back_index
57 |         self.word_tensor = word_tensor
58 |         self.label_tensor = label_tensor
59 |         self.mask_tensor = mask_tensor
60 |         self.len_tensor = len_tensor
61 | 
62 |     def __getitem__(self, index):
63 |         return self.forw_tensor[index], self.forw_index[index], self.back_tensor[index], self.back_index[index], self.word_tensor[index], self.label_tensor[index], self.mask_tensor[index], self.len_tensor[index]
64 | 
65 |     def __len__(self):
66 |         return self.forw_tensor.size(0)
67 | 


--------------------------------------------------------------------------------
/model/predictor.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. module:: predictor
  3 |     :synopsis: prediction method (for un-annotated text)
  4 |  
  5 | .. moduleauthor:: Liyuan Liu
  6 | """
  7 | 
  8 | import torch
  9 | import torch.autograd as autograd
 10 | import numpy as np
 11 | import itertools
 12 | import sys
 13 | from tqdm import tqdm
 14 | 
 15 | from model.crf import CRFDecode_vb
 16 | from model.utils import *
 17 | 
 18 | class predict:
 19 |     """Base class for prediction, provide method to calculate f1 score and accuracy 
 20 | 
 21 |     args: 
 22 |         if_cuda: if use cuda to speed up 
 23 |         l_map: dictionary for labels 
 24 |         label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test
 25 |         batch_size: size of batch in decoding
 26 |     """
 27 | 
 28 |     def __init__(self, if_cuda, l_map, label_seq = True, batch_size = 50):
 29 |         self.if_cuda = if_cuda
 30 |         self.l_map = l_map
 31 |         self.r_l_map = revlut(l_map)
 32 |         self.batch_size = batch_size
 33 |         if label_seq:
 34 |             self.decode_str = self.decode_l
 35 |         else:
 36 |             self.decode_str = self.decode_s
 37 | 
 38 |     def decode_l(self, feature, label):
 39 |         """
 40 |         decode a sentence coupled with label
 41 | 
 42 |         args:
 43 |             feature (list): words list
 44 |             label (list): label list
 45 |         """
 46 |         return '\n'.join(map(lambda t: t[0] + ' '+ self.r_l_map[t[1].item()], zip(feature, label)))
 47 | 
 48 |     def decode_s(self, feature, label):
 49 |         """
 50 |         decode a sentence in the format of <>
 51 | 
 52 |         args:
 53 |             feature (list): words list
 54 |             label (list): label list
 55 |         """
 56 |         chunks = ""
 57 |         current = None
 58 | 
 59 |         for f, y in zip(feature, label):
 60 |             label = self.r_l_map[y.item()]
 61 | 
 62 |             if label.startswith('B-'):
 63 | 
 64 |                 if current is not None:
 65 |                     chunks += "</"+current+"> "
 66 |                 current = label[2:]
 67 |                 chunks += "<"+current+"> " + f + " "
 68 | 
 69 |             elif label.startswith('S-'):
 70 | 
 71 |                 if current is not None:
 72 |                     chunks += " </"+current+"> "
 73 |                 current = label[2:]
 74 |                 chunks += "<"+current+"> " + f + " </"+current+"> "
 75 |                 current = None
 76 | 
 77 |             elif label.startswith('I-'):
 78 | 
 79 |                 if current is not None:
 80 |                     base = label[2:]
 81 |                     if base == current:
 82 |                         chunks += f+" "
 83 |                     else:
 84 |                         chunks += "</"+current+"> <"+base+"> " + f + " "
 85 |                         current = base
 86 |                 else:
 87 |                     current = label[2:]
 88 |                     chunks += "<"+current+"> " + f + " "
 89 | 
 90 |             elif label.startswith('E-'):
 91 | 
 92 |                 if current is not None:
 93 |                     base = label[2:]
 94 |                     if base == current:
 95 |                         chunks += f + " </"+base+"> "
 96 |                         current = None
 97 |                     else:
 98 |                         chunks += "</"+current+"> <"+base+"> " + f + " </"+base+"> "
 99 |                         current = None
100 | 
101 |                 else:
102 |                     current = label[2:]
103 |                     chunks += "<"+current+"> " + f + " </"+current+"> "
104 |                     current = None
105 | 
106 |             else:
107 |                 if current is not None:
108 |                     chunks += "</"+current+"> "
109 |                 chunks += f+" "
110 |                 current = None
111 | 
112 |         if current is not None:
113 |             chunks += "</"+current+"> "
114 | 
115 |         return chunks
116 | 
117 |     def output_batch(self, ner_model, documents, fout):
118 |         """
119 |         decode the whole corpus in the specific format by calling apply_model to fit specific models
120 | 
121 |         args:
122 |             ner_model: sequence labeling model
123 |             feature (list): list of words list
124 |             fout: output file
125 |         """
126 |         ner_model.eval()
127 | 
128 |         d_len = len(documents)
129 |         for d_ind in tqdm( range(0, d_len), mininterval=1,
130 |                 desc=' - Process', leave=False, file=sys.stdout):
131 |             fout.write('-DOCSTART- -DOCSTART- -DOCSTART-\n\n')
132 |             features = documents[d_ind]
133 |             f_len = len(features)
134 |             for ind in range(0, f_len, self.batch_size):
135 |                 eind = min(f_len, ind + self.batch_size)
136 |                 labels = self.apply_model(ner_model, features[ind: eind])
137 |                 labels = torch.unbind(labels, 1)
138 | 
139 |                 for ind2 in range(ind, eind):
140 |                     f = features[ind2]
141 |                     l = labels[ind2 - ind][0: len(f) ]
142 |                     fout.write(self.decode_str(features[ind2], l) + '\n\n')
143 | 
144 |     def apply_model(self, ner_model, features):
145 |         """
146 |         template function for apply_model
147 | 
148 |         args:
149 |             ner_model: sequence labeling model
150 |             feature (list): list of words list
151 |         """
152 |         return None
153 | 
154 | class predict_w(predict):
155 |     """prediction class for word level model (LSTM-CRF)
156 | 
157 |     args: 
158 |         if_cuda: if use cuda to speed up 
159 |         f_map: dictionary for words
160 |         l_map: dictionary for labels
161 |         pad_word: word padding
162 |         pad_label: label padding
163 |         start_label: start label 
164 |         label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test
165 |         batch_size: size of batch in decoding
166 |         caseless: caseless or not
167 |     """
168 |    
169 |     def __init__(self, if_cuda, f_map, l_map, pad_word, pad_label, start_label, label_seq = True, batch_size = 50, caseless=True):
170 |         predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
171 |         self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
172 |         self.pad_word = pad_word
173 |         self.f_map = f_map
174 |         self.l_map = l_map
175 |         self.caseless = caseless
176 |         
177 |     def apply_model(self, ner_model, features):
178 |         """
179 |         apply_model function for LSTM-CRF
180 | 
181 |         args:
182 |             ner_model: sequence labeling model
183 |             feature (list): list of words list
184 |         """
185 |         if self.caseless:
186 |             features = list(map(lambda t: list(map(lambda x: x.lower(), t)), features))
187 |         features = encode_safe(features, self.f_map, self.f_map['<unk>'])
188 |         f_len = max(map(lambda t: len(t) + 1, features))
189 | 
190 |         masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (f_len - len(t) - 1), features)))
191 |         word_features = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (f_len - len(t)), features)))
192 | 
193 |         if self.if_cuda:
194 |             fea_v = autograd.Variable(word_features.transpose(0, 1)).cuda()
195 |             mask_v = masks.transpose(0, 1).cuda()
196 |         else:
197 |             fea_v = autograd.Variable(word_features.transpose(0, 1))
198 |             mask_v = masks.transpose(0, 1).contiguous()
199 | 
200 |         scores, _ = ner_model(fea_v)
201 |         decoded = self.decoder.decode(scores.data, mask_v)
202 | 
203 |         return decoded
204 | 
205 | class predict_wc(predict):
206 |     """prediction class for LM-LSTM-CRF
207 | 
208 |     args: 
209 |         if_cuda: if use cuda to speed up 
210 |         f_map: dictionary for words
211 |         c_map: dictionary for chars
212 |         l_map: dictionary for labels
213 |         pad_word: word padding
214 |         pad_char: word padding
215 |         pad_label: label padding
216 |         start_label: start label 
217 |         label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test
218 |         batch_size: size of batch in decoding
219 |         caseless: caseless or not
220 |     """
221 |    
222 |     def __init__(self, if_cuda, f_map, c_map, l_map, pad_word, pad_char, pad_label, start_label, label_seq = True, batch_size = 50, caseless=True):
223 |         predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
224 |         self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
225 |         self.pad_word = pad_word
226 |         self.pad_char = pad_char
227 |         self.f_map = f_map
228 |         self.c_map = c_map
229 |         self.l_map = l_map
230 |         self.caseless = caseless
231 |         
232 |     def apply_model(self, ner_model, features):
233 |         """
234 |         apply_model function for LM-LSTM-CRF
235 | 
236 |         args:
237 |             ner_model: sequence labeling model
238 |             feature (list): list of words list
239 |         """
240 |         char_features = encode2char_safe(features, self.c_map)
241 | 
242 |         if self.caseless:
243 |             word_features = encode_safe(list(map(lambda t: list(map(lambda x: x.lower(), t)), features)), self.f_map, self.f_map['<unk>'])
244 |         else:
245 |             word_features = encode_safe(features, self.f_map, self.f_map['<unk>'])
246 | 
247 |         fea_len = [list( map( lambda t: len(t) + 1, f) ) for f in char_features]
248 |         forw_features = concatChar(char_features, self.c_map)
249 | 
250 |         word_len = max(map(lambda t: len(t) + 1, word_features))
251 |         char_len = max(map(lambda t: len(t[0]) + word_len - len(t[1]), zip(forw_features, word_features)))
252 |         forw_t = list( map( lambda t: t + [self.pad_char] * ( char_len - len(t) ), forw_features ) )
253 |         back_t = torch.LongTensor( list( map( lambda t: t[::-1], forw_t ) ) )
254 |         forw_t = torch.LongTensor( forw_t )
255 |         forw_p = torch.LongTensor( list( map( lambda t: list(itertools.accumulate( t + [1] * (word_len - len(t) ) ) ), fea_len) ) )
256 |         back_p = torch.LongTensor( list( map( lambda t: [char_len - 1] + [ char_len - 1 - tup for tup in t[:-1] ], forw_p) ) )
257 | 
258 |         masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (word_len - len(t) - 1), word_features)))
259 |         word_t = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (word_len - len(t)), word_features)))
260 | 
261 |         if self.if_cuda:
262 |             f_f = autograd.Variable(forw_t.transpose(0, 1)).cuda()
263 |             f_p = autograd.Variable(forw_p.transpose(0, 1)).cuda()
264 |             b_f = autograd.Variable(back_t.transpose(0, 1)).cuda()
265 |             b_p = autograd.Variable(back_p.transpose(0, 1)).cuda()
266 |             w_f = autograd.Variable(word_t.transpose(0, 1)).cuda()
267 |             mask_v = masks.transpose(0, 1).cuda()
268 |         else:
269 |             f_f = autograd.Variable(forw_t.transpose(0, 1))
270 |             f_p = autograd.Variable(forw_p.transpose(0, 1))
271 |             b_f = autograd.Variable(back_t.transpose(0, 1))
272 |             b_p = autograd.Variable(back_p.transpose(0, 1))
273 |             w_f = autograd.Variable(word_t.transpose(0, 1))
274 |             mask_v = masks.transpose(0, 1)
275 | 
276 |         scores = ner_model(f_f, f_p, b_f, b_p, w_f)
277 |         decoded = self.decoder.decode(scores.data, mask_v)
278 | 
279 |         return decoded
280 | 


--------------------------------------------------------------------------------
/model/utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | .. module:: utils
  3 |     :synopsis: utility tools
  4 | 
  5 | .. moduleauthor:: Liyuan Liu, Frank Xu
  6 | """
  7 | 
  8 | import codecs
  9 | import csv
 10 | import itertools
 11 | from functools import reduce
 12 | 
 13 | import numpy as np
 14 | import shutil
 15 | import torch
 16 | import json
 17 | 
 18 | import torch.nn as nn
 19 | import torch.nn.init
 20 | 
 21 | from model.ner_dataset import *
 22 | 
 23 | zip = getattr(itertools, 'izip', zip)
 24 | 
 25 | 
 26 | def to_scalar(var):
 27 |     """change the first element of a tensor to scalar
 28 |     """
 29 |     return var.view(-1).data.tolist()[0]
 30 | 
 31 | 
 32 | def argmax(vec):
 33 |     """helper function to calculate argmax of input vector at dimension 1
 34 |     """
 35 |     _, idx = torch.max(vec, 1)
 36 |     return to_scalar(idx)
 37 | 
 38 | 
 39 | def log_sum_exp(vec, m_size):
 40 |     """
 41 |     calculate log of exp sum
 42 | 
 43 |     args:
 44 |         vec (batch_size, vanishing_dim, hidden_dim) : input tensor
 45 |         m_size : hidden_dim
 46 |     return:
 47 |         batch_size, hidden_dim
 48 |     """
 49 |     _, idx = torch.max(vec, 1)  # B * 1 * M
 50 |     max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size)  # B * M
 51 | 
 52 |     return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, m_size)  # B * M
 53 | 
 54 | 
 55 | def switch(vec1, vec2, mask):
 56 |     """
 57 |     switch function for pytorch
 58 | 
 59 |     args:
 60 |         vec1 (any size) : input tensor corresponding to 0
 61 |         vec2 (same to vec1) : input tensor corresponding to 1
 62 |         mask (same to vec1) : input tensor, each element equals to 0/1
 63 |     return:
 64 |         vec (*)
 65 |     """
 66 |     catvec = torch.cat([vec1.view(-1, 1), vec2.view(-1, 1)], dim=1)
 67 |     switched_vec = torch.gather(catvec, 1, mask.long().view(-1, 1))
 68 |     return switched_vec.view(-1)
 69 | 
 70 | 
 71 | def encode2char_safe(input_lines, char_dict):
 72 |     """
 73 |     get char representation of lines
 74 | 
 75 |     args:
 76 |         input_lines (list of strings) : input corpus
 77 |         char_dict (dictionary) : char-level dictionary
 78 |     return:
 79 |         forw_lines
 80 |     """
 81 |     unk = char_dict['<u>']
 82 |     forw_lines = [list(map(lambda m: list(map(lambda t: char_dict.get(t, unk), m)), line)) for line in input_lines]
 83 |     return forw_lines
 84 | 
 85 | 
 86 | def concatChar(input_lines, char_dict):
 87 |     """
 88 |     concat char into string
 89 | 
 90 |     args:
 91 |         input_lines (list of list of char) : input corpus
 92 |         char_dict (dictionary) : char-level dictionary
 93 |     return:
 94 |         forw_lines
 95 |     """
 96 |     features = [[char_dict[' ']] + list(reduce(lambda x, y: x + [char_dict[' ']] + y, sentence)) + [char_dict['\n']] for sentence in input_lines]
 97 |     return features
 98 | 
 99 | 
100 | def encode_safe(input_lines, word_dict, unk):
101 |     """
102 |     encode list of strings into word-level representation with unk
103 |     """
104 |     lines = list(map(lambda t: list(map(lambda m: word_dict.get(m, unk), t)), input_lines))
105 |     return lines
106 | 
107 | 
108 | def encode(input_lines, word_dict):
109 |     """
110 |     encode list of strings into word-level representation
111 |     """
112 |     lines = list(map(lambda t: list(map(lambda m: word_dict[m], t)), input_lines))
113 |     return lines
114 | 
115 | 
116 | def encode2Tensor(input_lines, word_dict, unk):
117 |     """
118 |     encode list of strings into word-level representation (tensor) with unk
119 |     """
120 |     lines = list(map(lambda t: torch.LongTensor(list(map(lambda m: word_dict.get(m, unk), t))), input_lines))
121 |     return lines
122 | 
123 | 
124 | def generate_corpus_char(lines, if_shrink_c_feature=False, c_thresholds=1, if_shrink_w_feature=False, w_thresholds=1):
125 |     """
126 |     generate label, feature, word dictionary, char dictionary and label dictionary
127 | 
128 |     args:
129 |         lines : corpus
130 |         if_shrink_c_feature: whether shrink char-dictionary
131 |         c_threshold: threshold for shrinking char-dictionary
132 |         if_shrink_w_feature: whether shrink word-dictionary
133 |         w_threshold: threshold for shrinking word-dictionary
134 | 
135 |     """
136 |     features, labels, feature_map, label_map = generate_corpus(lines, if_shrink_feature=if_shrink_w_feature, thresholds=w_thresholds)
137 |     char_count = dict()
138 |     for feature in features:
139 |         for word in feature:
140 |             for tup in word:
141 |                 if tup not in char_count:
142 |                     char_count[tup] = 0
143 |                 else:
144 |                     char_count[tup] += 1
145 |     if if_shrink_c_feature:
146 |         shrink_char_count = [k for (k, v) in iter(char_count.items()) if v >= c_thresholds]
147 |         char_map = {shrink_char_count[ind]: ind for ind in range(0, len(shrink_char_count))}
148 |     else:
149 |         char_map = {k: v for (v, k) in enumerate(char_count.keys())}
150 |     char_map['<u>'] = len(char_map)  # unk for char
151 |     char_map[' '] = len(char_map)  # concat for char
152 |     char_map['\n'] = len(char_map)  # eof for char
153 |     return features, labels, feature_map, label_map, char_map
154 | 
155 | def shrink_features(feature_map, features, thresholds):
156 |     """
157 |     filter un-common features by threshold
158 |     """
159 |     feature_count = {k: 0 for (k, v) in iter(feature_map.items())}
160 |     for feature_list in features:
161 |         for feature in feature_list:
162 |             feature_count[feature] += 1
163 |     shrinked_feature_count = [k for (k, v) in iter(feature_count.items()) if v >= thresholds]
164 |     feature_map = {shrinked_feature_count[ind]: (ind + 1) for ind in range(0, len(shrinked_feature_count))}
165 | 
166 |     #inserting unk to be 0 encoded
167 |     feature_map['<unk>'] = 0
168 |     #inserting eof
169 |     feature_map['<eof>'] = len(feature_map)
170 |     return feature_map
171 | 
172 | def generate_corpus(lines, if_shrink_feature=False, thresholds=1):
173 |     """
174 |     generate label, feature, word dictionary and label dictionary
175 | 
176 |     args:
177 |         lines : corpus
178 |         if_shrink_feature: whether shrink word-dictionary
179 |         threshold: threshold for shrinking word-dictionary
180 | 
181 |     """
182 |     features = list()
183 |     labels = list()
184 |     tmp_fl = list()
185 |     tmp_ll = list()
186 |     feature_map = dict()
187 |     label_map = dict()
188 |     for line in lines:
189 |         if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')):
190 |             line = line.rstrip('\n').split()
191 |             tmp_fl.append(line[0])
192 |             if line[0] not in feature_map:
193 |                 feature_map[line[0]] = len(feature_map) + 1 #0 is for unk
194 |             tmp_ll.append(line[-1])
195 |             if line[-1] not in label_map:
196 |                 label_map[line[-1]] = len(label_map)
197 |         elif len(tmp_fl) > 0:
198 |             features.append(tmp_fl)
199 |             labels.append(tmp_ll)
200 |             tmp_fl = list()
201 |             tmp_ll = list()
202 |     if len(tmp_fl) > 0:
203 |         features.append(tmp_fl)
204 |         labels.append(tmp_ll)
205 |     label_map['<start>'] = len(label_map)
206 |     label_map['<pad>'] = len(label_map)
207 |     if if_shrink_feature:
208 |         feature_map = shrink_features(feature_map, features, thresholds)
209 |     else:
210 |         #inserting unk to be 0 encoded
211 |         feature_map['<unk>'] = 0
212 |         #inserting eof
213 |         feature_map['<eof>'] = len(feature_map)
214 | 
215 |     return features, labels, feature_map, label_map
216 | 
217 | 
218 | def read_corpus(lines):
219 |     """
220 |     convert corpus into features and labels
221 |     """
222 |     features = list()
223 |     labels = list()
224 |     tmp_fl = list()
225 |     tmp_ll = list()
226 |     for line in lines:
227 |         if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')):
228 |             line = line.rstrip('\n').split()
229 |             tmp_fl.append(line[0])
230 |             tmp_ll.append(line[-1])
231 |         elif len(tmp_fl) > 0:
232 |             features.append(tmp_fl)
233 |             labels.append(tmp_ll)
234 |             tmp_fl = list()
235 |             tmp_ll = list()
236 |     if len(tmp_fl) > 0:
237 |         features.append(tmp_fl)
238 |         labels.append(tmp_ll)
239 | 
240 |     return features, labels
241 | 
242 | def read_features(lines, multi_docs = True):
243 |     """
244 |     convert un-annotated corpus into features
245 |     """
246 |     if multi_docs:
247 |         documents = list()
248 |         features = list()
249 |         tmp_fl = list()
250 |         for line in lines:
251 |             if_doc_end = (len(line) > 10 and line[0:10] == '-DOCSTART-')
252 |             if not (line.isspace() or if_doc_end):
253 |                 line = line.split()[0]
254 |                 tmp_fl.append(line)
255 |             else:
256 |                 if len(tmp_fl) > 0:
257 |                     features.append(tmp_fl)
258 |                     tmp_fl = list()
259 |                 if if_doc_end and len(features) > 0:
260 |                     documents.append(features)
261 |                     features = list()
262 |         if len(tmp_fl) > 0:
263 |             features.append(tmp_fl)
264 |         if len(features) >0:
265 |             documents.append(features)
266 |         return documents
267 |     else:
268 |         features = list()
269 |         tmp_fl = list()
270 |         for line in lines:
271 |             if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')):
272 |                 line = line.split()[0]
273 |                 tmp_fl.append(line)
274 |             elif len(tmp_fl) > 0:
275 |                 features.append(tmp_fl)
276 |                 tmp_fl = list()
277 |         if len(tmp_fl) > 0:
278 |             features.append(tmp_fl)
279 | 
280 |         return features
281 | 
282 | def shrink_embedding(feature_map, word_dict, word_embedding, caseless):
283 |     """
284 |     shrink embedding dictionary to in-doc words only
285 |     """
286 |     if caseless:
287 |         feature_map = set([k.lower() for k in feature_map.keys()])
288 |     new_word_list = [k for k in word_dict.keys() if (k in feature_map)]
289 |     new_word_dict = {k:v for (v, k) in enumerate(new_word_list)}
290 |     new_word_list_ind = torch.LongTensor([word_dict[k] for k in new_word_list])
291 |     new_embedding = word_embedding[new_word_list_ind]
292 |     return new_word_dict, new_embedding
293 | 
294 | def encode_corpus(lines, f_map, l_map, if_lower = False):
295 |     """
296 |     encode corpus into features and labels
297 |     """
298 |     tmp_fl = []
299 |     tmp_ll = []
300 |     features = []
301 |     labels = []
302 |     for line in lines:
303 |         if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')):
304 |             line = line.rstrip('\n').split()
305 |             tmp_fl.append(line[0])
306 |             tmp_ll.append(line[-1])
307 |         elif len(tmp_fl) > 0:
308 |             features.append(tmp_fl)
309 |             labels.append(tmp_ll)
310 |             tmp_fl = list()
311 |             tmp_ll = list()
312 |     if len(tmp_fl) > 0:
313 |         features.append(tmp_fl)
314 |         labels.append(tmp_ll)
315 |     if if_lower:
316 |         features = list(map(lambda t: list(map(lambda x: x.lower(), t)), features))
317 |     feature_e = encode_safe(features, f_map, f_map['<unk>'])
318 |     label_e = encode(labels, l_map)
319 |     return feature_e, label_e
320 | 
321 | 
322 | def encode_corpus_c(lines, f_map, l_map, c_map):
323 |     """
324 |     encode corpus into features (both word-level and char-level) and labels
325 |     """
326 |     tmp_fl = []
327 |     tmp_ll = []
328 |     features = []
329 |     labels = []
330 |     for line in lines:
331 |         if not (line.isspace() or (len(line) > 10 and line[0:10] == '-DOCSTART-')):
332 |             line = line.rstrip('\n').split()
333 |             tmp_fl.append(line[0])
334 |             tmp_ll.append(line[-1])
335 |         elif len(tmp_fl) > 0:
336 |             features.append(tmp_fl)
337 |             labels.append(tmp_ll)
338 |             tmp_fl = list()
339 |             tmp_ll = list()
340 |     if len(tmp_fl) > 0:
341 |         features.append(tmp_fl)
342 |         labels.append(tmp_ll)
343 | 
344 |     feature_c = encode2char_safe(features, c_map)
345 |     feature_e = encode_safe(features, f_map, f_map['<unk>'])
346 |     label_e = encode(labels, l_map)
347 |     return feature_c, feature_e, label_e
348 | 
349 | def load_embedding(emb_file, delimiter, feature_map, caseless, unk, shrink_to_train=False):
350 |     """
351 |     load embedding
352 |     """
353 |     if caseless:
354 |         feature_set = set([key.lower() for key in feature_map])
355 |     else:
356 |         feature_set = set([key for key in feature_map])
357 | 
358 |     word_dict = dict()
359 |     embedding_array = list()
360 |     for line in open(emb_file, 'r'):
361 |         line = line.split(delimiter)
362 |         vector = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1:])))
363 |         if shrink_to_train and line[0] not in feature_set:
364 |             continue
365 |         if line[0] == unk:
366 |             word_dict['<unk>'] = len(word_dict)
367 |         else:
368 |             word_dict[line[0]] = len(word_dict)
369 |         embedding_array.append(vector)
370 |     embedding_tensor_1 = torch.FloatTensor(np.asarray(embedding_array))
371 |     emb_len = embedding_tensor_1.size(1)
372 | 
373 |     rand_embedding_count = 0
374 |     for key in feature_map:
375 |         if caseless:
376 |             key = key.lower()
377 |         if key not in word_dict:
378 |             word_dict[key] = len(word_dict)
379 |             rand_embedding_count += 1
380 | 
381 |     rand_embedding_tensor = torch.FloatTensor(rand_embedding_count, emb_len)
382 |     init_embedding(rand_embedding_tensor)
383 | 
384 |     embedding_tensor = torch.cat((embedding_tensor_1, rand_embedding_tensor), 0)
385 |     return word_dict, embedding_tensor
386 | 
387 | def load_embedding_wlm(emb_file, delimiter, feature_map, full_feature_set, caseless, unk, emb_len, shrink_to_train=False, shrink_to_corpus=False):
388 |     """
389 |     load embedding, indoc words would be listed before outdoc words
390 | 
391 |     args:
392 |         emb_file: path to embedding file
393 |         delimiter: delimiter of lines
394 |         feature_map: word dictionary
395 |         full_feature_set: all words in the corpus
396 |         caseless: convert into casesless style
397 |         unk: string for unknown token
398 |         emb_len: dimension of embedding vectors
399 |         shrink_to_train: whether to shrink out-of-training set or not
400 |         shrink_to_corpus: whether to shrink out-of-corpus or not
401 |     """
402 |     if caseless:
403 |         feature_set = set([key.lower() for key in feature_map])
404 |         full_feature_set = set([key.lower() for key in full_feature_set])
405 |     else:
406 |         feature_set = set([key for key in feature_map])
407 |         full_feature_set = set([key for key in full_feature_set])
408 | 
409 |     #ensure <unk> is 0
410 |     word_dict = {v:(k+1) for (k,v) in enumerate(feature_set - set(['<unk>']))}
411 |     word_dict['<unk>'] = 0
412 | 
413 |     in_doc_freq_num = len(word_dict)
414 |     rand_embedding_tensor = torch.FloatTensor(in_doc_freq_num, emb_len)
415 |     init_embedding(rand_embedding_tensor)
416 | 
417 |     indoc_embedding_array = list()
418 |     indoc_word_array = list()
419 |     outdoc_embedding_array = list()
420 |     outdoc_word_array = list()
421 | 
422 |     for line in open(emb_file, 'r'):
423 |         line = line.split(delimiter)
424 |         vector = list(map(lambda t: float(t), filter(lambda n: n and not n.isspace(), line[1:])))
425 | 
426 |         if shrink_to_train and line[0] not in feature_set:
427 |             continue
428 | 
429 |         if line[0] == unk:
430 |             rand_embedding_tensor[0] = torch.FloatTensor(vector) #unk is 0
431 |         elif line[0] in word_dict:
432 |             rand_embedding_tensor[word_dict[line[0]]] = torch.FloatTensor(vector)
433 |         elif line[0] in full_feature_set:
434 |             indoc_embedding_array.append(vector)
435 |             indoc_word_array.append(line[0])
436 |         elif not shrink_to_corpus:
437 |             outdoc_word_array.append(line[0])
438 |             outdoc_embedding_array.append(vector)
439 | 
440 |     embedding_tensor_0 = torch.FloatTensor(np.asarray(indoc_embedding_array))
441 | 
442 |     if not shrink_to_corpus:
443 |         embedding_tensor_1 = torch.FloatTensor(np.asarray(outdoc_embedding_array))
444 |         word_emb_len = embedding_tensor_0.size(1)
445 |         assert(word_emb_len == emb_len)
446 | 
447 |     if shrink_to_corpus:
448 |         embedding_tensor = torch.cat([rand_embedding_tensor, embedding_tensor_0], 0)
449 |     else:
450 |         embedding_tensor = torch.cat([rand_embedding_tensor, embedding_tensor_0, embedding_tensor_1], 0)
451 | 
452 |     for word in indoc_word_array:
453 |         word_dict[word] = len(word_dict)
454 |     in_doc_num = len(word_dict)
455 |     if  not shrink_to_corpus:
456 |         for word in outdoc_word_array:
457 |             word_dict[word] = len(word_dict)
458 | 
459 |     return word_dict, embedding_tensor, in_doc_num
460 | 
461 | def calc_threshold_mean(features):
462 |     """
463 |     calculate the threshold for bucket by mean
464 |     """
465 |     lines_len = list(map(lambda t: len(t) + 1, features))
466 |     average = int(sum(lines_len) / len(lines_len))
467 |     lower_line = list(filter(lambda t: t < average, lines_len))
468 |     upper_line = list(filter(lambda t: t >= average, lines_len))
469 |     lower_average = int(sum(lower_line) / len(lower_line))
470 |     upper_average = int(sum(upper_line) / len(upper_line))
471 |     max_len = max(lines_len)
472 |     return [lower_average, average, upper_average, max_len]
473 | 
474 | 
475 | def construct_bucket_mean_gd(input_features, input_label, word_dict, label_dict):
476 |     """
477 |     Construct bucket by mean for greedy decode, word-level only
478 |     """
479 |     # encode and padding
480 |     features = encode_safe(input_features, word_dict, word_dict['<unk>'])
481 |     labels = encode(input_label, label_dict)
482 |     labels = list(map(lambda t: [label_dict['<start>']] + list(t), labels))
483 | 
484 |     thresholds = calc_threshold_mean(features)
485 | 
486 |     return construct_bucket_gd(features, labels, thresholds, word_dict['<eof>'], label_dict['<pad>'])
487 | 
488 | 
489 | def construct_bucket_mean_vb(input_features, input_label, word_dict, label_dict, caseless):
490 |     """
491 |     Construct bucket by mean for viterbi decode, word-level only
492 |     """
493 |     # encode and padding
494 |     if caseless:
495 |         input_features = list(map(lambda t: list(map(lambda x: x.lower(), t)), input_features))
496 | 
497 |     features = encode_safe(input_features, word_dict, word_dict['<unk>'])
498 |     labels = encode(input_label, label_dict)
499 |     labels = list(map(lambda t: [label_dict['<start>']] + list(t), labels))
500 | 
501 |     thresholds = calc_threshold_mean(features)
502 | 
503 |     return construct_bucket_vb(features, labels, thresholds, word_dict['<eof>'], label_dict['<pad>'], len(label_dict))
504 | 
505 | def construct_bucket_mean_vb_wc(word_features, input_label, label_dict, char_dict, word_dict, caseless):
506 |     """
507 |     Construct bucket by mean for viterbi decode, word-level and char-level
508 |     """
509 |     # encode and padding
510 |     char_features = encode2char_safe(word_features, char_dict)
511 |     fea_len = [list(map(lambda t: len(t) + 1, f)) for f in char_features]
512 |     forw_features = concatChar(char_features, char_dict)
513 | 
514 |     labels = encode(input_label, label_dict)
515 |     labels = list(map(lambda t: [label_dict['<start>']] + list(t), labels))
516 | 
517 |     thresholds = calc_threshold_mean(fea_len)
518 | 
519 |     if caseless:
520 |         word_features = list(map(lambda t: list(map(lambda x: x.lower(), t)), word_features))
521 |     word_features = encode_safe(word_features, word_dict, word_dict['<unk>'])
522 | 
523 |     return construct_bucket_vb_wc(word_features, forw_features, fea_len, labels, thresholds, word_dict['<eof>'], char_dict['\n'], label_dict['<pad>'], len(label_dict))
524 | 
525 | def construct_bucket_vb_wc(word_features, forw_features, fea_len, input_labels, thresholds, pad_word_feature, pad_char_feature, pad_label, label_size):
526 |     """
527 |     Construct bucket by thresholds for viterbi decode, word-level and char-level
528 |     """
529 |     # construct corpus for language model pre-training
530 |     forw_corpus = [pad_char_feature]
531 |     for forw_feature in forw_features:
532 |         forw_corpus.extend(forw_feature + [pad_char_feature])
533 |     back_corpus = forw_corpus[::-1]
534 |     # two way construct, first build the bucket, then calculate padding length, then do the padding
535 |     buckets = [[[], [], [], [], [], [], [], []] for ind in range(len(thresholds))]
536 |     # forw, forw_ind, back, back_in, label, mask
537 |     buckets_len = [0 for ind in range(len(thresholds))]
538 | 
539 |     # thresholds is the padded length for fea
540 |     # buckets_len is the padded length for char
541 |     for f_f, f_l in zip(forw_features, fea_len):
542 |         cur_len_1 = len(f_l) + 1
543 |         idx = 0
544 |         while thresholds[idx] < cur_len_1:
545 |             idx += 1
546 |         tmp_concat_len = len(f_f) + thresholds[idx] - len(f_l)
547 |         if buckets_len[idx] < tmp_concat_len:
548 |             buckets_len[idx] = tmp_concat_len
549 | 
550 |     # calc padding
551 |     for f_f, f_l, w_f, i_l in zip(forw_features, fea_len, word_features, input_labels):
552 |         cur_len = len(f_l)
553 |         idx = 0
554 |         cur_len_1 = cur_len + 1
555 |         while thresholds[idx] < cur_len_1:
556 |             idx += 1
557 | 
558 |         padded_feature = f_f + [pad_char_feature] * (buckets_len[idx] - len(f_f))  # pad feature with <'\n'>, at least one
559 | 
560 |         padded_feature_len = f_l + [1] * (thresholds[idx] - len(f_l)) # pad feature length with <'\n'>, at least one
561 |         padded_feature_len_cum = list(itertools.accumulate(padded_feature_len)) # start from 0, but the first is ' ', so the position need not to be -1
562 |         buckets[idx][0].append(padded_feature) # char
563 |         buckets[idx][1].append(padded_feature_len_cum)
564 |         buckets[idx][2].append(padded_feature[::-1])
565 |         buckets[idx][3].append([buckets_len[idx] - 1] + [buckets_len[idx] - 1 - tup for tup in padded_feature_len_cum[:-1]])
566 |         buckets[idx][4].append(w_f + [pad_word_feature] * (thresholds[idx] - cur_len)) #word
567 |         buckets[idx][5].append([i_l[ind] * label_size + i_l[ind + 1] for ind in range(0, cur_len)] + [i_l[cur_len] * label_size + pad_label] + [pad_label * label_size + pad_label] * (thresholds[idx] - cur_len_1))  # has additional start, label
568 |         buckets[idx][6].append([1] * cur_len_1 + [0] * (thresholds[idx] - cur_len_1))  # has additional start, mask
569 |         buckets[idx][7].append([len(f_f) + thresholds[idx] - len(f_l), cur_len_1])
570 |     bucket_dataset = [CRFDataset_WC(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]),
571 |                                     torch.LongTensor(bucket[2]), torch.LongTensor(bucket[3]),
572 |                                     torch.LongTensor(bucket[4]), torch.LongTensor(bucket[5]),
573 |                                     torch.ByteTensor(bucket[6]), torch.LongTensor(bucket[7])) for bucket in buckets]
574 |     return bucket_dataset, forw_corpus, back_corpus
575 | 
576 | 
577 | def construct_bucket_vb(input_features, input_labels, thresholds, pad_feature, pad_label, label_size):
578 |     """
579 |     Construct bucket by thresholds for viterbi decode, word-level only
580 |     """
581 |     buckets = [[[], [], []] for _ in range(len(thresholds))]
582 |     for feature, label in zip(input_features, input_labels):
583 |         cur_len = len(feature)
584 |         idx = 0
585 |         cur_len_1 = cur_len + 1
586 |         while thresholds[idx] < cur_len_1:
587 |             idx += 1
588 |         buckets[idx][0].append(feature + [pad_feature] * (thresholds[idx] - cur_len))
589 |         buckets[idx][1].append([label[ind] * label_size + label[ind + 1] for ind in range(0, cur_len)] + [
590 |             label[cur_len] * label_size + pad_label] + [pad_label * label_size + pad_label] * (
591 |                                        thresholds[idx] - cur_len_1))
592 |         buckets[idx][2].append([1] * cur_len_1 + [0] * (thresholds[idx] - cur_len_1))
593 |     bucket_dataset = [CRFDataset(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]), torch.ByteTensor(bucket[2]))
594 |                       for bucket in buckets]
595 |     return bucket_dataset
596 | 
597 | 
598 | def construct_bucket_gd(input_features, input_labels, thresholds, pad_feature, pad_label):
599 |     """
600 |     Construct bucket by thresholds for greedy decode, word-level only
601 |     """
602 |     buckets = [[[], [], []] for ind in range(len(thresholds))]
603 |     for feature, label in zip(input_features, input_labels):
604 |         cur_len = len(feature)
605 |         cur_len_1 = cur_len + 1
606 |         idx = 0
607 |         while thresholds[idx] < cur_len_1:
608 |             idx += 1
609 |         buckets[idx][0].append(feature + [pad_feature] * (thresholds[idx] - cur_len))
610 |         buckets[idx][1].append(label[1:] + [pad_label] * (thresholds[idx] - cur_len))
611 |         buckets[idx][2].append(label + [pad_label] * (thresholds[idx] - cur_len_1))
612 |     bucket_dataset = [CRFDataset(torch.LongTensor(bucket[0]), torch.LongTensor(bucket[1]), torch.LongTensor(bucket[2])) for bucket in buckets]
613 |     return bucket_dataset
614 | 
615 | 
616 | def find_length_from_feats(feats, feat_to_ix):
617 |     """
618 |     find length of unpadded features based on feature
619 |     """
620 |     end_position = len(feats) - 1
621 |     for position, feat in enumerate(feats):
622 |         if feat.data[0] == feat_to_ix['<eof>']:
623 |             end_position = position
624 |             break
625 |     return end_position + 1
626 | 
627 | 
628 | def find_length_from_labels(labels, label_to_ix):
629 |     """
630 |     find length of unpadded features based on labels
631 |     """
632 |     end_position = len(labels) - 1
633 |     for position, label in enumerate(labels):
634 |         if label == label_to_ix['<pad>']:
635 |             end_position = position
636 |             break
637 |     return end_position
638 | 
639 | 
640 | def revlut(lut):
641 |     return {v: k for k, v in lut.items()}
642 | 
643 | 
644 | # Turn a sequence of IOB chunks into single tokens
645 | def iob_to_spans(sequence, lut, strict_iob2=False):
646 |     """
647 |     convert to iob to span
648 |     """
649 |     iobtype = 2 if strict_iob2 else 1
650 |     chunks = []
651 |     current = None
652 | 
653 |     for i, y in enumerate(sequence):
654 |         label = lut[y]
655 | 
656 |         if label.startswith('B-'):
657 |             if current is not None:
658 |                 chunks.append('@'.join(current))
659 |             current = [label.replace('B-', ''), '%d' % i]
660 | 
661 |         elif label.startswith('I-'):
662 | 
663 |             if current is not None:
664 |                 base = label.replace('I-', '')
665 |                 if base == current[0]:
666 |                     current.append('%d' % i)
667 |                 else:
668 |                     chunks.append('@'.join(current))
669 |                     if iobtype == 2:
670 |                         print('Warning, type=IOB2, unexpected format ([%s] follows other tag type [%s] @ %d)' % (
671 |                             label, current[0], i))
672 | 
673 |                     current = [base, '%d' % i]
674 | 
675 |             else:
676 |                 current = [label.replace('I-', ''), '%d' % i]
677 |                 if iobtype == 2:
678 |                     print('Warning, unexpected format (I before B @ %d) %s' % (i, label))
679 |         else:
680 |             if current is not None:
681 |                 chunks.append('@'.join(current))
682 |             current = None
683 | 
684 |     if current is not None:
685 |         chunks.append('@'.join(current))
686 | 
687 |     return set(chunks)
688 | 
689 | # Turn a sequence of IOBES chunks into single tokens
690 | def iobes_to_spans(sequence, lut, strict_iob2=False):
691 |     """
692 |     convert to iobes to span
693 |     """
694 |     iobtype = 2 if strict_iob2 else 1
695 |     chunks = []
696 |     current = None
697 | 
698 |     for i, y in enumerate(sequence):
699 |         label = lut[y]
700 | 
701 |         if label.startswith('B-'):
702 | 
703 |             if current is not None:
704 |                 chunks.append('@'.join(current))
705 |             current = [label.replace('B-', ''), '%d' % i]
706 | 
707 |         elif label.startswith('S-'):
708 | 
709 |             if current is not None:
710 |                 chunks.append('@'.join(current))
711 |                 current = None
712 |             base = label.replace('S-', '')
713 |             chunks.append('@'.join([base, '%d' % i]))
714 | 
715 |         elif label.startswith('I-'):
716 | 
717 |             if current is not None:
718 |                 base = label.replace('I-', '')
719 |                 if base == current[0]:
720 |                     current.append('%d' % i)
721 |                 else:
722 |                     chunks.append('@'.join(current))
723 |                     if iobtype == 2:
724 |                         print('Warning')
725 |                     current = [base, '%d' % i]
726 | 
727 |             else:
728 |                 current = [label.replace('I-', ''), '%d' % i]
729 |                 if iobtype == 2:
730 |                     print('Warning')
731 | 
732 |         elif label.startswith('E-'):
733 | 
734 |             if current is not None:
735 |                 base = label.replace('E-', '')
736 |                 if base == current[0]:
737 |                     current.append('%d' % i)
738 |                     chunks.append('@'.join(current))
739 |                     current = None
740 |                 else:
741 |                     chunks.append('@'.join(current))
742 |                     if iobtype == 2:
743 |                         print('Warning')
744 |                     current = [base, '%d' % i]
745 |                     chunks.append('@'.join(current))
746 |                     current = None
747 | 
748 |             else:
749 |                 current = [label.replace('E-', ''), '%d' % i]
750 |                 if iobtype == 2:
751 |                     print('Warning')
752 |                 chunks.append('@'.join(current))
753 |                 current = None
754 |         else:
755 |             if current is not None:
756 |                 chunks.append('@'.join(current))
757 |             current = None
758 | 
759 |     if current is not None:
760 |         chunks.append('@'.join(current))
761 | 
762 |     return set(chunks)
763 | 
764 | 
765 | def fill_y(nc, yidx):
766 |     """
767 |     fill y to dense matrix
768 |     """
769 |     batchsz = yidx.shape[0]
770 |     siglen = yidx.shape[1]
771 |     dense = np.zeros((batchsz, siglen, nc), dtype=np.int)
772 |     for i in range(batchsz):
773 |         for j in range(siglen):
774 |             idx = int(yidx[i, j])
775 |             if idx > 0:
776 |                 dense[i, j, idx] = 1
777 | 
778 |     return dense
779 | 
780 | def save_checkpoint(state, track_list, filename):
781 |     """
782 |     save checkpoint
783 |     """
784 |     with open(filename+'.json', 'w') as f:
785 |         json.dump(track_list, f)
786 |     torch.save(state, filename+'.model')
787 | 
788 | def adjust_learning_rate(optimizer, lr):
789 |     """
790 |     shrink learning rate for pytorch
791 |     """
792 |     for param_group in optimizer.param_groups:
793 |         param_group['lr'] = lr
794 | 
795 | def init_embedding(input_embedding):
796 |     """
797 |     Initialize embedding
798 |     """
799 |     bias = np.sqrt(3.0 / input_embedding.size(1))
800 |     nn.init.uniform_(input_embedding, -bias, bias)
801 | 
802 | def init_linear(input_linear):
803 |     """
804 |     Initialize linear transformation
805 |     """
806 |     bias = np.sqrt(6.0 / (input_linear.weight.size(0) + input_linear.weight.size(1)))
807 |     nn.init.uniform_(input_linear.weight, -bias, bias)
808 |     if input_linear.bias is not None:
809 |         input_linear.bias.data.zero_()
810 | 
811 | def init_lstm(input_lstm):
812 |     """
813 |     Initialize lstm
814 |     """
815 |     for ind in range(0, input_lstm.num_layers):
816 |         weight = eval('input_lstm.weight_ih_l'+str(ind))
817 |         bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1)))
818 |         nn.init.uniform_(weight, -bias, bias)
819 |         weight = eval('input_lstm.weight_hh_l'+str(ind))
820 |         bias = np.sqrt(6.0 / (weight.size(0)/4 + weight.size(1)))
821 |         nn.init.uniform_(weight, -bias, bias)
822 | 
823 |     if input_lstm.bias:
824 |         for ind in range(0, input_lstm.num_layers):
825 |             weight = eval('input_lstm.bias_ih_l'+str(ind))
826 |             weight.data.zero_()
827 |             weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
828 |             weight = eval('input_lstm.bias_hh_l'+str(ind))
829 |             weight.data.zero_()
830 |             weight.data[input_lstm.hidden_size: 2 * input_lstm.hidden_size] = 1
831 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.22.0
2 | tqdm
3 | http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp35-cp35m-linux_x86_64.whl


--------------------------------------------------------------------------------
/seq_w.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import datetime
 3 | import time
 4 | import torch
 5 | import torch.autograd as autograd
 6 | import torch.nn as nn
 7 | import torch.optim as optim
 8 | import codecs
 9 | from model.crf import *
10 | from model.lstm_crf import *
11 | import model.utils as utils
12 | from model.predictor import predict_w
13 | 
14 | import argparse
15 | import json
16 | import os
17 | import sys
18 | from tqdm import tqdm
19 | import itertools
20 | import functools
21 | 
22 | if __name__ == "__main__":
23 |     parser = argparse.ArgumentParser(description='Evaluating LM-BLSTM-CRF')
24 |     parser.add_argument('--load_arg', default='./checkpoint/ner/ner_4_cwlm_lstm_crf.json', help='path to arg json')
25 |     parser.add_argument('--load_check_point', default='./checkpoint/ner/ner_4_cwlm_lstm_crf.model', help='path to model checkpoint file')
26 |     parser.add_argument('--gpu',type=int, default=0, help='gpu id')
27 |     parser.add_argument('--decode_type', choices=['label', 'string'], default='string', help='type of decode function, set `label` to couple label with text, or set `string` to insert label into test')
28 |     parser.add_argument('--batch_size', type=int, default=50, help='size of batch')
29 |     parser.add_argument('--input_file', default='data/ner2003/test.txt', help='path to input un-annotated corpus')
30 |     parser.add_argument('--output_file', default='output.txt', help='path to output file')
31 |     args = parser.parse_args()
32 | 
33 |     print('loading dictionary')
34 |     with open(args.load_arg, 'r') as f:
35 |         jd = json.load(f)
36 |     jd = jd['args']
37 | 
38 |     checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage)
39 |     f_map = checkpoint_file['f_map']
40 |     l_map = checkpoint_file['l_map']
41 |     if args.gpu >= 0:
42 |         torch.cuda.set_device(args.gpu)
43 | 
44 |     # loading corpus
45 |     print('loading corpus')
46 |     with codecs.open(args.input_file, 'r', 'utf-8') as f:
47 |         lines = f.readlines()
48 | 
49 |     # converting format
50 |     features = utils.read_features(lines)
51 | 
52 |     # build model
53 |     print('loading model')
54 |     ner_model = LSTM_CRF(len(f_map), len(l_map), jd['embedding_dim'], jd['hidden'], jd['layers'], jd['drop_out'], large_CRF=jd['small_crf'])
55 | 
56 |     ner_model.load_state_dict(checkpoint_file['state_dict'])
57 | 
58 |     if args.gpu >= 0:
59 |         if_cuda = True
60 |         torch.cuda.set_device(args.gpu)
61 |         ner_model.cuda()
62 |     else:
63 |         if_cuda = False
64 | 
65 |     decode_label = (args.decode_type == 'label')
66 | 
67 |     predictor = predict_w(if_cuda, f_map, l_map, f_map['<eof>'], l_map['<pad>'], l_map['<start>'], decode_label, args.batch_size, jd['caseless'])
68 | 
69 |     print('annotating')
70 |     with open(args.output_file, 'w') as fout:
71 |         predictor.output_batch(ner_model, features, fout)


--------------------------------------------------------------------------------
/seq_wc.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | import datetime
 3 | import time
 4 | import torch
 5 | import torch.autograd as autograd
 6 | import torch.nn as nn
 7 | import torch.optim as optim
 8 | import codecs
 9 | from model.crf import *
10 | from model.lm_lstm_crf import *
11 | import model.utils as utils
12 | from model.predictor import predict_wc
13 | 
14 | import argparse
15 | import json
16 | import os
17 | import sys
18 | from tqdm import tqdm
19 | import itertools
20 | import functools
21 | 
22 | if __name__ == "__main__":
23 |     parser = argparse.ArgumentParser(description='Evaluating LM-BLSTM-CRF')
24 |     parser.add_argument('--load_arg', default='./checkpoint/ner/ner_4_cwlm_lstm_crf.json', help='path to arg json')
25 |     parser.add_argument('--load_check_point', default='./checkpoint/ner/ner_4_cwlm_lstm_crf.model', help='path to model checkpoint file')
26 |     parser.add_argument('--gpu',type=int, default=0, help='gpu id')
27 |     parser.add_argument('--decode_type', choices=['label', 'string'], default='string', help='type of decode function, set `label` to couple label with text, or set `string` to insert label into test')
28 |     parser.add_argument('--batch_size', type=int, default=50, help='size of batch')
29 |     parser.add_argument('--input_file', default='data/ner2003/test.txt', help='path to input un-annotated corpus')
30 |     parser.add_argument('--output_file', default='output.txt', help='path to output file')
31 |     args = parser.parse_args()
32 | 
33 |     print('loading dictionary')
34 |     with open(args.load_arg, 'r') as f:
35 |         jd = json.load(f)
36 |     jd = jd['args']
37 | 
38 |     checkpoint_file = torch.load(args.load_check_point, map_location=lambda storage, loc: storage)
39 |     f_map = checkpoint_file['f_map']
40 |     l_map = checkpoint_file['l_map']
41 |     c_map = checkpoint_file['c_map']
42 |     in_doc_words = checkpoint_file['in_doc_words']
43 |     if args.gpu >= 0:
44 |         torch.cuda.set_device(args.gpu)
45 | 
46 |     # loading corpus
47 |     print('loading corpus')
48 |     with codecs.open(args.input_file, 'r', 'utf-8') as f:
49 |         lines = f.readlines()
50 | 
51 |     # converting format
52 |     features = utils.read_features(lines)
53 | 
54 |     # build model
55 |     print('loading model')
56 |     ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'], jd['char_hidden'], jd['char_layers'], jd['word_dim'], jd['word_hidden'], jd['word_layers'], len(f_map), jd['drop_out'], large_CRF=jd['small_crf'], if_highway=jd['high_way'], in_doc_words=in_doc_words, highway_layers = jd['highway_layers'])
57 | 
58 |     ner_model.load_state_dict(checkpoint_file['state_dict'])
59 | 
60 |     if args.gpu >= 0:
61 |         if_cuda = True
62 |         torch.cuda.set_device(args.gpu)
63 |         ner_model.cuda()
64 |     else:
65 |         if_cuda = False
66 | 
67 |     decode_label = (args.decode_type == 'label')
68 |     predictor = predict_wc(if_cuda, f_map, c_map, l_map, f_map['<eof>'], c_map['\n'], l_map['<pad>'], l_map['<start>'], decode_label, args.batch_size, jd['caseless'])
69 | 
70 |     print('annotating')
71 |     with open(args.output_file, 'w') as fout:
72 |         predictor.output_batch(ner_model, features, fout)


--------------------------------------------------------------------------------
/train_w.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import datetime
  3 | import time
  4 | import torch
  5 | import torch.autograd as autograd
  6 | import torch.nn as nn
  7 | import torch.optim as optim
  8 | import codecs
  9 | from model.crf import *
 10 | from model.lstm_crf import *
 11 | import model.utils as utils
 12 | from model.evaluator import eval_w
 13 | 
 14 | import argparse
 15 | import json
 16 | import os
 17 | import sys
 18 | from tqdm import tqdm
 19 | import itertools
 20 | import functools
 21 | 
 22 | def eprint(*args, **kwargs):
 23 |     print(*args, file=sys.stderr, **kwargs)
 24 | 
 25 | if __name__ == "__main__":
 26 |     parser = argparse.ArgumentParser(description='Learning with BLSTM-CRF')
 27 |     parser.add_argument('--rand_embedding', action='store_true', help='random initialize word embedding')
 28 |     parser.add_argument('--emb_file', default='./embedding/glove.6B.100d.txt', help='path to pre-trained embedding')
 29 |     parser.add_argument('--train_file', default='./data/ner2003/eng.train.iobes', help='path to training file')
 30 |     parser.add_argument('--dev_file', default='./data/ner2003/eng.testa.iobes', help='path to development file')
 31 |     parser.add_argument('--test_file', default='./data/ner2003/eng.testb.iobes', help='path to test file')
 32 |     parser.add_argument('--gpu', type=int, default=0, help='gpu id, set to -1 if use cpu mode')
 33 |     parser.add_argument('--batch_size', type=int, default=10, help='batch size (10)')
 34 |     parser.add_argument('--unk', default='unk', help='unknow-token in pre-trained embedding')
 35 |     parser.add_argument('--checkpoint', default='./checkpoint/', help='path to checkpoint prefix')
 36 |     parser.add_argument('--hidden', type=int, default=100, help='hidden dimension')
 37 |     parser.add_argument('--drop_out', type=float, default=0.55, help='dropout ratio')
 38 |     parser.add_argument('--epoch', type=int, default=200, help='maximum epoch number')
 39 |     parser.add_argument('--start_epoch', type=int, default=0, help='start epoch idx')
 40 |     parser.add_argument('--caseless', action='store_true', help='caseless or not')
 41 |     parser.add_argument('--embedding_dim', type=int, default=100, help='dimension for word embedding')
 42 |     parser.add_argument('--layers', type=int, default=1, help='number of lstm layers')
 43 |     parser.add_argument('--lr', type=float, default=0.015, help='initial learning rate')
 44 |     parser.add_argument('--lr_decay', type=float, default=0.05, help='decay ratio of learning rate')
 45 |     parser.add_argument('--fine_tune', action='store_false', help='fine tune pre-trained embedding dictionary')
 46 |     parser.add_argument('--load_check_point', default='', help='path of checkpoint')
 47 |     parser.add_argument('--load_opt', action='store_true', help='load optimizer from ')
 48 |     parser.add_argument('--update', choices=['sgd', 'adam'], default='sgd', help='optimizer method')
 49 |     parser.add_argument('--momentum', type=float, default=0.9, help='momentum for sgd')
 50 |     parser.add_argument('--clip_grad', type=float, default=5.0, help='grad clip at')
 51 |     parser.add_argument('--small_crf', action='store_false', help='use small crf instead of large crf, refer model.crf module for more details')
 52 |     parser.add_argument('--mini_count', type=float, default=5, help='thresholds to replace rare words with <unk>')
 53 |     parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or accuracy alone')
 54 |     parser.add_argument('--patience', type=int, default=15, help='patience for early stop')
 55 |     parser.add_argument('--least_iters', type=int, default=50, help='at least train how many epochs before stop')
 56 |     parser.add_argument('--shrink_embedding', action='store_true', help='shrink the embedding dictionary to corpus (open this if pre-trained embedding dictionary is too large, but disable this may yield better results on external corpus)')
 57 |     args = parser.parse_args()
 58 | 
 59 |     if args.gpu >= 0:
 60 |         torch.cuda.set_device(args.gpu)
 61 | 
 62 |     print('setting:')
 63 |     print(args)
 64 | 
 65 |     # load corpus
 66 |     print('loading corpus')
 67 |     with codecs.open(args.train_file, 'r', 'utf-8') as f:
 68 |         lines = f.readlines()
 69 |     with codecs.open(args.dev_file, 'r', 'utf-8') as f:
 70 |         dev_lines = f.readlines()
 71 |     with codecs.open(args.test_file, 'r', 'utf-8') as f:
 72 |         test_lines = f.readlines()
 73 | 
 74 |     # converting format
 75 |     dev_features, dev_labels = utils.read_corpus(dev_lines)
 76 |     test_features, test_labels = utils.read_corpus(test_lines)
 77 | 
 78 |     if args.load_check_point:
 79 |         if os.path.isfile(args.load_check_point):
 80 |             print("loading checkpoint: '{}'".format(args.load_check_point))
 81 |             checkpoint_file = torch.load(args.load_check_point)
 82 |             args.start_epoch = checkpoint_file['epoch']
 83 |             f_map = checkpoint_file['f_map']
 84 |             l_map = checkpoint_file['l_map']
 85 |             train_features, train_labels = utils.read_corpus(lines)
 86 |         else:
 87 |             print("no checkpoint found at: '{}'".format(args.load_check_point))
 88 |     else:
 89 |         print('constructing coding table')
 90 | 
 91 |         # converting format
 92 | 
 93 |         train_features, train_labels, f_map, l_map = utils.generate_corpus(lines, if_shrink_feature=True, thresholds=0)
 94 |         
 95 |         f_set = {v for v in f_map}
 96 |         f_map = utils.shrink_features(f_map, train_features, args.mini_count)
 97 | 
 98 |         dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_features), f_set)
 99 |         dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_features), dt_f_set)
100 | 
101 |         if not args.rand_embedding:
102 |             print("feature size: '{}'".format(len(f_map)))
103 |             print('loading embedding')
104 |             if args.fine_tune:  # which means does not do fine-tune
105 |                 f_map = {'<eof>': 0}
106 |             f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set,args.caseless,args.unk, args.embedding_dim, shrink_to_corpus=args.shrink_embedding)
107 |             print("embedding size: '{}'".format(len(f_map)))
108 | 
109 |         l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
110 |         l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)
111 |         for label in l_set:
112 |             if label not in l_map:
113 |                 l_map[label] = len(l_map)
114 | 
115 |     # construct dataset
116 |     dataset = utils.construct_bucket_mean_vb(train_features, train_labels, f_map, l_map, args.caseless)
117 |     dev_dataset = utils.construct_bucket_mean_vb(dev_features, dev_labels, f_map, l_map, args.caseless)
118 |     test_dataset = utils.construct_bucket_mean_vb(test_features, test_labels, f_map, l_map, args.caseless)
119 | 
120 |     dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset]
121 |     dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
122 |     test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]
123 | 
124 |     # build model
125 |     print('building model')
126 |     ner_model = LSTM_CRF(len(f_map), len(l_map), args.embedding_dim, args.hidden, args.layers, args.drop_out, large_CRF=args.small_crf)
127 | 
128 |     if args.load_check_point:
129 |             ner_model.load_state_dict(checkpoint_file['state_dict'])
130 |     else:
131 |         if not args.rand_embedding:
132 |             ner_model.load_pretrained_embedding(embedding_tensor)
133 |         print('random initialization')
134 |         ner_model.rand_init(init_embedding=args.rand_embedding)
135 | 
136 |     if args.update == 'sgd':
137 |         optimizer = optim.SGD(ner_model.parameters(), lr=args.lr, momentum=args.momentum)
138 |     elif args.update == 'adam':
139 |         optimizer = optim.Adam(ner_model.parameters(), lr=args.lr)
140 | 
141 | 
142 |     if args.load_check_point and args.load_opt:
143 |         optimizer.load_state_dict(checkpoint_file['optimizer'])
144 | 
145 |     crit = CRFLoss_vb(len(l_map), l_map['<start>'], l_map['<pad>'])
146 | 
147 |     if args.gpu >= 0:
148 |         if_cuda = True
149 |         print('device: ' + str(args.gpu))
150 |         torch.cuda.set_device(args.gpu)
151 |         crit.cuda()
152 |         ner_model.cuda()
153 |         packer = CRFRepack(len(l_map), True)
154 |     else:
155 |         if_cuda = False
156 |         packer = CRFRepack(len(l_map), False)
157 | 
158 |     if args.load_check_point:
159 |         dev_f1, dev_acc = eval_batch(ner_model, dev_dataset_loader, pack, l_map)
160 |         test_f1, test_acc = eval_batch(ner_model, test_dataset_loader, pack, l_map)
161 |         print('(checkpoint: dev F1 = %.4f, dev acc = %.4f, F1 on test = %.4f, acc on test= %.4f)' %
162 |               (dev_f1,
163 |                dev_acc,
164 |                test_f1,
165 |                test_acc))
166 | 
167 |     tot_length = sum(map(lambda t: len(t), dataset_loader))
168 |     best_f1 = float('-inf')
169 |     best_acc = float('-inf')
170 |     track_list = list()
171 |     start_time = time.time()
172 |     epoch_list = range(args.start_epoch, args.start_epoch + args.epoch)
173 |     patience_count = 0
174 | 
175 |     evaluator = eval_w(packer, l_map, args.eva_matrix)
176 | 
177 |     for epoch_idx, args.start_epoch in enumerate(epoch_list):
178 | 
179 |         epoch_loss = 0
180 |         ner_model.train()
181 | 
182 |         for feature, tg, mask in tqdm(
183 |                 itertools.chain.from_iterable(dataset_loader), mininterval=2,
184 |                 desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch), leave=False, file=sys.stdout):
185 | 
186 |             fea_v, tg_v, mask_v = packer.repack_vb(feature, tg, mask)
187 |             ner_model.zero_grad()
188 |             scores, hidden = ner_model.forward(fea_v)
189 |             loss = crit.forward(scores, tg_v, mask_v)
190 |             loss.backward()
191 |             nn.utils.clip_grad_norm_(ner_model.parameters(), args.clip_grad)
192 |             optimizer.step()
193 |             epoch_loss += utils.to_scalar(loss)
194 | 
195 |         # update lr
196 |         utils.adjust_learning_rate(optimizer, args.lr / (1 + (args.start_epoch + 1) * args.lr_decay))
197 | 
198 |         # average
199 |         epoch_loss /= tot_length
200 | 
201 |         # eval & save check_point
202 | 
203 |         if 'f' in args.eva_matrix:
204 |             dev_result = evaluator.calc_score(ner_model, dev_dataset_loader)
205 |             for label, (dev_f1, dev_pre, dev_rec, dev_acc, msg) in dev_result.items():
206 |                 print('DEV : %s : dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f | %s\n' % (label, dev_f1, dev_pre, dev_rec, dev_acc, msg))
207 |             (dev_f1, dev_pre, dev_rec, dev_acc, msg) = dev_result['total']
208 | 
209 |             if dev_f1 > best_f1:
210 |                 patience_count = 0
211 |                 best_f1 = dev_f1
212 | 
213 |                 test_result = evaluator.calc_score(ner_model, test_dataset_loader)
214 |                 for label, (test_f1, test_pre, test_rec, test_acc, msg) in test_result.items():
215 |                     print('TEST : %s : test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f | %s\n' % (label, test_f1, test_rec, test_pre, test_acc, msg))
216 |                 (test_f1, test_rec, test_pre, test_acc, msg) = test_result['total']
217 | 
218 |                 track_list.append(
219 |                     {'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc, 'test_f1': test_f1,
220 |                      'test_acc': test_acc})
221 | 
222 |                 print(
223 |                     '(loss: %.4f, epoch: %d, dev F1 = %.4f, dev acc = %.4f, F1 on test = %.4f, acc on test= %.4f), saving...' %
224 |                     (epoch_loss,
225 |                      args.start_epoch,
226 |                      dev_f1,
227 |                      dev_acc,
228 |                      test_f1,
229 |                      test_acc))
230 | 
231 |                 try:
232 |                     utils.save_checkpoint({
233 |                         'epoch': args.start_epoch,
234 |                         'state_dict': ner_model.state_dict(),
235 |                         'optimizer': optimizer.state_dict(),
236 |                         'f_map': f_map,
237 |                         'l_map': l_map,
238 |                     }, {'track_list': track_list,
239 |                         'args': vars(args)
240 |                         }, args.checkpoint + 'lstm_crf')
241 |                 except Exception as inst:
242 |                     print(inst)
243 | 
244 |             else:
245 |                 patience_count += 1
246 |                 print('(loss: %.4f, epoch: %d, dev F1 = %.4f, dev acc = %.4f)' %
247 |                       (epoch_loss,
248 |                        args.start_epoch,
249 |                        dev_f1,
250 |                        dev_acc))
251 |                 track_list.append({'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc})
252 | 
253 |         else:
254 | 
255 |             dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader)
256 | 
257 |             if dev_acc > best_acc:
258 |                 patience_count = 0
259 |                 best_acc = dev_acc
260 |                 
261 |                 test_acc = evaluator.calc_score(ner_model, test_dataset_loader)
262 | 
263 |                 track_list.append(
264 |                     {'loss': epoch_loss, 'dev_acc': dev_acc, 'test_acc': test_acc})
265 | 
266 |                 print(
267 |                     '(loss: %.4f, epoch: %d, dev acc = %.4f, acc on test= %.4f), saving...' %
268 |                     (epoch_loss,
269 |                      args.start_epoch,
270 |                      dev_acc,
271 |                      test_acc))
272 | 
273 |                 try:
274 |                     utils.save_checkpoint({
275 |                         'epoch': args.start_epoch,
276 |                         'state_dict': ner_model.state_dict(),
277 |                         'optimizer': optimizer.state_dict(),
278 |                         'f_map': f_map,
279 |                         'l_map': l_map,
280 |                     }, {'track_list': track_list,
281 |                         'args': vars(args)
282 |                         }, args.checkpoint + 'lstm_crf')
283 |                 except Exception as inst:
284 |                     print(inst)
285 | 
286 |             else:
287 |                 patience_count += 1
288 |                 print('(loss: %.4f, epoch: %d, dev acc = %.4f)' %
289 |                       (epoch_loss,
290 |                        args.start_epoch,
291 |                        dev_acc))
292 |                 track_list.append({'loss': epoch_loss, 'dev_acc': dev_acc})
293 | 
294 |         print('epoch: ' + str(args.start_epoch) + '\t in ' + str(args.epoch) + ' take: ' + str(
295 |             time.time() - start_time) + ' s')
296 | 
297 |         if patience_count >= args.patience and args.start_epoch >= args.least_iters:
298 |             break
299 | 
300 |     #print best
301 |     if 'f' in args.eva_matrix:
302 |         eprint(args.checkpoint + ' dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (dev_f1, dev_rec, dev_pre, dev_acc, test_f1, test_rec, test_pre, test_acc))
303 |     else:
304 |         eprint(args.checkpoint + ' dev_acc: %.4f test_acc: %.4f\n' % (dev_acc, test_acc))
305 | 
306 |     # printing summary
307 |     print('setting:')
308 |     print(args)
309 | 


--------------------------------------------------------------------------------
/train_wc.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import datetime
  3 | import time
  4 | import torch
  5 | import torch.autograd as autograd
  6 | import torch.nn as nn
  7 | import torch.optim as optim
  8 | import codecs
  9 | from model.crf import *
 10 | from model.lm_lstm_crf import *
 11 | import model.utils as utils
 12 | from model.evaluator import eval_wc
 13 | 
 14 | import argparse
 15 | import json
 16 | import os
 17 | import sys
 18 | from tqdm import tqdm
 19 | import itertools
 20 | import functools
 21 | 
 22 | def eprint(*args, **kwargs):
 23 |     print(*args, file=sys.stderr, **kwargs)
 24 | 
 25 | if __name__ == "__main__":
 26 |     parser = argparse.ArgumentParser(description='Learning with LM-LSTM-CRF together with Language Model')
 27 |     parser.add_argument('--rand_embedding', action='store_true', help='random initialize word embedding')
 28 |     parser.add_argument('--emb_file', default='./embedding/glove.6B.100d.txt', help='path to pre-trained embedding')
 29 |     parser.add_argument('--train_file', default='./data/ner/eng.train.iobes', help='path to training file')
 30 |     parser.add_argument('--dev_file', default='./data/ner/eng.testa.iobes', help='path to development file')
 31 |     parser.add_argument('--test_file', default='./data/ner/eng.testb.iobes', help='path to test file')
 32 |     parser.add_argument('--gpu', type=int, default=0, help='gpu id')
 33 |     parser.add_argument('--batch_size', type=int, default=10, help='batch_size')
 34 |     parser.add_argument('--unk', default='unk', help='unknow-token in pre-trained embedding')
 35 |     parser.add_argument('--char_hidden', type=int, default=300, help='dimension of char-level layers')
 36 |     parser.add_argument('--word_hidden', type=int, default=300, help='dimension of word-level layers')
 37 |     parser.add_argument('--drop_out', type=float, default=0.55, help='dropout ratio')
 38 |     parser.add_argument('--epoch', type=int, default=200, help='maximum epoch number')
 39 |     parser.add_argument('--start_epoch', type=int, default=0, help='start point of epoch')
 40 |     parser.add_argument('--checkpoint', default='./checkpoint/', help='checkpoint path')
 41 |     parser.add_argument('--caseless', action='store_true', help='caseless or not')
 42 |     parser.add_argument('--char_dim', type=int, default=30, help='dimension of char embedding')
 43 |     parser.add_argument('--word_dim', type=int, default=100, help='dimension of word embedding')
 44 |     parser.add_argument('--char_layers', type=int, default=1, help='number of char level layers')
 45 |     parser.add_argument('--word_layers', type=int, default=1, help='number of word level layers')
 46 |     parser.add_argument('--lr', type=float, default=0.015, help='initial learning rate')
 47 |     parser.add_argument('--lr_decay', type=float, default=0.05, help='decay ratio of learning rate')
 48 |     parser.add_argument('--fine_tune', action='store_false', help='fine tune the diction of word embedding or not')
 49 |     parser.add_argument('--load_check_point', default='', help='path previous checkpoint that want to be loaded')
 50 |     parser.add_argument('--load_opt', action='store_true', help='also load optimizer from the checkpoint')
 51 |     parser.add_argument('--update', choices=['sgd', 'adam'], default='sgd', help='optimizer choice')
 52 |     parser.add_argument('--momentum', type=float, default=0.9, help='momentum for sgd')
 53 |     parser.add_argument('--clip_grad', type=float, default=5.0, help='clip grad at')
 54 |     parser.add_argument('--small_crf', action='store_false', help='use small crf instead of large crf, refer model.crf module for more details')
 55 |     parser.add_argument('--mini_count', type=float, default=5, help='thresholds to replace rare words with <unk>')
 56 |     parser.add_argument('--lambda0', type=float, default=1, help='lambda0')
 57 |     parser.add_argument('--co_train', action='store_true', help='cotrain language model')
 58 |     parser.add_argument('--patience', type=int, default=15, help='patience for early stop')
 59 |     parser.add_argument('--high_way', action='store_true', help='use highway layers')
 60 |     parser.add_argument('--highway_layers', type=int, default=1, help='number of highway layers')
 61 |     parser.add_argument('--eva_matrix', choices=['a', 'fa'], default='fa', help='use f1 and accuracy or accuracy alone')
 62 |     parser.add_argument('--least_iters', type=int, default=50, help='at least train how many epochs before stop')
 63 |     parser.add_argument('--shrink_embedding', action='store_true', help='shrink the embedding dictionary to corpus (open this if pre-trained embedding dictionary is too large, but disable this may yield better results on external corpus)')
 64 |     args = parser.parse_args()
 65 | 
 66 |     if args.gpu >= 0:
 67 |         torch.cuda.set_device(args.gpu)
 68 | 
 69 |     print('setting:')
 70 |     print(args)
 71 | 
 72 |     # load corpus
 73 |     print('loading corpus')
 74 |     with codecs.open(args.train_file, 'r', 'utf-8') as f:
 75 |         lines = f.readlines()
 76 |     with codecs.open(args.dev_file, 'r', 'utf-8') as f:
 77 |         dev_lines = f.readlines()
 78 |     with codecs.open(args.test_file, 'r', 'utf-8') as f:
 79 |         test_lines = f.readlines()
 80 | 
 81 |     dev_features, dev_labels = utils.read_corpus(dev_lines)
 82 |     test_features, test_labels = utils.read_corpus(test_lines)
 83 | 
 84 |     if args.load_check_point:
 85 |         if os.path.isfile(args.load_check_point):
 86 |             print("loading checkpoint: '{}'".format(args.load_check_point))
 87 |             checkpoint_file = torch.load(args.load_check_point)
 88 |             args.start_epoch = checkpoint_file['epoch']
 89 |             f_map = checkpoint_file['f_map']
 90 |             l_map = checkpoint_file['l_map']
 91 |             c_map = checkpoint_file['c_map']
 92 |             in_doc_words = checkpoint_file['in_doc_words']
 93 |             train_features, train_labels = utils.read_corpus(lines)
 94 |         else:
 95 |             print("no checkpoint found at: '{}'".format(args.load_check_point))
 96 |     else:
 97 |         print('constructing coding table')
 98 | 
 99 |         # converting format
100 |         train_features, train_labels, f_map, l_map, c_map = utils.generate_corpus_char(lines, if_shrink_c_feature=True, c_thresholds=args.mini_count, if_shrink_w_feature=False)
101 |         
102 |         f_set = {v for v in f_map}
103 |         f_map = utils.shrink_features(f_map, train_features, args.mini_count)
104 | 
105 |         if args.rand_embedding:
106 |             print("embedding size: '{}'".format(len(f_map)))
107 |             in_doc_words = len(f_map)
108 |         else:
109 |             dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_features), f_set)
110 |             dt_f_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_features), dt_f_set)
111 |             print("feature size: '{}'".format(len(f_map)))
112 |             print('loading embedding')
113 |             if args.fine_tune:  # which means does not do fine-tune
114 |                 f_map = {'<eof>': 0}
115 |             f_map, embedding_tensor, in_doc_words = utils.load_embedding_wlm(args.emb_file, ' ', f_map, dt_f_set, args.caseless, args.unk, args.word_dim, shrink_to_corpus=args.shrink_embedding)
116 |             print("embedding size: '{}'".format(len(f_map)))
117 | 
118 |         l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), dev_labels))
119 |         l_set = functools.reduce(lambda x, y: x | y, map(lambda t: set(t), test_labels), l_set)
120 |         for label in l_set:
121 |             if label not in l_map:
122 |                 l_map[label] = len(l_map)
123 |     
124 |     print('constructing dataset')
125 |     # construct dataset
126 |     dataset, forw_corp, back_corp = utils.construct_bucket_mean_vb_wc(train_features, train_labels, l_map, c_map, f_map, args.caseless)
127 |     dev_dataset, forw_dev, back_dev = utils.construct_bucket_mean_vb_wc(dev_features, dev_labels, l_map, c_map, f_map, args.caseless)
128 |     test_dataset, forw_test, back_test = utils.construct_bucket_mean_vb_wc(test_features, test_labels, l_map, c_map, f_map, args.caseless)
129 |     
130 |     dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False) for tup in dataset]
131 |     dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in dev_dataset]
132 |     test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False) for tup in test_dataset]
133 | 
134 |     # build model
135 |     print('building model')
136 |     ner_model = LM_LSTM_CRF(len(l_map), len(c_map), args.char_dim, args.char_hidden, args.char_layers, args.word_dim, args.word_hidden, args.word_layers, len(f_map), args.drop_out, large_CRF=args.small_crf, if_highway=args.high_way, in_doc_words=in_doc_words, highway_layers = args.highway_layers)
137 | 
138 |     if args.load_check_point:
139 |         ner_model.load_state_dict(checkpoint_file['state_dict'])
140 |     else:
141 |         if not args.rand_embedding:
142 |             ner_model.load_pretrained_word_embedding(embedding_tensor)
143 |         ner_model.rand_init(init_word_embedding=args.rand_embedding)
144 | 
145 |     if args.update == 'sgd':
146 |         optimizer = optim.SGD(ner_model.parameters(), lr=args.lr, momentum=args.momentum)
147 |     elif args.update == 'adam':
148 |         optimizer = optim.Adam(ner_model.parameters(), lr=args.lr)
149 | 
150 |     if args.load_check_point and args.load_opt:
151 |         optimizer.load_state_dict(checkpoint_file['optimizer'])
152 | 
153 |     crit_lm = nn.CrossEntropyLoss()
154 |     crit_ner = CRFLoss_vb(len(l_map), l_map['<start>'], l_map['<pad>'])
155 | 
156 |     if args.gpu >= 0:
157 |         if_cuda = True
158 |         print('device: ' + str(args.gpu))
159 |         torch.cuda.set_device(args.gpu)
160 |         crit_ner.cuda()
161 |         crit_lm.cuda()
162 |         ner_model.cuda()
163 |         packer = CRFRepack_WC(len(l_map), True)
164 |     else:
165 |         if_cuda = False
166 |         packer = CRFRepack_WC(len(l_map), False)
167 | 
168 |     tot_length = sum(map(lambda t: len(t), dataset_loader))
169 | 
170 |     best_f1 = float('-inf')
171 |     best_acc = float('-inf')
172 |     track_list = list()
173 |     start_time = time.time()
174 |     epoch_list = range(args.start_epoch, args.start_epoch + args.epoch)
175 |     patience_count = 0
176 | 
177 |     evaluator = eval_wc(packer, l_map, args.eva_matrix)
178 | 
179 |     for epoch_idx, args.start_epoch in enumerate(epoch_list):
180 | 
181 |         epoch_loss = 0
182 |         ner_model.train()
183 |         for f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v in tqdm(
184 |                 itertools.chain.from_iterable(dataset_loader), mininterval=2,
185 |                 desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch), leave=False, file=sys.stdout):
186 |             f_f, f_p, b_f, b_p, w_f, tg_v, mask_v = packer.repack_vb(f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, len_v)
187 |             ner_model.zero_grad()
188 |             scores = ner_model(f_f, f_p, b_f, b_p, w_f)
189 |             loss = crit_ner(scores, tg_v, mask_v)
190 |             epoch_loss += utils.to_scalar(loss)
191 |             if args.co_train:
192 |                 cf_p = f_p[0:-1, :].contiguous()
193 |                 cb_p = b_p[1:, :].contiguous()
194 |                 cf_y = w_f[1:, :].contiguous()
195 |                 cb_y = w_f[0:-1, :].contiguous()
196 |                 cfs, _ = ner_model.word_pre_train_forward(f_f, cf_p)
197 |                 loss = loss + args.lambda0 * crit_lm(cfs, cf_y.view(-1))
198 |                 cbs, _ = ner_model.word_pre_train_backward(b_f, cb_p)
199 |                 loss = loss + args.lambda0 * crit_lm(cbs, cb_y.view(-1))
200 |             loss.backward()
201 |             nn.utils.clip_grad_norm_(ner_model.parameters(), args.clip_grad)
202 |             optimizer.step()
203 |         epoch_loss /= tot_length
204 | 
205 |         # update lr
206 |         if args.update == 'sgd':
207 |             utils.adjust_learning_rate(optimizer, args.lr / (1 + (args.start_epoch + 1) * args.lr_decay))
208 | 
209 |         # eval & save check_point
210 | 
211 |         if 'f' in args.eva_matrix:
212 |             dev_result = evaluator.calc_score(ner_model, dev_dataset_loader)
213 |             for label, (dev_f1, dev_pre, dev_rec, dev_acc, msg) in dev_result.items():
214 |                 print('DEV : %s : dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f | %s\n' % (label, dev_f1, dev_rec, dev_pre, dev_acc, msg))
215 |             (dev_f1, dev_pre, dev_rec, dev_acc, msg) = dev_result['total']
216 | 
217 |             if dev_f1 > best_f1:
218 |                 patience_count = 0
219 |                 best_f1 = dev_f1
220 | 
221 |                 test_result = evaluator.calc_score(ner_model, test_dataset_loader)
222 |                 for label, (test_f1, test_pre, test_rec, test_acc, msg) in test_result.items():
223 |                     print('TEST : %s : test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f | %s\n' % (label, test_f1, test_rec, test_pre, test_acc, msg))
224 |                 (test_f1, test_pre, test_rec, test_acc, msg) = test_result['total']
225 | 
226 |                 track_list.append(
227 |                     {'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc, 'test_f1': test_f1,
228 |                      'test_acc': test_acc})
229 | 
230 |                 print(
231 |                     '(loss: %.4f, epoch: %d, dev F1 = %.4f, dev acc = %.4f, F1 on test = %.4f, acc on test= %.4f), saving...' %
232 |                     (epoch_loss,
233 |                      args.start_epoch,
234 |                      dev_f1,
235 |                      dev_acc,
236 |                      test_f1,
237 |                      test_acc))
238 | 
239 |                 try:
240 |                     utils.save_checkpoint({
241 |                         'epoch': args.start_epoch,
242 |                         'state_dict': ner_model.state_dict(),
243 |                         'optimizer': optimizer.state_dict(),
244 |                         'f_map': f_map,
245 |                         'l_map': l_map,
246 |                         'c_map': c_map,
247 |                         'in_doc_words': in_doc_words
248 |                     }, {'track_list': track_list,
249 |                         'args': vars(args)
250 |                         }, args.checkpoint + 'cwlm_lstm_crf')
251 |                 except Exception as inst:
252 |                     print(inst)
253 | 
254 |             else:
255 |                 patience_count += 1
256 |                 print('(loss: %.4f, epoch: %d, dev F1 = %.4f, dev acc = %.4f)' %
257 |                       (epoch_loss,
258 |                        args.start_epoch,
259 |                        dev_f1,
260 |                        dev_acc))
261 |                 track_list.append({'loss': epoch_loss, 'dev_f1': dev_f1, 'dev_acc': dev_acc})
262 | 
263 |         else:
264 | 
265 |             dev_acc = evaluator.calc_score(ner_model, dev_dataset_loader)
266 | 
267 |             if dev_acc > best_acc:
268 |                 patience_count = 0
269 |                 best_acc = dev_acc
270 |                 
271 |                 test_acc = evaluator.calc_score(ner_model, test_dataset_loader)
272 | 
273 |                 track_list.append(
274 |                     {'loss': epoch_loss, 'dev_acc': dev_acc, 'test_acc': test_acc})
275 | 
276 |                 print(
277 |                     '(loss: %.4f, epoch: %d, dev acc = %.4f, acc on test= %.4f), saving...' %
278 |                     (epoch_loss,
279 |                      args.start_epoch,
280 |                      dev_acc,
281 |                      test_acc))
282 | 
283 |                 try:
284 |                     utils.save_checkpoint({
285 |                         'epoch': args.start_epoch,
286 |                         'state_dict': ner_model.state_dict(),
287 |                         'optimizer': optimizer.state_dict(),
288 |                         'f_map': f_map,
289 |                         'l_map': l_map,
290 |                         'c_map': c_map,
291 |                         'in_doc_words': in_doc_words
292 |                     }, {'track_list': track_list,
293 |                         'args': vars(args)
294 |                         }, args.checkpoint + 'cwlm_lstm_crf')
295 |                 except Exception as inst:
296 |                     print(inst)
297 | 
298 |             else:
299 |                 patience_count += 1
300 |                 print('(loss: %.4f, epoch: %d, dev acc = %.4f)' %
301 |                       (epoch_loss,
302 |                        args.start_epoch,
303 |                        dev_acc))
304 |                 track_list.append({'loss': epoch_loss, 'dev_acc': dev_acc})
305 | 
306 |         print('epoch: ' + str(args.start_epoch) + '\t in ' + str(args.epoch) + ' take: ' + str(
307 |             time.time() - start_time) + ' s')
308 | 
309 |         if patience_count >= args.patience and args.start_epoch >= args.least_iters:
310 |             break
311 | 
312 |     #print best
313 |     if 'f' in args.eva_matrix:
314 |         eprint(args.checkpoint + ' dev_f1: %.4f dev_rec: %.4f dev_pre: %.4f dev_acc: %.4f test_f1: %.4f test_rec: %.4f test_pre: %.4f test_acc: %.4f\n' % (dev_f1, dev_rec, dev_pre, dev_acc, test_f1, test_rec, test_pre, test_acc))
315 |     else:
316 |         eprint(args.checkpoint + ' dev_acc: %.4f test_acc: %.4f\n' % (dev_acc, test_acc))
317 | 
318 |     # printing summary
319 |     print('setting:')
320 |     print(args)
321 | 


--------------------------------------------------------------------------------