├── .gitignore ├── LICENCE ├── README.md ├── demo.clf.config ├── demo.decode.config ├── demo.train.config ├── main.py ├── main_parse.py ├── model ├── __init__.py ├── charbigru.py ├── charbilstm.py ├── charcnn.py ├── crf.py ├── sentclassifier.py ├── seqlabel.py ├── wordrep.py └── wordsequence.py ├── readme ├── Configuration.md ├── Extension.md ├── architecture.png ├── hyperparameter_tuning.md ├── logo.png ├── nbest.png └── speed.png ├── sample_data ├── dev.bmes ├── dev.cappos.bmes ├── raw.bmes ├── sample.word.emb ├── test.bmes ├── test.cappos.bmes ├── train.bmes └── train.cappos.bmes └── utils ├── __init__.py ├── alphabet.py ├── data.py ├── functions.py ├── metric.py └── tagSchemeConverter.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | __pycache__ 3 | *.dset 4 | *.model 5 | *.txt 6 | demo.clf.* 7 | sent.* 8 | *.out 9 | *.log 10 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![NCRF++ Logo](https://github.com/jiesutd/NCRFpp/blob/master/readme/logo.png) 2 | 3 | 4 | # NCRF++: An Open-source Neural Sequence Labeling Toolkit 5 | 6 | 7 | * [1. Introduction](#Introduction) 8 | * [2. Requirement](#Requirement) 9 | * [3. Advantages](#Advantages) 10 | * [4. Usage](#Usage) 11 | * [5. Data Format](#Data-Format) 12 | * [6. Performance](#Performance) 13 | * [7. 
Add Handcrafted Features](#Add-Handcrafted-Features) 14 | * [8. Speed](#Speed) 15 | * [9. N best Decoding](#N-best-Decoding) 16 | * [10. Reproduce Paper Results and Hyperparameter Tuning](#Reproduce-Paper-Results-and-Hyperparameter-Tuning) 17 | * [11. Report Issue or Problem](#Report-Issue-or-Problem) 18 | * [12. Cite](#Cite) 19 | * [13. Future Plan](#Future-Plan) 20 | * [14. Update](#Update) 21 | 22 | ## Introduction 23 | 24 | Sequence labeling models are widely used in many NLP tasks, such as Named Entity Recognition (NER), part-of-speech (POS) tagging and word segmentation. State-of-the-art sequence labeling models mostly utilize a CRF structure over input word features. An LSTM (or bidirectional LSTM) is a popular deep-learning-based feature extractor for sequence labeling, and a CNN can also be used because it is faster to compute. In addition, features within a word are useful for representing it; they can be captured by a character LSTM, a character CNN, or handcrafted neural features. 25 | 26 | NCRF++ is a PyTorch-based framework with flexible choices of input features and output structures. The design of neural sequence labeling models with NCRF++ is fully configurable through a configuration file and does not require any coding. NCRF++ can be regarded as a neural network version of [CRF++](http://taku910.github.io/crfpp/), the well-known statistical CRF framework. 27 | 28 | This framework was accepted by [ACL 2018](https://arxiv.org/abs/1806.05626) as a demonstration paper, and the detailed experiment report and analysis using NCRF++ was accepted at [COLING 2018](https://arxiv.org/abs/1806.04470) as a best paper. 29 | 30 | NCRF++ supports different structure combinations on three levels: character sequence representation, word sequence representation and inference layer. 31 | 32 | * Character sequence representation: character LSTM, character GRU, character CNN and handcrafted word features. 33 | * Word sequence representation: word LSTM, word GRU, word CNN. 34 | * Inference layer: Softmax, CRF. 35 | 36 | Welcome to star this repository! 37 | 38 | ## Requirement 39 | 40 | Python: 2 or 3 41 | PyTorch: 1.0 42 | 43 | [The PyTorch 0.3 compatible version is here.](https://github.com/jiesutd/NCRFpp/tree/PyTorch0.3) 44 | 45 | 46 | ## Advantages 47 | 48 | * Fully configurable: all the neural model structures can be set with a configuration file. 49 | * State-of-the-art system performance: models built with NCRF++ give results comparable to or better than state-of-the-art models. 50 | * Flexible with features: users can define their own features and pretrained feature embeddings. 51 | * Fast running speed: NCRF++ uses fully batched operations, making the system efficient with the help of a GPU (>1000 sents/s for training and >2000 sents/s for decoding). 52 | * N best output: NCRF++ supports `nbest` decoding (with sequence probabilities). 53 | 54 | 55 | ## Usage 56 | 57 | NCRF++ supports designing the neural network structure through a configuration file. The program can run in two modes: ***training*** and ***decoding***. (Sample configurations and data are included in this repository.) 58 | 59 | In ***training*** mode: 60 | `python main.py --config demo.train.config` 61 | 62 | In ***decoding*** mode: 63 | `python main.py --config demo.decode.config` 64 | 65 | The configuration file controls the network structure, I/O, training settings and hyperparameters.
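As a quick orientation, a minimal training configuration might look like the sketch below. It is distilled from the `demo.train.config` file shipped in this repository; the directories are placeholders that you should point at your own data and embeddings.

```
### I/O ###
train_dir=sample_data/train.bmes
dev_dir=sample_data/dev.bmes
test_dir=sample_data/test.bmes
model_dir=sample_data/lstmcrf
word_emb_dir=sample_data/sample.word.emb

### NetworkConfiguration ###
use_crf=True
use_char=True
word_seq_feature=LSTM
char_seq_feature=CNN

### TrainingSetting ###
status=train
optimizer=SGD
iteration=1
batch_size=10

### Hyperparameters ###
hidden_dim=200
learning_rate=0.015
```

Passing such a file to `python main.py --config` trains the Char CNN + Word LSTM + CRF model on the bundled sample data.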
66 | 67 | ***Detailed configuration options and explanations are listed [here](readme/Configuration.md).*** 68 | 69 | NCRF++ is designed in three layers (shown below): a character sequence layer, a word sequence layer and an inference layer. By using the configuration file, most state-of-the-art models can be easily replicated ***without coding***. On the other hand, users can extend each layer by designing their own modules (for example, they may want to design neural structures other than CNN/LSTM/GRU). Our layer-wise design makes module extension convenient; instructions for extending modules can be found [here](readme/Extension.md). 70 | 71 | ![alt text](readme/architecture.png "Layer-wise design") 72 | 73 | 74 | ## Data Format 75 | 76 | * You can refer to the data format in [sample_data](sample_data). 77 | * NCRF++ supports both the BIO and BIOES(BMES) tag schemes. 78 | * Notice that the IOB format (***different*** from BIO) is currently not supported, because this tag scheme is outdated and works worse than the other schemes [Reimers and Gurevych, 2017](https://arxiv.org/pdf/1707.06799.pdf). 79 | * The difference among these three tag schemes is explained in this [paper](https://arxiv.org/pdf/1707.06799.pdf). 80 | * I have written a [script](utils/tagSchemeConverter.py) which converts tag schemes among IOB/BIO/BIOES. Welcome to give it a try. 81 | 82 | 83 | ## Performance 84 | 85 | Results on the CoNLL 2003 English NER task are better than or comparable with state-of-the-art results using the same structures. 86 | 87 | CharLSTM+WordLSTM+CRF: 91.20 vs. 90.94 of [Lample et al., NAACL16](http://www.aclweb.org/anthology/N/N16/N16-1030.pdf); 88 | 89 | CharCNN+WordLSTM+CRF: 91.35 vs. 91.21 of [Ma et al., ACL16](http://www.aclweb.org/anthology/P/P16/P16-1101.pdf). 90 | 91 | By default, `LSTM` denotes a bidirectional LSTM. 92 | 93 | |ID| Model | Nochar | CharLSTM |CharCNN 94 | |---|--------- | --- | --- | ------ 95 | |1| WordLSTM | 88.57 | 90.84 | 90.73 96 | |2| WordLSTM+CRF | 89.45 | **91.20** | **91.35** 97 | |3| WordCNN | 88.56| 90.46 | 90.30 98 | |4| WordCNN+CRF | 88.90 | 90.70 | 90.43 99 | 100 | We have compared twelve neural sequence labeling models (`{charLSTM, charCNN, None} x {wordLSTM, wordCNN} x {softmax, CRF}`) on three benchmarks (POS, Chunking, NER) with statistical experiments; detailed results and comparisons can be found in our COLING 2018 paper [Design Challenges and Misconceptions in Neural Sequence Labeling](https://arxiv.org/abs/1806.04470). 101 | 102 | 103 | ## Add Handcrafted Features 104 | 105 | NCRF++ has integrated several state-of-the-art neural character sequence feature extractors: CNN ([Ma et al., ACL16](http://www.aclweb.org/anthology/P/P16/P16-1101.pdf)), LSTM ([Lample et al., NAACL16](http://www.aclweb.org/anthology/N/N16/N16-1030.pdf)) and GRU ([Yang et al., ICLR17](https://arxiv.org/pdf/1703.06345.pdf)). In addition, handcrafted features have been proven important in sequence labeling tasks. NCRF++ allows users to define their own features, such as capitalization, POS tags or any other features (grey circles in the figure above). Users can configure these self-defined features through the configuration file (feature embedding size, pretrained feature embeddings, etc.). A sample of the input data format is given at [train.cappos.bmes](sample_data/train.cappos.bmes), which includes two human-defined features `[POS]` and `[Cap]`. (`[POS]` and `[Cap]` are just examples; you can give your features any names you want, as long as you follow the format `[xx]` and configure features with the same names in the configuration file.)
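For illustration, a fragment of such feature-augmented data might look like the sketch below: one token per line, the word first, each handcrafted feature written as `[FeatureName]value`, the gold label in the last column, and a blank line between sentences. The words, feature values and tags shown here are made up; see [train.cappos.bmes](sample_data/train.cappos.bmes) for the actual sample file.

```
EU [Cap]1 [POS]NNP S-ORG
rejects [Cap]0 [POS]VBZ O
German [Cap]1 [POS]JJ S-MISC
call [Cap]0 [POS]NN O

Peter [Cap]1 [POS]NNP B-PER
Blackburn [Cap]1 [POS]NNP E-PER
```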
106 | Users can configure each feature in the configuration file with lines such as 107 | 108 | ```Python 109 | feature=[POS] emb_size=20 emb_dir=%your_pretrained_POS_embedding 110 | feature=[Cap] emb_size=20 emb_dir=%your_pretrained_Cap_embedding 111 | ``` 112 | 113 | Features without pretrained embeddings will be randomly initialized. 114 | 115 | 116 | ## Speed 117 | 118 | NCRF++ is implemented with fully batched calculations, making it quite efficient for both model training and decoding. With the help of a GPU (Nvidia GTX 1080) and a large batch size, an LSTM-CRF model built with NCRF++ can reach 1000 sents/s in training and 2000 sents/s in decoding. 119 | 120 | ![alt text](readme/speed.png "System speed on NER data") 121 | 122 | 123 | ## N best Decoding 124 | 125 | A traditional CRF decodes only the single label sequence with the largest probability (i.e. the 1-best output). NCRF++ offers a wider choice: it can decode the `n` label sequences with the top `n` probabilities (i.e. n-best output). N-best decoding has been supported by several popular **statistical** CRF frameworks; however, to the best of our knowledge, NCRF++ is the first toolkit to support n-best decoding in **neural** CRF models. 126 | 127 | In our implementation, with nbest=10, the CharCNN+WordLSTM+CRF model built with NCRF++ gives a 97.47% oracle F1 score (F1 = 91.35% when nbest=1) on the CoNLL 2003 NER task. 128 | 129 | ![alt text](readme/nbest.png "N best decoding oracle result") 130 | 131 | 132 | ## Reproduce Paper Results and Hyperparameter Tuning 133 | 134 | To reproduce the results in our COLING 2018 paper, you only need to change `iteration=1` to `iteration=100` in the configuration file `demo.train.config` and set your file directories in the same file. The default configuration file describes the `Char CNN + Word LSTM + CRF` model; you can build your own model by modifying the configuration accordingly. The parameters in this demo configuration file are the same as in our paper. (Note that the `Word CNN` related models need slightly different parameters; details can be found in our COLING paper.) 135 | 136 | If you want to apply this framework to new tasks or datasets, here are some tuning [tips](readme/hyperparameter_tuning.md) by @Victor0118. 137 | 138 | 139 | ## Report Issue or Problem 140 | 141 | If you want to report an issue or ask a question, please attach the following materials if possible. With this information, I can give fast and accurate feedback.
142 | * `log file` 143 | * `config file` 144 | * `sample data` 145 | 146 | 147 | ## Cite 148 | 149 | If you use NCRF++ in your paper, please cite our [ACL demo paper](https://arxiv.org/abs/1806.05626): 150 | 151 | @inproceedings{yang2018ncrf, 152 | title={NCRF++: An Open-source Neural Sequence Labeling Toolkit}, 153 | author={Yang, Jie and Zhang, Yue}, 154 | booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics}, 155 | url = {http://aclweb.org/anthology/P18-4013}, 156 | year={2018} 157 | } 158 | 159 | 160 | If you use the experimental results and analysis of NCRF++, please cite our [COLING paper](https://arxiv.org/abs/1806.04470): 161 | 162 | @inproceedings{yang2018design, 163 | title={Design Challenges and Misconceptions in Neural Sequence Labeling}, 164 | author={Yang, Jie and Liang, Shuailong and Zhang, Yue}, 165 | booktitle={Proceedings of the 27th International Conference on Computational Linguistics (COLING)}, 166 | url = {http://aclweb.org/anthology/C18-1327}, 167 | year={2018} 168 | } 169 | 170 | ## Future Plan 171 | 172 | * Document classification (in progress) 173 | * Support API usage 174 | * Upload trained models for word segmentation/POS tagging/NER 175 | * Enable loading pretrained ELMo parameters 176 | * Add a BERT feature extraction layer 177 | 178 | 179 | 180 | ## Update 181 | 182 | * 2018-Dec-17, NCRF++ v0.2, support PyTorch 1.0 183 | * 2018-Mar-30, NCRF++ v0.1, initial version 184 | * 2018-Jan-06, add result comparison. 185 | * 2018-Jan-02, support character feature selection. 186 | * 2017-Dec-06, init version 187 | 188 | -------------------------------------------------------------------------------- /demo.clf.config: -------------------------------------------------------------------------------- 1 | ### use # to comment out a configuration item 2 | 3 | sentence_classification=True 4 | 5 | ### I/O ### 6 | train_dir=../data/Sentclf/SST1/stsa.fine.train.clf 7 | dev_dir=../data/Sentclf/SST1/stsa.fine.dev.clf 8 | test_dir=../data/Sentclf/SST1/stsa.fine.test.clf 9 | model_dir=sample_data/clf 10 | word_emb_dir=../data/glove.840B.300d.txt 11 | 12 | 13 | #raw_dir= 14 | #decode_dir= 15 | #dset_dir= 16 | #load_model_dir= 17 | #char_emb_dir= 18 | 19 | norm_word_emb=False 20 | norm_char_emb=False 21 | number_normalized=True 22 | seg=False 23 | word_emb_dim=50 24 | char_emb_dim=30 25 | 26 | ###NetworkConfiguration### 27 | use_crf=False 28 | use_char=False 29 | word_seq_feature=LSTM 30 | char_seq_feature=CNN 31 | #feature=[POS] emb_size=20 32 | #feature=[Cap] emb_size=20 33 | #nbest=1 34 | 35 | ###TrainingSetting### 36 | status=train 37 | optimizer=SGD 38 | iteration=50 39 | batch_size=10 40 | ave_batch_loss=False 41 | 42 | ###Hyperparameters### 43 | cnn_layer=4 44 | char_hidden_dim=50 45 | hidden_dim=400 46 | dropout=0 47 | lstm_layer=1 48 | bilstm=True 49 | learning_rate=0.2 50 | lr_decay=0.05 51 | momentum=0 52 | l2=1e-8 53 | #gpu 54 | #clip= 55 | -------------------------------------------------------------------------------- /demo.decode.config: -------------------------------------------------------------------------------- 1 | ### Decode ### 2 | status=decode 3 | raw_dir=sample_data/raw.bmes 4 | nbest=10 5 | decode_dir=sample_data/raw.out 6 | dset_dir=sample_data/lstmcrf.dset 7 | load_model_dir=sample_data/lstmcrf.0.model -------------------------------------------------------------------------------- /demo.train.config: -------------------------------------------------------------------------------- 1 | ### use # to comment out a configuration 
item 2 | 3 | ### I/O ### 4 | train_dir=sample_data/train.bmes 5 | dev_dir=sample_data/dev.bmes 6 | test_dir=sample_data/test.bmes 7 | model_dir=sample_data/lstmcrf 8 | word_emb_dir=sample_data/sample.word.emb 9 | 10 | #raw_dir= 11 | #decode_dir= 12 | #dset_dir= 13 | #load_model_dir= 14 | #char_emb_dir= 15 | 16 | norm_word_emb=False 17 | norm_char_emb=False 18 | number_normalized=True 19 | seg=True 20 | word_emb_dim=50 21 | char_emb_dim=30 22 | 23 | ###NetworkConfiguration### 24 | use_crf=True 25 | use_char=True 26 | word_seq_feature=LSTM 27 | char_seq_feature=CNN 28 | #feature=[POS] emb_size=20 29 | #feature=[Cap] emb_size=20 30 | #nbest=1 31 | 32 | ###TrainingSetting### 33 | status=train 34 | optimizer=SGD 35 | iteration=1 36 | batch_size=10 37 | ave_batch_loss=False 38 | 39 | ###Hyperparameters### 40 | cnn_layer=4 41 | char_hidden_dim=50 42 | hidden_dim=200 43 | dropout=0.5 44 | lstm_layer=1 45 | bilstm=True 46 | learning_rate=0.015 47 | lr_decay=0.05 48 | momentum=0 49 | l2=1e-8 50 | #gpu 51 | #clip= 52 | -------------------------------------------------------------------------------- /main_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-06-15 14:11:08 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-13 10:58:43 6 | 7 | from __future__ import print_function 8 | import time 9 | import sys 10 | import argparse 11 | import random 12 | import copy 13 | import torch 14 | import gc 15 | import torch.autograd as autograd 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | import torch.optim as optim 19 | import numpy as np 20 | from utils.metric import get_ner_fmeasure 21 | from model.seqlabel import SeqLabel 22 | from utils.data import Data 23 | 24 | try: 25 | import cPickle as pickle 26 | except ImportError: 27 | import pickle as pickle 28 | 29 | seed_num = 42 30 | random.seed(seed_num) 31 | torch.manual_seed(seed_num) 32 | np.random.seed(seed_num) 33 | 34 | 35 | def data_initialization(data): 36 | data.initial_feature_alphabets() 37 | data.build_alphabet(data.train_dir) 38 | data.build_alphabet(data.dev_dir) 39 | data.build_alphabet(data.test_dir) 40 | data.fix_alphabet() 41 | 42 | 43 | def predict_check(pred_variable, gold_variable, mask_variable): 44 | """ 45 | input: 46 | pred_variable (batch_size, sent_len): pred tag result, in numpy format 47 | gold_variable (batch_size, sent_len): gold result variable 48 | mask_variable (batch_size, sent_len): mask variable 49 | """ 50 | pred = pred_variable.cpu().data.numpy() 51 | gold = gold_variable.cpu().data.numpy() 52 | mask = mask_variable.cpu().data.numpy() 53 | overlaped = (pred == gold) 54 | right_token = np.sum(overlaped * mask) 55 | total_token = mask.sum() 56 | # print("right: %s, total: %s"%(right_token, total_token)) 57 | return right_token, total_token 58 | 59 | 60 | def recover_label(pred_variable, gold_variable, mask_variable, label_alphabet, word_recover): 61 | """ 62 | input: 63 | pred_variable (batch_size, sent_len): pred tag result 64 | gold_variable (batch_size, sent_len): gold result variable 65 | mask_variable (batch_size, sent_len): mask variable 66 | """ 67 | 68 | pred_variable = pred_variable[word_recover] 69 | gold_variable = gold_variable[word_recover] 70 | mask_variable = mask_variable[word_recover] 71 | batch_size = gold_variable.size(0) 72 | seq_len = gold_variable.size(1) 73 | mask = mask_variable.cpu().data.numpy() 74 | pred_tag = 
pred_variable.cpu().data.numpy() 75 | gold_tag = gold_variable.cpu().data.numpy() 76 | batch_size = mask.shape[0] 77 | pred_label = [] 78 | gold_label = [] 79 | for idx in range(batch_size): 80 | pred = [label_alphabet.get_instance(pred_tag[idx][idy]) for idy in range(seq_len) if mask[idx][idy] != 0] 81 | gold = [label_alphabet.get_instance(gold_tag[idx][idy]) for idy in range(seq_len) if mask[idx][idy] != 0] 82 | # print("p:",pred, pred_tag.tolist()) 83 | # print("g:", gold, gold_tag.tolist()) 84 | assert(len(pred)==len(gold)) 85 | pred_label.append(pred) 86 | gold_label.append(gold) 87 | return pred_label, gold_label 88 | 89 | 90 | def recover_nbest_label(pred_variable, mask_variable, label_alphabet, word_recover): 91 | """ 92 | input: 93 | pred_variable (batch_size, sent_len, nbest): pred tag result 94 | mask_variable (batch_size, sent_len): mask variable 95 | word_recover (batch_size) 96 | output: 97 | nbest_pred_label list: [batch_size, nbest, each_seq_len] 98 | """ 99 | # print("word recover:", word_recover.size()) 100 | # exit(0) 101 | pred_variable = pred_variable[word_recover] 102 | mask_variable = mask_variable[word_recover] 103 | batch_size = pred_variable.size(0) 104 | seq_len = pred_variable.size(1) 105 | print(pred_variable.size()) 106 | nbest = pred_variable.size(2) 107 | mask = mask_variable.cpu().data.numpy() 108 | pred_tag = pred_variable.cpu().data.numpy() 109 | batch_size = mask.shape[0] 110 | pred_label = [] 111 | for idx in range(batch_size): 112 | pred = [] 113 | for idz in range(nbest): 114 | each_pred = [label_alphabet.get_instance(pred_tag[idx][idy][idz]) for idy in range(seq_len) if mask[idx][idy] != 0] 115 | pred.append(each_pred) 116 | pred_label.append(pred) 117 | return pred_label 118 | 119 | 120 | 121 | # def save_data_setting(data, save_file): 122 | # new_data = copy.deepcopy(data) 123 | # ## remove input instances 124 | # new_data.train_texts = [] 125 | # new_data.dev_texts = [] 126 | # new_data.test_texts = [] 127 | # new_data.raw_texts = [] 128 | 129 | # new_data.train_Ids = [] 130 | # new_data.dev_Ids = [] 131 | # new_data.test_Ids = [] 132 | # new_data.raw_Ids = [] 133 | # ## save data settings 134 | # with open(save_file, 'w') as fp: 135 | # pickle.dump(new_data, fp) 136 | # print("Data setting saved to file: ", save_file) 137 | 138 | 139 | # def load_data_setting(save_file): 140 | # with open(save_file, 'r') as fp: 141 | # data = pickle.load(fp) 142 | # print("Data setting loaded from file: ", save_file) 143 | # data.show_data_summary() 144 | # return data 145 | 146 | def lr_decay(optimizer, epoch, decay_rate, init_lr): 147 | lr = init_lr/(1+decay_rate*epoch) 148 | print(" Learning rate is set as:", lr) 149 | for param_group in optimizer.param_groups: 150 | param_group['lr'] = lr 151 | return optimizer 152 | 153 | 154 | 155 | def evaluate(data, model, name, nbest=None): 156 | if name == "train": 157 | instances = data.train_Ids 158 | elif name == "dev": 159 | instances = data.dev_Ids 160 | elif name == 'test': 161 | instances = data.test_Ids 162 | elif name == 'raw': 163 | instances = data.raw_Ids 164 | else: 165 | print("Error: wrong evaluate name,", name) 166 | right_token = 0 167 | whole_token = 0 168 | nbest_pred_results = [] 169 | pred_scores = [] 170 | pred_results = [] 171 | gold_results = [] 172 | ## set model in eval model 173 | model.eval() 174 | batch_size = data.HP_batch_size 175 | start_time = time.time() 176 | train_num = len(instances) 177 | total_batch = train_num//batch_size+1 178 | for batch_id in range(total_batch): 179 | start = 
batch_id*batch_size 180 | end = (batch_id+1)*batch_size 181 | if end > train_num: 182 | end = train_num 183 | instance = instances[start:end] 184 | if not instance: 185 | continue 186 | batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu, True) 187 | if nbest: 188 | scores, nbest_tag_seq = model.decode_nbest(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest) 189 | nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover) 190 | nbest_pred_results += nbest_pred_result 191 | pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist() 192 | ## select the best sequence to evalurate 193 | tag_seq = nbest_tag_seq[:,:,0] 194 | else: 195 | tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) 196 | # print("tag:",tag_seq) 197 | pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) 198 | pred_results += pred_label 199 | gold_results += gold_label 200 | decode_time = time.time() - start_time 201 | speed = len(instances)/decode_time 202 | acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) 203 | if nbest: 204 | return speed, acc, p, r, f, nbest_pred_results, pred_scores 205 | return speed, acc, p, r, f, pred_results, pred_scores 206 | 207 | 208 | def batchify_with_label(input_batch_list, gpu, volatile_flag=False): 209 | """ 210 | input: list of words, chars and labels, various length. [[words,chars, labels],[words,chars,labels],...] 211 | words: word ids for one sentence. (batch_size, sent_len) 212 | chars: char ids for on sentences, various length. 
(batch_size, sent_len, each_word_length) 213 | output: 214 | zero padding for word and char, with their batch length 215 | word_seq_tensor: (batch_size, max_sent_len) Variable 216 | word_seq_lengths: (batch_size,1) Tensor 217 | char_seq_tensor: (batch_size*max_sent_len, max_word_len) Variable 218 | char_seq_lengths: (batch_size*max_sent_len,1) Tensor 219 | char_seq_recover: (batch_size*max_sent_len,1) recover char sequence order 220 | label_seq_tensor: (batch_size, max_sent_len) 221 | mask: (batch_size, max_sent_len) 222 | """ 223 | batch_size = len(input_batch_list) 224 | words = [sent[0] for sent in input_batch_list] 225 | features = [np.asarray(sent[1]) for sent in input_batch_list] 226 | feature_num = len(features[0][0]) 227 | chars = [sent[2] for sent in input_batch_list] 228 | labels = [sent[3] for sent in input_batch_list] 229 | word_seq_lengths = torch.LongTensor(map(len, words)) 230 | max_seq_len = word_seq_lengths.max() 231 | word_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len)), volatile = volatile_flag).long() 232 | label_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile = volatile_flag).long() 233 | feature_seq_tensors = [] 234 | for idx in range(feature_num): 235 | feature_seq_tensors.append(autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile = volatile_flag).long()) 236 | mask = autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile = volatile_flag).bool() 237 | for idx, (seq, label, seqlen) in enumerate(zip(words, labels, word_seq_lengths)): 238 | word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq) 239 | label_seq_tensor[idx, :seqlen] = torch.LongTensor(label) 240 | mask[idx, :seqlen] = torch.Tensor([1]*seqlen) 241 | for idy in range(feature_num): 242 | feature_seq_tensors[idy][idx,:seqlen] = torch.LongTensor(features[idx][:,idy]) 243 | word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True) 244 | word_seq_tensor = word_seq_tensor[word_perm_idx] 245 | for idx in range(feature_num): 246 | feature_seq_tensors[idx] = feature_seq_tensors[idx][word_perm_idx] 247 | 248 | label_seq_tensor = label_seq_tensor[word_perm_idx] 249 | mask = mask[word_perm_idx] 250 | ### deal with char 251 | # pad_chars (batch_size, max_seq_len) 252 | pad_chars = [chars[idx] + [[0]] * (max_seq_len-len(chars[idx])) for idx in range(len(chars))] 253 | length_list = [map(len, pad_char) for pad_char in pad_chars] 254 | max_word_len = max(map(max, length_list)) 255 | char_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len, max_word_len)), volatile = volatile_flag).long() 256 | char_seq_lengths = torch.LongTensor(length_list) 257 | for idx, (seq, seqlen) in enumerate(zip(pad_chars, char_seq_lengths)): 258 | for idy, (word, wordlen) in enumerate(zip(seq, seqlen)): 259 | # print len(word), wordlen 260 | char_seq_tensor[idx, idy, :wordlen] = torch.LongTensor(word) 261 | 262 | char_seq_tensor = char_seq_tensor[word_perm_idx].view(batch_size*max_seq_len,-1) 263 | char_seq_lengths = char_seq_lengths[word_perm_idx].view(batch_size*max_seq_len,) 264 | char_seq_lengths, char_perm_idx = char_seq_lengths.sort(0, descending=True) 265 | char_seq_tensor = char_seq_tensor[char_perm_idx] 266 | _, char_seq_recover = char_perm_idx.sort(0, descending=False) 267 | _, word_seq_recover = word_perm_idx.sort(0, descending=False) 268 | if gpu: 269 | word_seq_tensor = word_seq_tensor.cuda() 270 | for idx in range(feature_num): 271 | feature_seq_tensors[idx] = feature_seq_tensors[idx].cuda() 272 | word_seq_lengths = 
word_seq_lengths.cuda() 273 | word_seq_recover = word_seq_recover.cuda() 274 | label_seq_tensor = label_seq_tensor.cuda() 275 | char_seq_tensor = char_seq_tensor.cuda() 276 | char_seq_recover = char_seq_recover.cuda() 277 | mask = mask.cuda() 278 | return word_seq_tensor,feature_seq_tensors, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, label_seq_tensor, mask 279 | 280 | 281 | def train(data): 282 | print("Training model...") 283 | data.show_data_summary() 284 | save_data_name = data.model_dir +".dset" 285 | data.save(save_data_name) 286 | model = SeqLabel(data) 287 | loss_function = nn.NLLLoss() 288 | if data.optimizer.lower() == "sgd": 289 | optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum,weight_decay=data.HP_l2) 290 | elif data.optimizer.lower() == "adagrad": 291 | optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) 292 | elif data.optimizer.lower() == "adadelta": 293 | optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) 294 | elif data.optimizer.lower() == "rmsprop": 295 | optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) 296 | elif data.optimizer.lower() == "adam": 297 | optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) 298 | else: 299 | print("Optimizer illegal: %s"%(data.optimizer)) 300 | exit(0) 301 | best_dev = -10 302 | # data.HP_iteration = 1 303 | ## start training 304 | for idx in range(data.HP_iteration): 305 | epoch_start = time.time() 306 | temp_start = epoch_start 307 | print("Epoch: %s/%s" %(idx,data.HP_iteration)) 308 | if data.optimizer == "SGD": 309 | optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) 310 | instance_count = 0 311 | sample_id = 0 312 | sample_loss = 0 313 | total_loss = 0 314 | right_token = 0 315 | whole_token = 0 316 | random.shuffle(data.train_Ids) 317 | ## set model in train model 318 | model.train() 319 | model.zero_grad() 320 | batch_size = data.HP_batch_size 321 | batch_id = 0 322 | train_num = len(data.train_Ids) 323 | total_batch = train_num//batch_size+1 324 | for batch_id in range(total_batch): 325 | start = batch_id*batch_size 326 | end = (batch_id+1)*batch_size 327 | if end >train_num: 328 | end = train_num 329 | instance = data.train_Ids[start:end] 330 | if not instance: 331 | continue 332 | batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu) 333 | instance_count += 1 334 | loss, tag_seq = model.neg_log_likelihood_loss(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) 335 | right, whole = predict_check(tag_seq, batch_label, mask) 336 | right_token += right 337 | whole_token += whole 338 | sample_loss += loss.data[0] 339 | total_loss += loss.data[0] 340 | if end%500 == 0: 341 | temp_time = time.time() 342 | temp_cost = temp_time - temp_start 343 | temp_start = temp_time 344 | print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token)) 345 | sys.stdout.flush() 346 | sample_loss = 0 347 | loss.backward() 348 | optimizer.step() 349 | model.zero_grad() 350 | temp_time = time.time() 351 | temp_cost = temp_time - temp_start 352 | print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, 
whole_token,(right_token+0.)/whole_token)) 353 | epoch_finish = time.time() 354 | epoch_cost = epoch_finish - epoch_start 355 | print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss)) 356 | # continue 357 | speed, acc, p, r, f, _,_ = evaluate(data, model, "dev") 358 | dev_finish = time.time() 359 | dev_cost = dev_finish - epoch_finish 360 | 361 | if data.seg: 362 | current_score = f 363 | print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f)) 364 | else: 365 | current_score = acc 366 | print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc)) 367 | 368 | if current_score > best_dev: 369 | if data.seg: 370 | print("Exceed previous best f score:", best_dev) 371 | else: 372 | print("Exceed previous best acc score:", best_dev) 373 | model_name = data.model_dir +'.'+ str(idx) + ".model" 374 | print("Save current best model in file:", model_name) 375 | torch.save(model.state_dict(), model_name) 376 | best_dev = current_score 377 | # ## decode test 378 | speed, acc, p, r, f, _,_ = evaluate(data, model, "test") 379 | test_finish = time.time() 380 | test_cost = test_finish - dev_finish 381 | if data.seg: 382 | print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f)) 383 | else: 384 | print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc)) 385 | gc.collect() 386 | 387 | 388 | def load_model_decode(data, name): 389 | print("Load Model from file: ", data.model_dir) 390 | model = SeqLabel(data) 391 | ## load model need consider if the model trained in GPU and load in CPU, or vice versa 392 | # if not gpu: 393 | # model.load_state_dict(torch.load(model_dir)) 394 | # # model.load_state_dict(torch.load(model_dir), map_location=lambda storage, loc: storage) 395 | # # model = torch.load(model_dir, map_location=lambda storage, loc: storage) 396 | # else: 397 | # model.load_state_dict(torch.load(model_dir)) 398 | # # model = torch.load(model_dir) 399 | model.load_state_dict(torch.load(data.load_model_dir)) 400 | 401 | print("Decode %s data, nbest: %s ..."%(name, data.nbest)) 402 | start_time = time.time() 403 | speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, name, data.nbest) 404 | end_time = time.time() 405 | time_cost = end_time - start_time 406 | if data.seg: 407 | print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(name, time_cost, speed, acc, p, r, f)) 408 | else: 409 | print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, time_cost, speed, acc)) 410 | return pred_results, pred_scores 411 | 412 | 413 | 414 | 415 | if __name__ == '__main__': 416 | parser = argparse.ArgumentParser(description='Tuning with NCRF++') 417 | parser.add_argument('--wordemb', help='Embedding for words', default='None') 418 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 419 | parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train') 420 | parser.add_argument('--savemodel', default="data/model/saved_model.lstmcrf.") 421 | parser.add_argument('--savedset', help='Dir of saved data setting') 422 | parser.add_argument('--train', default="data/conll03/train.bmes") 423 | parser.add_argument('--dev', default="data/conll03/dev.bmes" ) 424 | parser.add_argument('--test', default="data/conll03/test.bmes") 425 | parser.add_argument('--seg', default="True") 426 | 
parser.add_argument('--raw') 427 | parser.add_argument('--loadmodel') 428 | parser.add_argument('--output') 429 | args = parser.parse_args() 430 | data = Data() 431 | 432 | data.train_dir = args.train 433 | data.dev_dir = args.dev 434 | data.test_dir = args.test 435 | data.model_dir = args.savemodel 436 | data.dset_dir = args.savedset 437 | print("dset directory:",data.dset_dir) 438 | status = args.status.lower() 439 | save_model_dir = args.savemodel 440 | data.HP_gpu = torch.cuda.is_available() 441 | print("Seed num:",seed_num) 442 | data.number_normalized = True 443 | data.word_emb_dir = "../data/glove.6B.100d.txt" 444 | 445 | if status == 'train': 446 | print("MODEL: train") 447 | data_initialization(data) 448 | data.use_char = True 449 | data.HP_batch_size = 10 450 | data.HP_lr = 0.015 451 | data.char_seq_feature = "CNN" 452 | data.generate_instance('train') 453 | data.generate_instance('dev') 454 | data.generate_instance('test') 455 | data.build_pretrain_emb() 456 | train(data) 457 | elif status == 'decode': 458 | print("MODEL: decode") 459 | data.load(data.dset_dir) 460 | data.raw_dir = args.raw 461 | data.decode_dir = args.output 462 | data.load_model_dir = args.loadmodel 463 | data.show_data_summary() 464 | data.generate_instance('raw') 465 | print("nbest: %s"%(data.nbest)) 466 | decode_results, pred_scores = load_model_decode(data, 'raw') 467 | if data.nbest: 468 | data.write_nbest_decoded_results(decode_results, pred_scores, 'raw') 469 | else: 470 | data.write_decoded_results(decode_results, 'raw') 471 | else: 472 | print("Invalid argument! Please use valid arguments! (train/test/decode)") 473 | 474 | 475 | 476 | 477 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | -------------------------------------------------------------------------------- /model/charbigru.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-10-18 11:12:13 6 | from __future__ import print_function 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 10 | import numpy as np 11 | 12 | class CharBiGRU(nn.Module): 13 | def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu, bidirect_flag = True): 14 | super(CharBiGRU, self).__init__() 15 | print("build char sequence feature extractor: GRU ...") 16 | self.gpu = gpu 17 | self.hidden_dim = hidden_dim 18 | if bidirect_flag: 19 | self.hidden_dim = hidden_dim // 2 20 | self.char_drop = nn.Dropout(dropout) 21 | self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim) 22 | if pretrain_char_embedding is not None: 23 | self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding)) 24 | else: 25 | self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim))) 26 | self.char_lstm = nn.GRU(embedding_dim, self.hidden_dim, num_layers=1, batch_first=True, bidirectional=bidirect_flag) 27 | if self.gpu: 28 | self.char_drop = self.char_drop.cuda() 29 | self.char_embeddings = self.char_embeddings.cuda() 30 | self.char_lstm = self.char_lstm.cuda() 31 | 32 | 33 | def random_embedding(self, vocab_size, embedding_dim): 34 | pretrain_emb 
= np.empty([vocab_size, embedding_dim]) 35 | scale = np.sqrt(3.0 / embedding_dim) 36 | for index in range(vocab_size): 37 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim]) 38 | return pretrain_emb 39 | 40 | 41 | def get_last_hiddens(self, input, seq_lengths): 42 | """ 43 | input: 44 | input: Variable(batch_size, word_length) 45 | seq_lengths: numpy array (batch_size, 1) 46 | output: 47 | Variable(batch_size, char_hidden_dim) 48 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 49 | """ 50 | batch_size = input.size(0) 51 | char_embeds = self.char_drop(self.char_embeddings(input)) 52 | char_hidden = None 53 | pack_input = pack_padded_sequence(char_embeds, seq_lengths, True) 54 | char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden) 55 | # char_rnn_out, _ = pad_packed_sequence(char_rnn_out) 56 | return char_hidden.transpose(1,0).contiguous().view(batch_size,-1) 57 | 58 | def get_all_hiddens(self, input, seq_lengths): 59 | """ 60 | input: 61 | input: Variable(batch_size, word_length) 62 | seq_lengths: numpy array (batch_size, 1) 63 | output: 64 | Variable(batch_size, word_length, char_hidden_dim) 65 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 66 | """ 67 | batch_size = input.size(0) 68 | char_embeds = self.char_drop(self.char_embeddings(input)) 69 | char_hidden = None 70 | pack_input = pack_padded_sequence(char_embeds, seq_lengths, True) 71 | char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden) 72 | char_rnn_out, _ = pad_packed_sequence(char_rnn_out) 73 | return char_rnn_out.transpose(1,0) 74 | 75 | 76 | def forward(self, input, seq_lengths): 77 | return self.get_all_hiddens(input, seq_lengths) 78 | -------------------------------------------------------------------------------- /model/charbilstm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-10-18 11:19:37 6 | from __future__ import print_function 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 10 | import numpy as np 11 | 12 | class CharBiLSTM(nn.Module): 13 | def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu, bidirect_flag = True): 14 | super(CharBiLSTM, self).__init__() 15 | print("build char sequence feature extractor: LSTM ...") 16 | self.gpu = gpu 17 | self.hidden_dim = hidden_dim 18 | if bidirect_flag: 19 | self.hidden_dim = hidden_dim // 2 20 | self.char_drop = nn.Dropout(dropout) 21 | self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim) 22 | if pretrain_char_embedding is not None: 23 | self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding)) 24 | else: 25 | self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim))) 26 | self.char_lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=1, batch_first=True, bidirectional=bidirect_flag) 27 | if self.gpu: 28 | self.char_drop = self.char_drop.cuda() 29 | self.char_embeddings = self.char_embeddings.cuda() 30 | self.char_lstm = self.char_lstm.cuda() 31 | 32 | 33 | def random_embedding(self, vocab_size, embedding_dim): 34 | pretrain_emb = np.empty([vocab_size, embedding_dim]) 35 | scale = np.sqrt(3.0 / embedding_dim) 36 | for index 
in range(vocab_size): 37 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim]) 38 | return pretrain_emb 39 | 40 | 41 | def get_last_hiddens(self, input, seq_lengths): 42 | """ 43 | input: 44 | input: Variable(batch_size, word_length) 45 | seq_lengths: numpy array (batch_size, 1) 46 | output: 47 | Variable(batch_size, char_hidden_dim) 48 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 49 | """ 50 | batch_size = input.size(0) 51 | char_embeds = self.char_drop(self.char_embeddings(input)) 52 | char_hidden = None 53 | pack_input = pack_padded_sequence(char_embeds, seq_lengths, True) 54 | char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden) 55 | ## char_hidden = (h_t, c_t) 56 | # char_hidden[0] = h_t = (2, batch_size, lstm_dimension) 57 | # char_rnn_out, _ = pad_packed_sequence(char_rnn_out) 58 | return char_hidden[0].transpose(1,0).contiguous().view(batch_size,-1) 59 | 60 | def get_all_hiddens(self, input, seq_lengths): 61 | """ 62 | input: 63 | input: Variable(batch_size, word_length) 64 | seq_lengths: numpy array (batch_size, 1) 65 | output: 66 | Variable(batch_size, word_length, char_hidden_dim) 67 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 68 | """ 69 | batch_size = input.size(0) 70 | char_embeds = self.char_drop(self.char_embeddings(input)) 71 | char_hidden = None 72 | pack_input = pack_padded_sequence(char_embeds, seq_lengths, True) 73 | char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden) 74 | char_rnn_out, _ = pad_packed_sequence(char_rnn_out) 75 | return char_rnn_out.transpose(1,0) 76 | 77 | 78 | def forward(self, input, seq_lengths): 79 | return self.get_all_hiddens(input, seq_lengths) 80 | -------------------------------------------------------------------------------- /model/charcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-01-18 21:06:06 6 | from __future__ import print_function 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import numpy as np 11 | 12 | class CharCNN(nn.Module): 13 | def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu): 14 | super(CharCNN, self).__init__() 15 | print("build char sequence feature extractor: CNN ...") 16 | self.gpu = gpu 17 | self.hidden_dim = hidden_dim 18 | self.char_drop = nn.Dropout(dropout) 19 | self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim) 20 | if pretrain_char_embedding is not None: 21 | self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding)) 22 | else: 23 | self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim))) 24 | self.char_cnn = nn.Conv1d(embedding_dim, self.hidden_dim, kernel_size=3, padding=1) 25 | if self.gpu: 26 | self.char_drop = self.char_drop.cuda() 27 | self.char_embeddings = self.char_embeddings.cuda() 28 | self.char_cnn = self.char_cnn.cuda() 29 | 30 | 31 | def random_embedding(self, vocab_size, embedding_dim): 32 | pretrain_emb = np.empty([vocab_size, embedding_dim]) 33 | scale = np.sqrt(3.0 / embedding_dim) 34 | for index in range(vocab_size): 35 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim]) 36 | return pretrain_emb 37 | 38 | 39 | def 
get_last_hiddens(self, input, seq_lengths): 40 | """ 41 | input: 42 | input: Variable(batch_size, word_length) 43 | seq_lengths: numpy array (batch_size, 1) 44 | output: 45 | Variable(batch_size, char_hidden_dim) 46 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 47 | """ 48 | batch_size = input.size(0) 49 | char_embeds = self.char_drop(self.char_embeddings(input)) 50 | char_embeds = char_embeds.transpose(2,1).contiguous() 51 | char_cnn_out = self.char_cnn(char_embeds) 52 | char_cnn_out = F.max_pool1d(char_cnn_out, char_cnn_out.size(2)).view(batch_size, -1) 53 | return char_cnn_out 54 | 55 | def get_all_hiddens(self, input, seq_lengths): 56 | """ 57 | input: 58 | input: Variable(batch_size, word_length) 59 | seq_lengths: numpy array (batch_size, 1) 60 | output: 61 | Variable(batch_size, word_length, char_hidden_dim) 62 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 63 | """ 64 | batch_size = input.size(0) 65 | char_embeds = self.char_drop(self.char_embeddings(input)) 66 | char_embeds = char_embeds.transpose(2,1).contiguous() 67 | char_cnn_out = self.char_cnn(char_embeds).transpose(2,1).contiguous() 68 | return char_cnn_out 69 | 70 | 71 | 72 | def forward(self, input, seq_lengths): 73 | return self.get_all_hiddens(input, seq_lengths) 74 | -------------------------------------------------------------------------------- /model/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-12-04 23:19:38 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-12-16 22:15:56 6 | from __future__ import print_function 7 | import torch 8 | import torch.autograd as autograd 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | START_TAG = -2 12 | STOP_TAG = -1 13 | 14 | 15 | # Compute log sum exp in a numerically stable way for the forward algorithm 16 | def log_sum_exp(vec, m_size): 17 | """ 18 | calculate log of exp sum 19 | args: 20 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 21 | m_size : hidden_dim 22 | return: 23 | batch_size, hidden_dim 24 | """ 25 | _, idx = torch.max(vec, 1) # B * 1 * M 26 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 27 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, m_size) # B * M 28 | 29 | class CRF(nn.Module): 30 | 31 | def __init__(self, tagset_size, gpu): 32 | super(CRF, self).__init__() 33 | print("build CRF...") 34 | self.gpu = gpu 35 | # Matrix of transition parameters. Entry i,j is the score of transitioning from i to j. 
36 | self.tagset_size = tagset_size 37 | # # We add 2 here, because of START_TAG and STOP_TAG 38 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 39 | init_transitions = torch.zeros(self.tagset_size+2, self.tagset_size+2) 40 | init_transitions[:,START_TAG] = -10000.0 41 | init_transitions[STOP_TAG,:] = -10000.0 42 | init_transitions[:,0] = -10000.0 43 | init_transitions[0,:] = -10000.0 44 | if self.gpu: 45 | init_transitions = init_transitions.cuda() 46 | self.transitions = nn.Parameter(init_transitions) 47 | 48 | # self.transitions = nn.Parameter(torch.Tensor(self.tagset_size+2, self.tagset_size+2)) 49 | # self.transitions.data.zero_() 50 | 51 | def _calculate_PZ(self, feats, mask): 52 | """ 53 | input: 54 | feats: (batch, seq_len, self.tag_size+2) 55 | masks: (batch, seq_len) 56 | """ 57 | batch_size = feats.size(0) 58 | seq_len = feats.size(1) 59 | tag_size = feats.size(2) 60 | # print feats.view(seq_len, tag_size) 61 | assert(tag_size == self.tagset_size+2) 62 | mask = mask.transpose(1,0).contiguous() 63 | ins_num = seq_len * batch_size 64 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 65 | feats = feats.transpose(1,0).contiguous().view(ins_num,1, tag_size).expand(ins_num, tag_size, tag_size) 66 | ## need to consider start 67 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 68 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 69 | # build iter 70 | seq_iter = enumerate(scores) 71 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 72 | # only need start from start_tag 73 | partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 74 | 75 | ## add start score (from start to all tag, duplicate to batch_size) 76 | # partition = partition + self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 77 | # iter over last scores 78 | for idx, cur_values in seq_iter: 79 | # previous to_target is current from_target 80 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 81 | # cur_values: bat_size * from_target * to_target 82 | 83 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 84 | cur_partition = log_sum_exp(cur_values, tag_size) 85 | # print cur_partition.data 86 | 87 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 88 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 89 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 90 | 91 | ## effective updated partition part, only keep the partition value of mask value = 1 92 | masked_cur_partition = cur_partition.masked_select(mask_idx) 93 | ## let mask_idx broadcastable, to disable warning 94 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 95 | 96 | ## replace the partition where the maskvalue=1, other partition value keeps the same 97 | partition.masked_scatter_(mask_idx, masked_cur_partition) 98 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 99 | cur_values = self.transitions.view(1,tag_size, tag_size).expand(batch_size, tag_size, tag_size) + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 100 | cur_partition = log_sum_exp(cur_values, 
tag_size) 101 | final_partition = cur_partition[:, STOP_TAG] 102 | return final_partition.sum(), scores 103 | 104 | 105 | def _viterbi_decode(self, feats, mask): 106 | """ 107 | input: 108 | feats: (batch, seq_len, self.tag_size+2) 109 | mask: (batch, seq_len) 110 | output: 111 | decode_idx: (batch, seq_len) decoded sequence 112 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 113 | """ 114 | batch_size = feats.size(0) 115 | seq_len = feats.size(1) 116 | tag_size = feats.size(2) 117 | assert(tag_size == self.tagset_size+2) 118 | ## calculate sentence length for each sentence 119 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 120 | ## mask to (seq_len, batch_size) 121 | mask = mask.transpose(1,0).contiguous() 122 | ins_num = seq_len * batch_size 123 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 124 | feats = feats.transpose(1,0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 125 | ## need to consider start 126 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 127 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 128 | 129 | # build iter 130 | seq_iter = enumerate(scores) 131 | ## record the position of best score 132 | back_points = list() 133 | partition_history = list() 134 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 135 | # mask = 1 + (-1)*mask 136 | mask = (1 - mask.long()).bool() 137 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 138 | # only need start from start_tag 139 | partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 140 | # print "init part:",partition.size() 141 | partition_history.append(partition) 142 | # iter over last scores 143 | for idx, cur_values in seq_iter: 144 | # previous to_target is current from_target 145 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 146 | # cur_values: batch_size * from_target * to_target 147 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 148 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 149 | # print "cur value:", cur_values.size() 150 | partition, cur_bp = torch.max(cur_values, 1) 151 | # print "partsize:",partition.size() 152 | # exit(0) 153 | # print partition 154 | # print cur_bp 155 | # print "one best, ",idx 156 | partition_history.append(partition) 157 | ## cur_bp: (batch_size, tag_size) max source score position in current tag 158 | ## set padded label as 0, which will be filtered in post processing 159 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 160 | back_points.append(cur_bp) 161 | # exit(0) 162 | ### add score to final STOP_TAG 163 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1,0).contiguous() ## (batch_size, seq_len. 
tag_size) 164 | ### get the last position for each setences, and select the last partitions using gather() 165 | last_position = length_mask.view(batch_size,1,1).expand(batch_size, 1, tag_size) -1 166 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size,tag_size,1) 167 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 168 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1,tag_size, tag_size).expand(batch_size, tag_size, tag_size) 169 | _, last_bp = torch.max(last_values, 1) 170 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 171 | if self.gpu: 172 | pad_zero = pad_zero.cuda() 173 | back_points.append(pad_zero) 174 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 175 | 176 | ## select end ids in STOP_TAG 177 | pointer = last_bp[:, STOP_TAG] 178 | insert_last = pointer.contiguous().view(batch_size,1,1).expand(batch_size,1, tag_size) 179 | back_points = back_points.transpose(1,0).contiguous() 180 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 181 | # print "lp:",last_position 182 | # print "il:",insert_last 183 | back_points.scatter_(1, last_position, insert_last) 184 | # print "bp:",back_points 185 | # exit(0) 186 | back_points = back_points.transpose(1,0).contiguous() 187 | ## decode from the end, padded position ids are 0, which will be filtered if following evaluation 188 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 189 | if self.gpu: 190 | decode_idx = decode_idx.cuda() 191 | decode_idx[-1] = pointer.detach() 192 | for idx in range(len(back_points)-2, -1, -1): 193 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 194 | decode_idx[idx] = pointer.detach().view(batch_size) 195 | path_score = None 196 | decode_idx = decode_idx.transpose(1,0) 197 | return path_score, decode_idx 198 | 199 | 200 | 201 | def forward(self, feats): 202 | path_score, best_path = self._viterbi_decode(feats) 203 | return path_score, best_path 204 | 205 | 206 | def _score_sentence(self, scores, mask, tags): 207 | """ 208 | input: 209 | scores: variable (seq_len, batch, tag_size, tag_size) 210 | mask: (batch, seq_len) 211 | tags: tensor (batch, seq_len) 212 | output: 213 | score: sum of score for gold sequences within whole batch 214 | """ 215 | # Gives the score of a provided tag sequence 216 | batch_size = scores.size(1) 217 | seq_len = scores.size(0) 218 | tag_size = scores.size(2) 219 | ## convert tag value into a new format, recorded label bigram information to index 220 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 221 | if self.gpu: 222 | new_tags = new_tags.cuda() 223 | for idx in range(seq_len): 224 | if idx == 0: 225 | ## start -> first score 226 | new_tags[:,0] = (tag_size - 2)*tag_size + tags[:,0] 227 | 228 | else: 229 | new_tags[:,idx] = tags[:,idx-1]*tag_size + tags[:,idx] 230 | 231 | ## transition for label to STOP_TAG 232 | end_transition = self.transitions[:,STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 233 | ## length for batch, last word position = length - 1 234 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 235 | ## index the label id of last word 236 | end_ids = torch.gather(tags, 1, length_mask - 1) 237 | 238 | ## index the transition score for end_id to STOP_TAG 239 | end_energy = torch.gather(end_transition, 1, end_ids) 
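        # new_tags encodes each gold bigram (previous_tag, current_tag) as the single index
        # previous_tag * tag_size + current_tag, so the gather over the flattened
        # (tag_size * tag_size) score matrix below picks out emission + transition for every position;
        # end_energy above adds the transition from the last gold label of each sentence to STOP_TAG.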
240 | 241 | ## convert tag as (seq_len, batch_size, 1) 242 | new_tags = new_tags.transpose(1,0).contiguous().view(seq_len, batch_size, 1) 243 | ### need convert tags id to search from 400 positions of scores 244 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, batch_size) # seq_len * bat_size 245 | ## mask transpose to (seq_len, batch_size) 246 | tg_energy = tg_energy.masked_select(mask.transpose(1,0)) 247 | 248 | # ## calculate the score from START_TAG to first label 249 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 250 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 251 | 252 | ## add all score together 253 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 254 | gold_score = tg_energy.sum() + end_energy.sum() 255 | return gold_score 256 | 257 | def neg_log_likelihood_loss(self, feats, mask, tags): 258 | # nonegative log likelihood 259 | batch_size = feats.size(0) 260 | forward_score, scores = self._calculate_PZ(feats, mask) 261 | gold_score = self._score_sentence(scores, mask, tags) 262 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 263 | # exit(0) 264 | return forward_score - gold_score 265 | 266 | 267 | 268 | def _viterbi_decode_nbest(self, feats, mask, nbest): 269 | """ 270 | input: 271 | feats: (batch, seq_len, self.tag_size+2) 272 | mask: (batch, seq_len) 273 | output: 274 | decode_idx: (batch, nbest, seq_len) decoded sequence 275 | path_score: (batch, nbest) corresponding score for each sequence (to be implementated) 276 | nbest decode for sentence with one token is not well supported, to be optimized 277 | """ 278 | batch_size = feats.size(0) 279 | seq_len = feats.size(1) 280 | tag_size = feats.size(2) 281 | assert(tag_size == self.tagset_size+2) 282 | ## calculate sentence length for each sentence 283 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 284 | ## mask to (seq_len, batch_size) 285 | mask = mask.transpose(1,0).contiguous() 286 | ins_num = seq_len * batch_size 287 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 288 | feats = feats.transpose(1,0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 289 | ## need to consider start 290 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 291 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 292 | 293 | # build iter 294 | seq_iter = enumerate(scores) 295 | ## record the position of best score 296 | back_points = list() 297 | partition_history = list() 298 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 299 | # mask = 1 + (-1)*mask 300 | mask = (1 - mask.long()).bool() 301 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 302 | # only need start from start_tag 303 | partition = inivalues[:, START_TAG, :].clone() # bat_size * to_target_size 304 | ## initial partition [batch_size, tag_size] 305 | partition_history.append(partition.view(batch_size, tag_size, 1).expand(batch_size, tag_size, nbest)) 306 | # iter over last scores 307 | for idx, cur_values in seq_iter: 308 | if idx == 1: 309 | cur_values = cur_values.view(batch_size, tag_size, tag_size) + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 310 | else: 311 | # previous to_target is current from_target 312 
| # partition: previous results log(exp(from_target)), #(batch_size * nbest * from_target) 313 | # cur_values: batch_size * from_target * to_target 314 | cur_values = cur_values.view(batch_size, tag_size, 1, tag_size).expand(batch_size, tag_size, nbest, tag_size) + partition.contiguous().view(batch_size, tag_size, nbest, 1).expand(batch_size, tag_size, nbest, tag_size) 315 | ## compare all nbest and all from target 316 | cur_values = cur_values.view(batch_size, tag_size*nbest, tag_size) 317 | # print "cur size:",cur_values.size() 318 | partition, cur_bp = torch.topk(cur_values, nbest, 1) 319 | ## cur_bp/partition: [batch_size, nbest, tag_size], id should be normize through nbest in following backtrace step 320 | # print partition[:,0,:] 321 | # print cur_bp[:,0,:] 322 | # print "nbest, ",idx 323 | if idx == 1: 324 | cur_bp = cur_bp*nbest 325 | partition = partition.transpose(2,1) 326 | cur_bp = cur_bp.transpose(2,1) 327 | 328 | # print partition 329 | # exit(0) 330 | #partition: (batch_size * to_target * nbest) 331 | #cur_bp: (batch_size * to_target * nbest) Notice the cur_bp number is the whole position of tag_size*nbest, need to convert when decode 332 | partition_history.append(partition) 333 | ## cur_bp: (batch_size,nbest, tag_size) topn source score position in current tag 334 | ## set padded label as 0, which will be filtered in post processing 335 | ## mask[idx] ? mask[idx-1] 336 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1, 1).expand(batch_size, tag_size, nbest), 0) 337 | # print cur_bp[0] 338 | back_points.append(cur_bp) 339 | ### add score to final STOP_TAG 340 | partition_history = torch.cat(partition_history,0).view(seq_len, batch_size, tag_size, nbest).transpose(1,0).contiguous() ## (batch_size, seq_len, nbest, tag_size) 341 | ### get the last position for each setences, and select the last partitions using gather() 342 | last_position = length_mask.view(batch_size,1,1,1).expand(batch_size, 1, tag_size, nbest) - 1 343 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, nbest, 1) 344 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 345 | last_values = last_partition.expand(batch_size, tag_size, nbest, tag_size) + self.transitions.view(1, tag_size, 1, tag_size).expand(batch_size, tag_size, nbest, tag_size) 346 | last_values = last_values.view(batch_size, tag_size*nbest, tag_size) 347 | end_partition, end_bp = torch.topk(last_values, nbest, 1) 348 | ## end_partition: (batch, nbest, tag_size) 349 | end_bp = end_bp.transpose(2,1) 350 | # end_bp: (batch, tag_size, nbest) 351 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size, nbest)).long() 352 | if self.gpu: 353 | pad_zero = pad_zero.cuda() 354 | back_points.append(pad_zero) 355 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size, nbest) 356 | 357 | ## select end ids in STOP_TAG 358 | pointer = end_bp[:, STOP_TAG, :] ## (batch_size, nbest) 359 | insert_last = pointer.contiguous().view(batch_size, 1, 1, nbest).expand(batch_size, 1, tag_size, nbest) 360 | back_points = back_points.transpose(1,0).contiguous() 361 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 362 | # print "lp:",last_position 363 | # print "il:",insert_last[0] 364 | # exit(0) 365 | ## copy the ids of last position:insert_last to back_points, though the last_position index 366 | ## last_position includes the length of batch sentences 367 | # print "old:", 
back_points[9,0,:,:] 368 | back_points.scatter_(1, last_position, insert_last) 369 | ## back_points: [batch_size, seq_length, tag_size, nbest] 370 | # print "new:", back_points[9,0,:,:] 371 | # exit(0) 372 | # print pointer[2] 373 | ''' 374 | back_points: in simple demonstratration 375 | x,x,x,x,x,x,x,x,x,7 376 | x,x,x,x,x,4,0,0,0,0 377 | x,x,6,0,0,0,0,0,0,0 378 | ''' 379 | 380 | back_points = back_points.transpose(1,0).contiguous() 381 | # print back_points[0] 382 | ## back_points: (seq_len, batch, tag_size, nbest) 383 | ## decode from the end, padded position ids are 0, which will be filtered in following evaluation 384 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size, nbest)) 385 | if self.gpu: 386 | decode_idx = decode_idx.cuda() 387 | decode_idx[-1] = pointer.data/nbest 388 | # print "pointer-1:",pointer[2] 389 | # exit(0) 390 | # use old mask, let 0 means has token 391 | for idx in range(len(back_points)-2, -1, -1): 392 | # print "pointer: ",idx, pointer[3] 393 | # print "back:",back_points[idx][3] 394 | # print "mask:",mask[idx+1,3] 395 | new_pointer = torch.gather(back_points[idx].view(batch_size, tag_size*nbest), 1, pointer.contiguous().view(batch_size,nbest)) 396 | decode_idx[idx] = new_pointer.data/nbest 397 | # # use new pointer to remember the last end nbest ids for non longest 398 | pointer = new_pointer + pointer.contiguous().view(batch_size,nbest)*mask[idx].view(batch_size,1).expand(batch_size, nbest).long() 399 | 400 | # exit(0) 401 | path_score = None 402 | decode_idx = decode_idx.transpose(1,0) 403 | ## decode_idx: [batch, seq_len, nbest] 404 | # print decode_idx[:,:,0] 405 | # print "nbest:",nbest 406 | # print "diff:", decode_idx[:,:,0]- decode_idx[:,:,4] 407 | # print decode_idx[:,0,:] 408 | # exit(0) 409 | 410 | ### calculate probability for each sequence 411 | scores = end_partition[:, :, STOP_TAG] 412 | ## scores: [batch_size, nbest] 413 | max_scores,_ = torch.max(scores, 1) 414 | minus_scores = scores - max_scores.view(batch_size,1).expand(batch_size, nbest) 415 | path_score = F.softmax(minus_scores, 1) 416 | ## path_score: [batch_size, nbest] 417 | # exit(0) 418 | return path_score, decode_idx 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | -------------------------------------------------------------------------------- /model/sentclassifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2019-01-01 21:11:50 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-13 12:30:56 6 | 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from .wordsequence import WordSequence 13 | 14 | class SentClassifier(nn.Module): 15 | def __init__(self, data): 16 | super(SentClassifier, self).__init__() 17 | print("build sentence classification network...") 18 | print("use_char: ", data.use_char) 19 | if data.use_char: 20 | print("char feature extractor: ", data.char_feature_extractor) 21 | print("word feature extractor: ", data.word_feature_extractor) 22 | 23 | self.gpu = data.HP_gpu 24 | self.average_batch = data.average_batch_loss 25 | label_size = data.label_alphabet_size 26 | self.word_hidden = WordSequence(data) 27 | 28 | 29 | 30 | def calculate_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, 
char_seq_lengths, char_seq_recover, batch_label, mask): 31 | outs = self.word_hidden.sentence_representation(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 32 | batch_size = word_inputs.size(0) 33 | # loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='sum') 34 | outs = outs.view(batch_size, -1) 35 | # print("a",outs) 36 | # score = F.log_softmax(outs, 1) 37 | # print(score.size(), batch_label.view(batch_size).size()) 38 | # print(score) 39 | # print(batch_label) 40 | # exit(0) 41 | total_loss = F.cross_entropy(outs, batch_label.view(batch_size)) 42 | # total_loss = loss_function(score, batch_label.view(batch_size)) 43 | 44 | _, tag_seq = torch.max(outs, 1) 45 | if self.average_batch: 46 | total_loss = total_loss / batch_size 47 | return total_loss, tag_seq 48 | 49 | 50 | def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): 51 | outs = self.word_hidden.sentence_representation(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 52 | batch_size = word_inputs.size(0) 53 | outs = outs.view(batch_size, -1) 54 | _, tag_seq = torch.max(outs, 1) 55 | # if a == 0: 56 | # print(tag_seq) 57 | return tag_seq 58 | 59 | 60 | -------------------------------------------------------------------------------- /model/seqlabel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-13 11:49:38 6 | 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from .wordsequence import WordSequence 13 | from .crf import CRF 14 | 15 | class SeqLabel(nn.Module): 16 | def __init__(self, data): 17 | super(SeqLabel, self).__init__() 18 | self.use_crf = data.use_crf 19 | print("build sequence labeling network...") 20 | print("use_char: ", data.use_char) 21 | if data.use_char: 22 | print("char feature extractor: ", data.char_feature_extractor) 23 | print("word feature extractor: ", data.word_feature_extractor) 24 | print("use crf: ", self.use_crf) 25 | 26 | self.gpu = data.HP_gpu 27 | self.average_batch = data.average_batch_loss 28 | ## add two more label for downlayer lstm, use original label size for CRF 29 | label_size = data.label_alphabet_size 30 | data.label_alphabet_size += 2 31 | self.word_hidden = WordSequence(data) 32 | if self.use_crf: 33 | self.crf = CRF(label_size, self.gpu) 34 | 35 | 36 | def calculate_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask): 37 | outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 38 | batch_size = word_inputs.size(0) 39 | seq_len = word_inputs.size(1) 40 | if self.use_crf: 41 | total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label) 42 | scores, tag_seq = self.crf._viterbi_decode(outs, mask) 43 | else: 44 | loss_function = nn.NLLLoss(ignore_index=0, size_average=False) 45 | outs = outs.view(batch_size * seq_len, -1) 46 | score = F.log_softmax(outs, 1) 47 | total_loss = loss_function(score, batch_label.view(batch_size * seq_len)) 48 | _, tag_seq = torch.max(score, 1) 49 | tag_seq = tag_seq.view(batch_size, seq_len) 50 | if self.average_batch: 
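            # ave_batch_loss=True in the config: normalize the summed loss by the batch size so the
            # gradient scale stays roughly independent of the batch size.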
51 | total_loss = total_loss / batch_size 52 | return total_loss, tag_seq 53 | 54 | 55 | def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): 56 | outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 57 | batch_size = word_inputs.size(0) 58 | seq_len = word_inputs.size(1) 59 | if self.use_crf: 60 | scores, tag_seq = self.crf._viterbi_decode(outs, mask) 61 | else: 62 | outs = outs.view(batch_size * seq_len, -1) 63 | _, tag_seq = torch.max(outs, 1) 64 | tag_seq = tag_seq.view(batch_size, seq_len) 65 | ## filter padded position with zero 66 | tag_seq = mask.long() * tag_seq 67 | return tag_seq 68 | 69 | 70 | # def get_lstm_features(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 71 | # return self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 72 | 73 | 74 | def decode_nbest(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, nbest): 75 | if not self.use_crf: 76 | print("Nbest output is currently supported only for CRF! Exit...") 77 | exit(0) 78 | outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 79 | batch_size = word_inputs.size(0) 80 | seq_len = word_inputs.size(1) 81 | scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest) 82 | return scores, tag_seq 83 | 84 | -------------------------------------------------------------------------------- /model/wordrep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-01 15:52:01 6 | from __future__ import print_function 7 | from __future__ import absolute_import 8 | import torch 9 | import torch.nn as nn 10 | import numpy as np 11 | from .charbilstm import CharBiLSTM 12 | from .charbigru import CharBiGRU 13 | from .charcnn import CharCNN 14 | 15 | class WordRep(nn.Module): 16 | def __init__(self, data): 17 | super(WordRep, self).__init__() 18 | print("build word representation...") 19 | self.gpu = data.HP_gpu 20 | self.use_char = data.use_char 21 | self.batch_size = data.HP_batch_size 22 | self.char_hidden_dim = 0 23 | self.char_all_feature = False 24 | self.sentence_classification = data.sentence_classification 25 | if self.use_char: 26 | self.char_hidden_dim = data.HP_char_hidden_dim 27 | self.char_embedding_dim = data.char_emb_dim 28 | if data.char_feature_extractor == "CNN": 29 | self.char_feature = CharCNN(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, data.HP_dropout, self.gpu) 30 | elif data.char_feature_extractor == "LSTM": 31 | self.char_feature = CharBiLSTM(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, data.HP_dropout, self.gpu) 32 | elif data.char_feature_extractor == "GRU": 33 | self.char_feature = CharBiGRU(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, data.HP_dropout, self.gpu) 34 | elif data.char_feature_extractor == "ALL": 35 | self.char_all_feature = True 36 | self.char_feature = CharCNN(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, 
data.HP_dropout, self.gpu) 37 | self.char_feature_extra = CharBiLSTM(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, data.HP_dropout, self.gpu) 38 | else: 39 | print("Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM/GRU/ALL).") 40 | exit(0) 41 | self.embedding_dim = data.word_emb_dim 42 | self.drop = nn.Dropout(data.HP_dropout) 43 | self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.embedding_dim) 44 | if data.pretrain_word_embedding is not None: 45 | self.word_embedding.weight.data.copy_(torch.from_numpy(data.pretrain_word_embedding)) 46 | else: 47 | self.word_embedding.weight.data.copy_(torch.from_numpy(self.random_embedding(data.word_alphabet.size(), self.embedding_dim))) 48 | 49 | self.feature_num = data.feature_num 50 | self.feature_embedding_dims = data.feature_emb_dims 51 | self.feature_embeddings = nn.ModuleList() 52 | for idx in range(self.feature_num): 53 | self.feature_embeddings.append(nn.Embedding(data.feature_alphabets[idx].size(), self.feature_embedding_dims[idx])) 54 | for idx in range(self.feature_num): 55 | if data.pretrain_feature_embeddings[idx] is not None: 56 | self.feature_embeddings[idx].weight.data.copy_(torch.from_numpy(data.pretrain_feature_embeddings[idx])) 57 | else: 58 | self.feature_embeddings[idx].weight.data.copy_(torch.from_numpy(self.random_embedding(data.feature_alphabets[idx].size(), self.feature_embedding_dims[idx]))) 59 | 60 | if self.gpu: 61 | self.drop = self.drop.cuda() 62 | self.word_embedding = self.word_embedding.cuda() 63 | for idx in range(self.feature_num): 64 | self.feature_embeddings[idx] = self.feature_embeddings[idx].cuda() 65 | 66 | 67 | 68 | def random_embedding(self, vocab_size, embedding_dim): 69 | pretrain_emb = np.empty([vocab_size, embedding_dim]) 70 | scale = np.sqrt(3.0 / embedding_dim) 71 | for index in range(vocab_size): 72 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim]) 73 | return pretrain_emb 74 | 75 | 76 | def forward(self, word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 77 | """ 78 | input: 79 | word_inputs: (batch_size, sent_len) 80 | features: list [(batch_size, sent_len), (batch_len, sent_len),...] 
81 | word_seq_lengths: list of batch_size, (batch_size,1) 82 | char_inputs: (batch_size*sent_len, word_length) 83 | char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1) 84 | char_seq_recover: variable which records the char order information, used to recover char order 85 | output: 86 | Variable(batch_size, sent_len, hidden_dim) 87 | """ 88 | batch_size = word_inputs.size(0) 89 | sent_len = word_inputs.size(1) 90 | 91 | word_embs = self.word_embedding(word_inputs) 92 | 93 | word_list = [word_embs] 94 | if not self.sentence_classification: 95 | for idx in range(self.feature_num): 96 | word_list.append(self.feature_embeddings[idx](feature_inputs[idx])) 97 | if self.use_char: 98 | ## calculate char lstm last hidden 99 | # print("charinput:", char_inputs) 100 | # exit(0) 101 | char_features = self.char_feature.get_last_hiddens(char_inputs, char_seq_lengths.cpu().numpy()) 102 | char_features = char_features[char_seq_recover] 103 | char_features = char_features.view(batch_size,sent_len,-1) 104 | ## concat word and char together 105 | word_list.append(char_features) 106 | word_embs = torch.cat([word_embs, char_features], 2) 107 | if self.char_all_feature: 108 | char_features_extra = self.char_feature_extra.get_last_hiddens(char_inputs, char_seq_lengths.cpu().numpy()) 109 | char_features_extra = char_features_extra[char_seq_recover] 110 | char_features_extra = char_features_extra.view(batch_size,sent_len,-1) 111 | ## concat word and char together 112 | word_list.append(char_features_extra) 113 | word_embs = torch.cat(word_list, 2) 114 | # if a == 0: 115 | # print("inputs", word_inputs) 116 | # print("embeddings:", word_embs) 117 | word_represent = self.drop(word_embs) 118 | return word_represent 119 | -------------------------------------------------------------------------------- /model/wordsequence.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-01 15:59:26 6 | from __future__ import print_function 7 | from __future__ import absolute_import 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 12 | from .wordrep import WordRep 13 | 14 | class WordSequence(nn.Module): 15 | def __init__(self, data): 16 | super(WordSequence, self).__init__() 17 | print("build word sequence feature extractor: %s..."%(data.word_feature_extractor)) 18 | self.gpu = data.HP_gpu 19 | self.use_char = data.use_char 20 | # self.batch_size = data.HP_batch_size 21 | # self.hidden_dim = data.HP_hidden_dim 22 | self.droplstm = nn.Dropout(data.HP_dropout) 23 | self.bilstm_flag = data.HP_bilstm 24 | self.lstm_layer = data.HP_lstm_layer 25 | self.wordrep = WordRep(data) 26 | self.input_size = data.word_emb_dim 27 | self.feature_num = data.feature_num 28 | if self.use_char: 29 | self.input_size += data.HP_char_hidden_dim 30 | if data.char_feature_extractor == "ALL": 31 | self.input_size += data.HP_char_hidden_dim 32 | for idx in range(self.feature_num): 33 | self.input_size += data.feature_emb_dims[idx] 34 | # The LSTM takes word embeddings as inputs, and outputs hidden states 35 | # with dimensionality hidden_dim. 
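        # When the word-level RNN is bidirectional, each direction is given HP_hidden_dim // 2 units,
        # so the concatenated forward/backward outputs still match HP_hidden_dim, the input size
        # expected by the hidden2tag linear layer defined below.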
36 | if self.bilstm_flag: 37 | lstm_hidden = data.HP_hidden_dim // 2 38 | else: 39 | lstm_hidden = data.HP_hidden_dim 40 | 41 | self.word_feature_extractor = data.word_feature_extractor 42 | if self.word_feature_extractor == "GRU": 43 | self.lstm = nn.GRU(self.input_size, lstm_hidden, num_layers=self.lstm_layer, batch_first=True, bidirectional=self.bilstm_flag) 44 | elif self.word_feature_extractor == "LSTM": 45 | self.lstm = nn.LSTM(self.input_size, lstm_hidden, num_layers=self.lstm_layer, batch_first=True, bidirectional=self.bilstm_flag) 46 | elif self.word_feature_extractor == "CNN": 47 | # cnn_hidden = data.HP_hidden_dim 48 | self.word2cnn = nn.Linear(self.input_size, data.HP_hidden_dim) 49 | self.cnn_layer = data.HP_cnn_layer 50 | print("CNN layer: ", self.cnn_layer) 51 | self.cnn_list = nn.ModuleList() 52 | self.cnn_drop_list = nn.ModuleList() 53 | self.cnn_batchnorm_list = nn.ModuleList() 54 | kernel = 3 55 | pad_size = int((kernel-1)/2) 56 | for idx in range(self.cnn_layer): 57 | self.cnn_list.append(nn.Conv1d(data.HP_hidden_dim, data.HP_hidden_dim, kernel_size=kernel, padding=pad_size)) 58 | self.cnn_drop_list.append(nn.Dropout(data.HP_dropout)) 59 | self.cnn_batchnorm_list.append(nn.BatchNorm1d(data.HP_hidden_dim)) 60 | # The linear layer that maps from hidden state space to tag space 61 | self.hidden2tag = nn.Linear(data.HP_hidden_dim, data.label_alphabet_size) 62 | 63 | if self.gpu: 64 | self.droplstm = self.droplstm.cuda() 65 | self.hidden2tag = self.hidden2tag.cuda() 66 | if self.word_feature_extractor == "CNN": 67 | self.word2cnn = self.word2cnn.cuda() 68 | for idx in range(self.cnn_layer): 69 | self.cnn_list[idx] = self.cnn_list[idx].cuda() 70 | self.cnn_drop_list[idx] = self.cnn_drop_list[idx].cuda() 71 | self.cnn_batchnorm_list[idx] = self.cnn_batchnorm_list[idx].cuda() 72 | else: 73 | self.lstm = self.lstm.cuda() 74 | 75 | 76 | def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 77 | """ 78 | input: 79 | word_inputs: (batch_size, sent_len) 80 | feature_inputs: [(batch_size, sent_len), ...] 
list of variables 81 | word_seq_lengths: list of batch_size, (batch_size,1) 82 | char_inputs: (batch_size*sent_len, word_length) 83 | char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1) 84 | char_seq_recover: variable which records the char order information, used to recover char order 85 | output: 86 | Variable(batch_size, sent_len, hidden_dim) 87 | """ 88 | 89 | word_represent = self.wordrep(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 90 | ## word_embs (batch_size, seq_len, embed_size) 91 | if self.word_feature_extractor == "CNN": 92 | batch_size = word_inputs.size(0) 93 | word_in = torch.tanh(self.word2cnn(word_represent)).transpose(2,1).contiguous() 94 | for idx in range(self.cnn_layer): 95 | if idx == 0: 96 | cnn_feature = F.relu(self.cnn_list[idx](word_in)) 97 | else: 98 | cnn_feature = F.relu(self.cnn_list[idx](cnn_feature)) 99 | cnn_feature = self.cnn_drop_list[idx](cnn_feature) 100 | if batch_size > 1: 101 | cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature) 102 | feature_out = cnn_feature.transpose(2,1).contiguous() 103 | else: 104 | packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True) 105 | hidden = None 106 | lstm_out, hidden = self.lstm(packed_words, hidden) 107 | lstm_out, _ = pad_packed_sequence(lstm_out) 108 | ## lstm_out (seq_len, seq_len, hidden_size) 109 | feature_out = self.droplstm(lstm_out.transpose(1,0)) 110 | ## feature_out (batch_size, seq_len, hidden_size) 111 | outputs = self.hidden2tag(feature_out) 112 | return outputs 113 | 114 | def sentence_representation(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 115 | """ 116 | input: 117 | word_inputs: (batch_size, sent_len) 118 | feature_inputs: [(batch_size, ), ...] 
list of variables 119 | word_seq_lengths: list of batch_size, (batch_size,1) 120 | char_inputs: (batch_size*sent_len, word_length) 121 | char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1) 122 | char_seq_recover: variable which records the char order information, used to recover char order 123 | output: 124 | Variable(batch_size, sent_len, hidden_dim) 125 | """ 126 | 127 | word_represent = self.wordrep(word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 128 | ## word_embs (batch_size, seq_len, embed_size) 129 | batch_size = word_inputs.size(0) 130 | if self.word_feature_extractor == "CNN": 131 | word_in = torch.tanh(self.word2cnn(word_represent)).transpose(2,1).contiguous() 132 | for idx in range(self.cnn_layer): 133 | if idx == 0: 134 | cnn_feature = F.relu(self.cnn_list[idx](word_in)) 135 | else: 136 | cnn_feature = F.relu(self.cnn_list[idx](cnn_feature)) 137 | cnn_feature = self.cnn_drop_list[idx](cnn_feature) 138 | if batch_size > 1: 139 | cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature) 140 | feature_out = F.max_pool1d(cnn_feature, cnn_feature.size(2)).view(batch_size, -1) 141 | else: 142 | packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True) 143 | hidden = None 144 | lstm_out, hidden = self.lstm(packed_words, hidden) 145 | ## lstm_out (seq_len, seq_len, hidden_size) 146 | ## feature_out (batch_size, hidden_size) 147 | feature_out = hidden[0].transpose(1,0).contiguous().view(batch_size,-1) 148 | 149 | feature_list = [feature_out] 150 | for idx in range(self.feature_num): 151 | feature_list.append(self.feature_embeddings[idx](feature_inputs[idx])) 152 | final_feature = torch.cat(feature_list, 1) 153 | outputs = self.hidden2tag(self.droplstm(final_feature)) 154 | ## outputs: (batch_size, label_alphabet_size) 155 | return outputs 156 | -------------------------------------------------------------------------------- /readme/Configuration.md: -------------------------------------------------------------------------------- 1 | ### I/O ### 2 | ```Python 3 | train_dir=xx #string (necessary in training). Set training file directory. 4 | dev_dir=xx #string (necessary in training). Set dev file directory. 5 | test_dir=xx #string . Set test file directory. 6 | model_dir=xx #string (optional). Set saved model file directory. 7 | word_emb_dir=xx #string (optional). Set pretrained word embedding file directory. 8 | 9 | raw_dir=xx #string (optional). Set input raw file directory. 10 | decode_dir=xx #string (necessary in decoding). Set decoded file directory. 11 | dset_dir=xx #string (necessary). Set saved model file directory. 12 | load_model_dir=xx #string (necessary in decoding). Set loaded model file directory. (when decoding) 13 | char_emb_dir=xx #string (optional). Set pretrained character embedding file directory. 14 | 15 | norm_word_emb=False #boolen. If normalize the pretrained word embedding. 16 | norm_char_emb=False #boolen. If normalize the pretrained character embedding. 17 | number_normalized=True #boolen. If normalize the digit into `0` for input files. 18 | seg=True #boolen. If task is segmentation like, tasks with token accuracy evaluation (e.g. POS, CCG) is False; tasks with F-value evaluation(e.g. Word Segmentation, NER, Chunking) is True . 19 | word_emb_dim=50 #int. Word embedding dimension, if model use pretrained word embedding, word_emb_dim will be reset as the same dimension as pretrained embedidng. 20 | char_emb_dim=30 #int. 
Character embedding dimension; if the model uses a pretrained character embedding, char_emb_dim will be reset to the same dimension as the pretrained embedding.
21 | ```
22 | 
23 | ### NetworkConfiguration ###
24 | ```Python
25 | use_crf=True #boolean (necessary in training). Whether to use the CRF layer. If set to False, Softmax is used in the inference layer.
26 | use_char=True #boolean (necessary in training). Whether to use the character sequence layer.
27 | word_seq_feature=XX #string (necessary in training): CNN/LSTM/GRU. Neural structure selection for the word sequence layer.
28 | char_seq_feature=CNN #string (necessary in training): CNN/LSTM/GRU. Neural structure selection for the character sequence layer; it is only used when use_char=True.
29 | feature=[POS] emb_size=20 emb_dir=xx #feature configuration. It includes the feature prefix [POS], the pretrained feature embedding file and the embedding size.
30 | feature=[Cap] emb_size=20 emb_dir=xx #feature configuration. Another feature [Cap].
31 | nbest=1 #int (necessary in decoding). Set the nbest size during decoding.
32 | ```
33 | 
34 | ### TrainingSetting ###
35 | ```Python
36 | status=train #string: train or decode. Set whether the program runs in training or decoding mode.
37 | optimizer=SGD #string: SGD/Adagrad/AdaDelta/RMSprop/Adam. Optimizer selection.
38 | iteration=1 #int. Set the number of training iterations.
39 | batch_size=10 #int. Set the batch size for training or decoding.
40 | ave_batch_loss=False #boolean. Whether to average the batched loss during training.
41 | ```
42 | 
43 | ### Hyperparameters ###
44 | ```Python
45 | cnn_layer=4 #int. CNN layer number for the word sequence layer.
46 | char_hidden_dim=50 #int. Character hidden vector dimension for the character sequence layer.
47 | hidden_dim=200 #int. Word hidden vector dimension for the word sequence layer.
48 | dropout=0.5 #float. Dropout probability.
49 | lstm_layer=1 #int. LSTM layer number for the word sequence layer.
50 | bilstm=True #boolean. Whether to use a bidirectional LSTM for the word sequence layer.
51 | learning_rate=0.015 #float. Learning rate.
52 | lr_decay=0.05 #float. Learning rate decay rate; only works when optimizer=SGD.
53 | momentum=0 #float. Momentum.
54 | l2=1e-8 #float. L2 regularization.
55 | #gpu=True #boolean. Whether to use the GPU; generally this depends on the hardware environment.
56 | #clip= #float. Clip gradients that are larger than the set value.
57 | ```
58 | 
-------------------------------------------------------------------------------- /readme/Extension.md: --------------------------------------------------------------------------------
1 | ### Module Extension ###
2 | 
3 | If you want to extend the character sequence layer, please refer to the file [charbilstm.py](model/charbilstm.py).
4 | 
5 | If you want to extend the word sequence layer, please refer to the file [wordsequence.py](model/wordsequence.py).
6 | 
7 | More details will be updated soon. 
-------------------------------------------------------------------------------- /readme/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiesutd/NCRFpp/105a53a321eca9c1280037c473967858e01aaa43/readme/architecture.png
-------------------------------------------------------------------------------- /readme/hyperparameter_tuning.md: --------------------------------------------------------------------------------
1 | ## Hyperparameter tuning on the CoNLL 2003 English NER task
2 | 
3 | 1. If you use a large batch size (e.g.
batch_size > 100), you'd better set `avg_batch_loss=True` to get a stable training process. For small batch size, `avg_batch_loss=True` will converge faster and sometimes gives better performance (e.g. CoNLL 2003 NER). 4 | 2. You can get better performance on the CoNLL 2003 English dataset if you use 100-d pretrained word vectors [here](https://nlp.stanford.edu/projects/glove/) instead of 50-d pretrained word vectors. 5 | 3. If you want to write a script to tune hyperparameters, you can use the `main_parse.py` to set hyperparameters in command line arguements. 6 | 4. Model performance is sensitive with `lr` which needs to be carefully tuned under different structures: 7 | * Word level LSTM models (e.g. char LSTM + word LSTM + CRF) would prefer a `lr` around 0.015. 8 | * Word level CNN models (e.g. char LSTM + word CNN + CRF) would prefer a `lr` around 0.005 and with more iterations. 9 | * You can refer the COLING paper "[Design Challenges and Misconceptions in Neural Sequence Labeling](https://arxiv.org/pdf/1806.04470.pdf)" for more hyperparameter settings. 10 | -------------------------------------------------------------------------------- /readme/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiesutd/NCRFpp/105a53a321eca9c1280037c473967858e01aaa43/readme/logo.png -------------------------------------------------------------------------------- /readme/nbest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiesutd/NCRFpp/105a53a321eca9c1280037c473967858e01aaa43/readme/nbest.png -------------------------------------------------------------------------------- /readme/speed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiesutd/NCRFpp/105a53a321eca9c1280037c473967858e01aaa43/readme/speed.png -------------------------------------------------------------------------------- /sample_data/dev.bmes: -------------------------------------------------------------------------------- 1 | CRICKET O 2 | - O 3 | LEICESTERSHIRE S-ORG 4 | TAKE O 5 | OVER O 6 | AT O 7 | TOP O 8 | AFTER O 9 | INNINGS O 10 | VICTORY O 11 | . O 12 | 13 | LONDON S-LOC 14 | 1996-08-30 O 15 | 16 | West B-MISC 17 | Indian E-MISC 18 | all-rounder O 19 | Phil B-PER 20 | Simmons E-PER 21 | took O 22 | four O 23 | for O 24 | 38 O 25 | on O 26 | Friday O 27 | as O 28 | Leicestershire S-ORG 29 | beat O 30 | Somerset S-ORG 31 | by O 32 | an O 33 | innings O 34 | and O 35 | 39 O 36 | runs O 37 | in O 38 | two O 39 | days O 40 | to O 41 | take O 42 | over O 43 | at O 44 | the O 45 | head O 46 | of O 47 | the O 48 | county O 49 | championship O 50 | . O 51 | 52 | Their O 53 | stay O 54 | on O 55 | top O 56 | , O 57 | though O 58 | , O 59 | may O 60 | be O 61 | short-lived O 62 | as O 63 | title O 64 | rivals O 65 | Essex S-ORG 66 | , O 67 | Derbyshire S-ORG 68 | and O 69 | Surrey S-ORG 70 | all O 71 | closed O 72 | in O 73 | on O 74 | victory O 75 | while O 76 | Kent S-ORG 77 | made O 78 | up O 79 | for O 80 | lost O 81 | time O 82 | in O 83 | their O 84 | rain-affected O 85 | match O 86 | against O 87 | Nottinghamshire S-ORG 88 | . 
O 89 | 90 | After O 91 | bowling O 92 | Somerset S-ORG 93 | out O 94 | for O 95 | 83 O 96 | on O 97 | the O 98 | opening O 99 | morning O 100 | at O 101 | Grace B-LOC 102 | Road E-LOC 103 | , O 104 | Leicestershire S-ORG 105 | extended O 106 | their O 107 | first O 108 | innings O 109 | by O 110 | 94 O 111 | runs O 112 | before O 113 | being O 114 | bowled O 115 | out O 116 | for O 117 | 296 O 118 | with O 119 | England S-LOC 120 | discard O 121 | Andy B-PER 122 | Caddick E-PER 123 | taking O 124 | three O 125 | for O 126 | 83 O 127 | . O 128 | 129 | Trailing O 130 | by O 131 | 213 O 132 | , O 133 | Somerset S-ORG 134 | got O 135 | a O 136 | solid O 137 | start O 138 | to O 139 | their O 140 | second O 141 | innings O 142 | before O 143 | Simmons S-PER 144 | stepped O 145 | in O 146 | to O 147 | bundle O 148 | them O 149 | out O 150 | for O 151 | 174 O 152 | . O 153 | 154 | Essex S-ORG 155 | , O 156 | however O 157 | , O 158 | look O 159 | certain O 160 | to O 161 | regain O 162 | their O 163 | top O 164 | spot O 165 | after O 166 | Nasser B-PER 167 | Hussain E-PER 168 | and O 169 | Peter B-PER 170 | Such E-PER 171 | gave O 172 | them O 173 | a O 174 | firm O 175 | grip O 176 | on O 177 | their O 178 | match O 179 | against O 180 | Yorkshire S-ORG 181 | at O 182 | Headingley S-LOC 183 | . O 184 | 185 | Hussain S-PER 186 | , O 187 | considered O 188 | surplus O 189 | to O 190 | England S-LOC 191 | 's O 192 | one-day O 193 | requirements O 194 | , O 195 | struck O 196 | 158 O 197 | , O 198 | his O 199 | first O 200 | championship O 201 | century O 202 | of O 203 | the O 204 | season O 205 | , O 206 | as O 207 | Essex S-ORG 208 | reached O 209 | 372 O 210 | and O 211 | took O 212 | a O 213 | first O 214 | innings O 215 | lead O 216 | of O 217 | 82 O 218 | . O 219 | 220 | By O 221 | the O 222 | close O 223 | Yorkshire S-ORG 224 | had O 225 | turned O 226 | that O 227 | into O 228 | a O 229 | 37-run O 230 | advantage O 231 | but O 232 | off-spinner O 233 | Such S-PER 234 | had O 235 | scuttled O 236 | their O 237 | hopes O 238 | , O 239 | taking O 240 | four O 241 | for O 242 | 24 O 243 | in O 244 | 48 O 245 | balls O 246 | and O 247 | leaving O 248 | them O 249 | hanging O 250 | on O 251 | 119 O 252 | for O 253 | five O 254 | and O 255 | praying O 256 | for O 257 | rain O 258 | . O 259 | 260 | At O 261 | the O 262 | Oval S-LOC 263 | , O 264 | Surrey S-ORG 265 | captain O 266 | Chris B-PER 267 | Lewis E-PER 268 | , O 269 | another O 270 | man O 271 | dumped O 272 | by O 273 | England S-LOC 274 | , O 275 | continued O 276 | to O 277 | silence O 278 | his O 279 | critics O 280 | as O 281 | he O 282 | followed O 283 | his O 284 | four O 285 | for O 286 | 45 O 287 | on O 288 | Thursday O 289 | with O 290 | 80 O 291 | not O 292 | out O 293 | on O 294 | Friday O 295 | in O 296 | the O 297 | match O 298 | against O 299 | Warwickshire S-ORG 300 | . O 301 | 302 | He O 303 | was O 304 | well O 305 | backed O 306 | by O 307 | England S-LOC 308 | hopeful O 309 | Mark B-PER 310 | Butcher E-PER 311 | who O 312 | made O 313 | 70 O 314 | as O 315 | Surrey S-ORG 316 | closed O 317 | on O 318 | 429 O 319 | for O 320 | seven O 321 | , O 322 | a O 323 | lead O 324 | of O 325 | 234 O 326 | . 
O 327 | 328 | Derbyshire S-ORG 329 | kept O 330 | up O 331 | the O 332 | hunt O 333 | for O 334 | their O 335 | first O 336 | championship O 337 | title O 338 | since O 339 | 1936 O 340 | by O 341 | reducing O 342 | Worcestershire S-ORG 343 | to O 344 | 133 O 345 | for O 346 | five O 347 | in O 348 | their O 349 | second O 350 | innings O 351 | , O 352 | still O 353 | 100 O 354 | runs O 355 | away O 356 | from O 357 | avoiding O 358 | an O 359 | innings O 360 | defeat O 361 | . O 362 | 363 | Australian S-MISC 364 | Tom B-PER 365 | Moody E-PER 366 | took O 367 | six O 368 | for O 369 | 82 O 370 | but O 371 | Chris B-PER 372 | Adams E-PER 373 | , O 374 | 123 O 375 | , O 376 | and O 377 | Tim B-PER 378 | O'Gorman E-PER 379 | , O 380 | 109 O 381 | , O 382 | took O 383 | Derbyshire S-ORG 384 | to O 385 | 471 O 386 | and O 387 | a O 388 | first O 389 | innings O 390 | lead O 391 | of O 392 | 233 O 393 | . O 394 | 395 | After O 396 | the O 397 | frustration O 398 | of O 399 | seeing O 400 | the O 401 | opening O 402 | day O 403 | of O 404 | their O 405 | match O 406 | badly O 407 | affected O 408 | by O 409 | the O 410 | weather O 411 | , O 412 | Kent S-ORG 413 | stepped O 414 | up O 415 | a O 416 | gear O 417 | to O 418 | dismiss O 419 | Nottinghamshire S-ORG 420 | for O 421 | 214 O 422 | . O 423 | 424 | They O 425 | were O 426 | held O 427 | up O 428 | by O 429 | a O 430 | gritty O 431 | 84 O 432 | from O 433 | Paul B-PER 434 | Johnson E-PER 435 | but O 436 | ex-England S-MISC 437 | fast O 438 | bowler O 439 | Martin B-PER 440 | McCague E-PER 441 | took O 442 | four O 443 | for O 444 | 55 O 445 | . O 446 | 447 | By O 448 | stumps O 449 | Kent S-ORG 450 | had O 451 | reached O 452 | 108 O 453 | for O 454 | three O 455 | . O 456 | 457 | -DOCSTART- O 458 | 459 | CRICKET O 460 | - O 461 | ENGLISH B-MISC 462 | COUNTY I-MISC 463 | CHAMPIONSHIP E-MISC 464 | SCORES O 465 | . O 466 | 467 | LONDON S-LOC 468 | 1996-08-30 O 469 | 470 | Result O 471 | and O 472 | close O 473 | of O 474 | play O 475 | scores O 476 | in O 477 | English S-MISC 478 | county O 479 | championship O 480 | matches O 481 | on O 482 | Friday O 483 | : O 484 | 485 | Leicester S-LOC 486 | : O 487 | Leicestershire S-ORG 488 | beat O 489 | Somerset S-ORG 490 | by O 491 | an O 492 | innings O 493 | and O 494 | 39 O 495 | runs O 496 | . O 497 | 498 | Somerset S-ORG 499 | 83 O 500 | and O 501 | 174 O 502 | ( O 503 | P. B-PER 504 | Simmons E-PER 505 | 4-38 O 506 | ) O 507 | , O 508 | Leicestershire S-ORG 509 | 296 O 510 | . O 511 | 512 | Leicestershire S-ORG 513 | 22 O 514 | points O 515 | , O 516 | Somerset S-ORG 517 | 4 O 518 | . O 519 | 520 | Chester-le-Street S-LOC 521 | : O 522 | Glamorgan S-ORG 523 | 259 O 524 | and O 525 | 207 O 526 | ( O 527 | A. B-PER 528 | Dale E-PER 529 | 69 O 530 | , O 531 | H. B-PER 532 | Morris E-PER 533 | 69 O 534 | ; O 535 | D. B-PER 536 | Blenkiron E-PER 537 | 4-43 O 538 | ) O 539 | , O 540 | Durham S-ORG 541 | 114 O 542 | ( O 543 | S. B-PER 544 | Watkin E-PER 545 | 4-28 O 546 | ) O 547 | and O 548 | 81-3 O 549 | . O 550 | 551 | Tunbridge B-LOC 552 | Wells E-LOC 553 | : O 554 | Nottinghamshire S-ORG 555 | 214 O 556 | ( O 557 | P. B-PER 558 | Johnson E-PER 559 | 84 O 560 | ; O 561 | M. B-PER 562 | McCague E-PER 563 | 4-55 O 564 | ) O 565 | , O 566 | Kent S-ORG 567 | 108-3 O 568 | . O 569 | 570 | London S-LOC 571 | ( O 572 | The B-LOC 573 | Oval E-LOC 574 | ) O 575 | : O 576 | Warwickshire S-ORG 577 | 195 O 578 | , O 579 | Surrey S-ORG 580 | 429-7 O 581 | ( O 582 | C. 
B-PER 583 | Lewis E-PER 584 | 80 O 585 | not O 586 | out O 587 | , O 588 | M. B-PER 589 | Butcher E-PER 590 | 70 O 591 | , O 592 | G. B-PER 593 | Kersey E-PER 594 | 63 O 595 | , O 596 | J. B-PER 597 | Ratcliffe E-PER 598 | 63 O 599 | , O 600 | D. B-PER 601 | Bicknell E-PER 602 | 55 O 603 | ) O 604 | . O 605 | 606 | Hove S-LOC 607 | : O 608 | Sussex S-ORG 609 | 363 O 610 | ( O 611 | W. B-PER 612 | Athey E-PER 613 | 111 O 614 | , O 615 | V. B-PER 616 | Drakes E-PER 617 | 52 O 618 | ; O 619 | I. B-PER 620 | Austin E-PER 621 | 4-37 O 622 | ) O 623 | , O 624 | Lancashire S-ORG 625 | 197-8 O 626 | ( O 627 | W. B-PER 628 | Hegg E-PER 629 | 54 O 630 | ) O 631 | 632 | Portsmouth S-LOC 633 | : O 634 | Middlesex S-ORG 635 | 199 O 636 | and O 637 | 426 O 638 | ( O 639 | J. B-PER 640 | Pooley E-PER 641 | 111 O 642 | , O 643 | M. B-PER 644 | Ramprakash E-PER 645 | 108 O 646 | , O 647 | M. B-PER 648 | Gatting E-PER 649 | 83 O 650 | ) O 651 | , O 652 | Hampshire S-ORG 653 | 232 O 654 | and O 655 | 109-5 O 656 | . O 657 | 658 | Chesterfield S-LOC 659 | : O 660 | Worcestershire S-ORG 661 | 238 O 662 | and O 663 | 133-5 O 664 | , O 665 | Derbyshire S-ORG 666 | 471 O 667 | ( O 668 | J. B-PER 669 | Adams E-PER 670 | 123 O 671 | , O 672 | T.O'Gorman S-PER 673 | 109 O 674 | not O 675 | out O 676 | , O 677 | K. B-PER 678 | Barnett E-PER 679 | 87 O 680 | ; O 681 | T. B-PER 682 | Moody E-PER 683 | 6-82 O 684 | ) O 685 | 686 | Bristol S-LOC 687 | : O 688 | Gloucestershire S-ORG 689 | 183 O 690 | and O 691 | 185-6 O 692 | ( O 693 | J. B-PER 694 | Russell E-PER 695 | 56 O 696 | not O 697 | out O 698 | ) O 699 | , O 700 | Northamptonshire S-ORG 701 | 190 O 702 | ( O 703 | K. B-PER 704 | Curran E-PER 705 | 52 O 706 | ; O 707 | A. B-PER 708 | Smith E-PER 709 | 5-68 O 710 | ) O 711 | . O 712 | 713 | -DOCSTART- O 714 | 715 | CRICKET O 716 | - O 717 | 1997 O 718 | ASHES S-MISC 719 | INTINERARY O 720 | . O 721 | 722 | LONDON S-LOC 723 | 1996-08-30 O 724 | 725 | Australia S-LOC 726 | will O 727 | defend O 728 | the O 729 | Ashes S-MISC 730 | in O 731 | 732 | a O 733 | six-test O 734 | series O 735 | against O 736 | England S-LOC 737 | during O 738 | a O 739 | four-month O 740 | tour O 741 | 742 | starting O 743 | on O 744 | May O 745 | 13 O 746 | next O 747 | year O 748 | , O 749 | the O 750 | Test B-ORG 751 | and I-ORG 752 | County I-ORG 753 | Cricket I-ORG 754 | Board E-ORG 755 | 756 | said O 757 | on O 758 | Friday O 759 | . O 760 | 761 | Australia S-LOC 762 | will O 763 | also O 764 | play O 765 | three O 766 | one-day O 767 | internationals O 768 | and O 769 | 770 | four O 771 | one-day O 772 | warm-up O 773 | matches O 774 | at O 775 | the O 776 | start O 777 | of O 778 | the O 779 | tour O 780 | . O 781 | 782 | The O 783 | tourists O 784 | will O 785 | play O 786 | nine O 787 | first-class O 788 | matches O 789 | against O 790 | 791 | English S-MISC 792 | county O 793 | sides O 794 | and O 795 | another O 796 | against O 797 | British B-ORG 798 | Universities E-ORG 799 | , O 800 | 801 | as O 802 | well O 803 | as O 804 | one-day O 805 | matches O 806 | against O 807 | the O 808 | Minor B-ORG 809 | Counties E-ORG 810 | and O 811 | 812 | Scotland S-LOC 813 | . 
O 814 | 815 | Tour O 816 | itinerary O 817 | : O 818 | 819 | May O 820 | 821 | May O 822 | 13 O 823 | Arrive O 824 | in O 825 | London S-LOC 826 | 827 | May O 828 | 14 O 829 | Practice O 830 | at O 831 | Lord B-LOC 832 | 's E-LOC 833 | 834 | May O 835 | 15 O 836 | v O 837 | Duke B-ORG 838 | of I-ORG 839 | Norfolk I-ORG 840 | 's I-ORG 841 | XI E-ORG 842 | ( O 843 | at O 844 | Arundel S-LOC 845 | ) O 846 | 847 | May O 848 | 17 O 849 | v O 850 | Northampton S-ORG 851 | 852 | May O 853 | 18 O 854 | v O 855 | Worcestershire S-ORG 856 | 857 | May O 858 | 20 O 859 | v O 860 | Durham S-ORG 861 | 862 | May O 863 | 22 O 864 | First O 865 | one-day O 866 | international O 867 | ( O 868 | at O 869 | Headingley S-LOC 870 | , O 871 | 872 | Leeds S-ORG 873 | ) O 874 | 875 | May O 876 | 24 O 877 | Second O 878 | one-day O 879 | international O 880 | ( O 881 | at O 882 | The B-LOC 883 | Oval E-LOC 884 | , O 885 | 886 | London S-LOC 887 | ) O 888 | 889 | May O 890 | 25 O 891 | Third O 892 | one-day O 893 | international O 894 | ( O 895 | at O 896 | Lord B-LOC 897 | 's E-LOC 898 | , O 899 | London S-LOC 900 | ) O 901 | 902 | May O 903 | 27-29 O 904 | v O 905 | Gloucestershire S-ORG 906 | or O 907 | Sussex S-ORG 908 | or O 909 | Surrey S-ORG 910 | ( O 911 | three O 912 | 913 | days O 914 | ) O 915 | 916 | May O 917 | 31 O 918 | - O 919 | June O 920 | 2 O 921 | v O 922 | Derbyshire S-ORG 923 | ( O 924 | three O 925 | days O 926 | ) O 927 | 928 | June O 929 | 930 | June O 931 | 5-9 O 932 | First O 933 | test O 934 | match O 935 | ( O 936 | at O 937 | Edgbaston S-LOC 938 | , O 939 | Birmingham S-LOC 940 | ) O 941 | 942 | June O 943 | 11-13 O 944 | v O 945 | a O 946 | first O 947 | class O 948 | county O 949 | ( O 950 | to O 951 | be O 952 | confirmed O 953 | ) O 954 | 955 | June O 956 | 14-16 O 957 | v O 958 | Leicestershire S-ORG 959 | ( O 960 | three O 961 | days O 962 | ) O 963 | 964 | June O 965 | 19-23 O 966 | Second O 967 | test O 968 | ( O 969 | at O 970 | Lord B-LOC 971 | 's E-LOC 972 | ) O 973 | 974 | June O 975 | 25-27 O 976 | v O 977 | British B-ORG 978 | Universities E-ORG 979 | ( O 980 | at O 981 | Oxford S-LOC 982 | , O 983 | three O 984 | days O 985 | ) O 986 | 987 | June O 988 | 28-30 O 989 | v O 990 | Hampshire S-ORG 991 | ( O 992 | three O 993 | days O 994 | ) O 995 | 996 | July O 997 | 998 | July O 999 | 3-7 O 1000 | Third O 1001 | test O 1002 | ( O 1003 | at O 1004 | Old B-LOC 1005 | Trafford E-LOC 1006 | , O 1007 | Manchester S-LOC 1008 | ) O 1009 | 1010 | July O 1011 | 9 O 1012 | v O 1013 | Minor B-ORG 1014 | Counties I-ORG 1015 | XI E-ORG 1016 | 1017 | July O 1018 | 12 O 1019 | v O 1020 | Scotland S-LOC 1021 | 1022 | July O 1023 | 16-18 O 1024 | v O 1025 | Glamorgan S-ORG 1026 | ( O 1027 | three O 1028 | days O 1029 | ) O 1030 | 1031 | July O 1032 | 19-21 O 1033 | v O 1034 | Middlesex S-ORG 1035 | ( O 1036 | three O 1037 | days O 1038 | ) O 1039 | 1040 | July O 1041 | 24-28 O 1042 | Fourth O 1043 | test O 1044 | ( O 1045 | at O 1046 | Headingley S-LOC 1047 | ) O 1048 | 1049 | August O 1050 | 1051 | August O 1052 | 1-4 O 1053 | v O 1054 | Somerset S-ORG 1055 | ( O 1056 | four O 1057 | days O 1058 | ) O 1059 | 1060 | August O 1061 | 7-11 O 1062 | Fifth O 1063 | test O 1064 | ( O 1065 | at O 1066 | Trent B-LOC 1067 | Bridge E-LOC 1068 | , O 1069 | Nottingham S-LOC 1070 | ) O 1071 | 1072 | August O 1073 | 16-18 O 1074 | v O 1075 | Kent S-ORG 1076 | ( O 1077 | three O 1078 | days O 1079 | ) O 1080 | 1081 | August O 1082 | 21-25 O 1083 | Sixth O 1084 | test O 1085 | ( O 1086 | at O 1087 | The 
B-LOC 1088 | Oval E-LOC 1089 | , O 1090 | London S-LOC 1091 | ) O 1092 | . O 1093 | 1094 | -DOCSTART- O 1095 | 1096 | SOCCER O 1097 | - O 1098 | SHEARER S-PER 1099 | NAMED O 1100 | AS O 1101 | ENGLAND S-LOC 1102 | CAPTAIN O 1103 | . O 1104 | 1105 | LONDON S-LOC 1106 | 1996-08-30 O 1107 | 1108 | The O 1109 | world O 1110 | 's O 1111 | costliest O 1112 | footballer O 1113 | Alan B-PER 1114 | Shearer E-PER 1115 | was O 1116 | named O 1117 | as O 1118 | the O 1119 | new O 1120 | England S-LOC 1121 | captain O 1122 | on O 1123 | Friday O 1124 | . O 1125 | 1126 | The O 1127 | 26-year-old O 1128 | , O 1129 | who O 1130 | joined O 1131 | Newcastle S-ORG 1132 | for O 1133 | 15 O 1134 | million O 1135 | pounds O 1136 | sterling O 1137 | ( O 1138 | $ O 1139 | 23.4 O 1140 | million O 1141 | ) O 1142 | , O 1143 | takes O 1144 | over O 1145 | from O 1146 | Tony B-PER 1147 | Adams E-PER 1148 | , O 1149 | who O 1150 | led O 1151 | the O 1152 | side O 1153 | during O 1154 | the O 1155 | European S-MISC 1156 | championship O 1157 | in O 1158 | June O 1159 | , O 1160 | and O 1161 | former O 1162 | captain O 1163 | David B-PER 1164 | Platt E-PER 1165 | . O 1166 | 1167 | Adams S-PER 1168 | and O 1169 | Platt S-PER 1170 | are O 1171 | both O 1172 | injured O 1173 | and O 1174 | will O 1175 | miss O 1176 | England S-LOC 1177 | 's O 1178 | opening O 1179 | World B-MISC 1180 | Cup E-MISC 1181 | qualifier O 1182 | against O 1183 | Moldova S-LOC 1184 | on O 1185 | Sunday O 1186 | . O 1187 | 1188 | Shearer S-PER 1189 | takes O 1190 | the O 1191 | captaincy O 1192 | on O 1193 | a O 1194 | trial O 1195 | basis O 1196 | , O 1197 | but O 1198 | new O 1199 | coach O 1200 | Glenn B-PER 1201 | Hoddle E-PER 1202 | said O 1203 | he O 1204 | saw O 1205 | no O 1206 | reason O 1207 | why O 1208 | the O 1209 | former O 1210 | Blackburn S-ORG 1211 | and O 1212 | Southampton S-ORG 1213 | skipper O 1214 | should O 1215 | not O 1216 | make O 1217 | the O 1218 | post O 1219 | his O 1220 | own O 1221 | . O 1222 | 1223 | " O 1224 | I O 1225 | 'm O 1226 | sure O 1227 | there O 1228 | wo O 1229 | n't O 1230 | be O 1231 | a O 1232 | problem O 1233 | , O 1234 | I O 1235 | 'm O 1236 | sure O 1237 | Alan S-PER 1238 | is O 1239 | the O 1240 | man O 1241 | for O 1242 | the O 1243 | job O 1244 | , O 1245 | " O 1246 | Hoddle S-PER 1247 | said O 1248 | . O 1249 | 1250 | " O 1251 | There O 1252 | were O 1253 | three O 1254 | or O 1255 | four O 1256 | people O 1257 | who O 1258 | could O 1259 | have O 1260 | done O 1261 | it O 1262 | but O 1263 | when O 1264 | I O 1265 | spoke O 1266 | to O 1267 | Alan S-PER 1268 | he O 1269 | was O 1270 | up O 1271 | for O 1272 | it O 1273 | and O 1274 | really O 1275 | wanted O 1276 | it O 1277 | . O 1278 | 1279 | " O 1280 | In O 1281 | four O 1282 | days O 1283 | it O 1284 | 's O 1285 | very O 1286 | difficult O 1287 | to O 1288 | come O 1289 | to O 1290 | a O 1291 | 100 O 1292 | percent O 1293 | conclusion O 1294 | about O 1295 | something O 1296 | like O 1297 | this O 1298 | ... O 1299 | 1300 | but O 1301 | he O 1302 | knows O 1303 | how O 1304 | to O 1305 | conduct O 1306 | himself O 1307 | , O 1308 | his O 1309 | team O 1310 | mates O 1311 | respect O 1312 | him O 1313 | and O 1314 | he O 1315 | knows O 1316 | about O 1317 | the O 1318 | team O 1319 | situation O 1320 | even O 1321 | though O 1322 | he O 1323 | plays O 1324 | up O 1325 | front O 1326 | . 
O 1327 | " O 1328 | 1329 | Shearer S-PER 1330 | 's O 1331 | Euro B-MISC 1332 | 96 E-MISC 1333 | striking O 1334 | partner O 1335 | Teddy B-PER 1336 | Sheringham E-PER 1337 | withdrew O 1338 | from O 1339 | the O 1340 | squad O 1341 | with O 1342 | an O 1343 | injury O 1344 | on O 1345 | Friday O 1346 | . O 1347 | 1348 | He O 1349 | will O 1350 | probably O 1351 | be O 1352 | replaced O 1353 | by O 1354 | Shearer S-PER 1355 | 's O 1356 | Newcastle S-ORG 1357 | team O 1358 | mate O 1359 | Les B-PER 1360 | Ferdinand E-PER 1361 | . O 1362 | 1363 | -DOCSTART- O 1364 | 1365 | BASKETBALL O 1366 | - O 1367 | INTERNATIONAL O 1368 | TOURNAMENT O 1369 | RESULT O 1370 | . O 1371 | 1372 | BELGRADE S-LOC 1373 | 1996-08-30 O 1374 | 1375 | Result O 1376 | in O 1377 | an O 1378 | international O 1379 | 1380 | basketball O 1381 | tournament O 1382 | on O 1383 | Friday O 1384 | : O 1385 | 1386 | Red B-ORG 1387 | Star E-ORG 1388 | ( O 1389 | Yugoslavia S-LOC 1390 | ) O 1391 | beat O 1392 | Dinamo S-ORG 1393 | ( O 1394 | Russia S-LOC 1395 | ) O 1396 | 92-90 O 1397 | ( O 1398 | halftime O 1399 | 1400 | 47-47 O 1401 | ) O 1402 | 1403 | -DOCSTART- O 1404 | 1405 | SOCCER O 1406 | - O 1407 | ROMANIA S-LOC 1408 | BEAT O 1409 | LITHUANIA S-LOC 1410 | IN O 1411 | UNDER-21 O 1412 | MATCH O 1413 | . O 1414 | 1415 | BUCHAREST S-LOC 1416 | 1996-08-30 O 1417 | 1418 | Romania S-LOC 1419 | beat O 1420 | Lithuania S-LOC 1421 | 2-1 O 1422 | ( O 1423 | halftime O 1424 | 1-1 O 1425 | ) O 1426 | in O 1427 | their O 1428 | European S-MISC 1429 | under-21 O 1430 | soccer O 1431 | match O 1432 | on O 1433 | Friday O 1434 | . O 1435 | 1436 | Scorers O 1437 | : O 1438 | 1439 | Romania S-LOC 1440 | - O 1441 | Cosmin B-PER 1442 | Contra E-PER 1443 | ( O 1444 | 31st O 1445 | ) O 1446 | , O 1447 | Mihai B-PER 1448 | Tararache E-PER 1449 | ( O 1450 | 75th O 1451 | ) O 1452 | 1453 | Lithuania S-LOC 1454 | - O 1455 | Danius B-PER 1456 | Gleveckas E-PER 1457 | ( O 1458 | 13rd O 1459 | ) O 1460 | 1461 | Attendance O 1462 | : O 1463 | 200 O 1464 | 1465 | -DOCSTART- O 1466 | 1467 | SOCCER O 1468 | - O 1469 | ROTOR S-ORG 1470 | FANS O 1471 | LOCKED O 1472 | OUT O 1473 | AFTER O 1474 | VOLGOGRAD S-LOC 1475 | VIOLENCE O 1476 | . O 1477 | 1478 | MOSCOW S-LOC 1479 | 1996-08-30 O 1480 | 1481 | Rotor B-ORG 1482 | Volgograd E-ORG 1483 | must O 1484 | play O 1485 | their O 1486 | next O 1487 | home O 1488 | game O 1489 | behind O 1490 | closed O 1491 | doors O 1492 | after O 1493 | fans O 1494 | hurled O 1495 | bottles O 1496 | and O 1497 | stones O 1498 | at O 1499 | Dynamo B-ORG 1500 | Moscow E-ORG 1501 | players O 1502 | during O 1503 | a O 1504 | 1-0 O 1505 | home O 1506 | defeat O 1507 | on O 1508 | Saturday O 1509 | that O 1510 | ended O 1511 | Rotor S-ORG 1512 | 's O 1513 | brief O 1514 | spell O 1515 | as O 1516 | league O 1517 | leaders O 1518 | . O 1519 | 1520 | The O 1521 | head O 1522 | of O 1523 | the O 1524 | Russian S-MISC 1525 | league O 1526 | 's O 1527 | disciplinary O 1528 | committee O 1529 | , O 1530 | Anatoly B-PER 1531 | Gorokhovsky E-PER 1532 | , O 1533 | said O 1534 | on O 1535 | Friday O 1536 | that O 1537 | Rotor S-ORG 1538 | would O 1539 | play O 1540 | Lada B-ORG 1541 | Togliatti E-ORG 1542 | to O 1543 | empty O 1544 | stands O 1545 | on O 1546 | September O 1547 | 3 O 1548 | . 
O 1549 | 1550 | The O 1551 | club O 1552 | , O 1553 | who O 1554 | put O 1555 | Manchester B-ORG 1556 | United E-ORG 1557 | out O 1558 | of O 1559 | last O 1560 | year O 1561 | 's O 1562 | UEFA B-MISC 1563 | Cup E-MISC 1564 | , O 1565 | were O 1566 | fined O 1567 | $ O 1568 | 1,000 O 1569 | . O -------------------------------------------------------------------------------- /sample_data/raw.bmes: -------------------------------------------------------------------------------- 1 | CRICKET O 2 | - O 3 | LEICESTERSHIRE S-ORG 4 | TAKE O 5 | OVER O 6 | AT O 7 | TOP O 8 | AFTER O 9 | INNINGS O 10 | VICTORY O 11 | . O 12 | 13 | LONDON S-LOC 14 | 1996-08-30 O 15 | 16 | West B-MISC 17 | Indian E-MISC 18 | all-rounder O 19 | Phil B-PER 20 | Simmons E-PER 21 | took O 22 | four O 23 | for O 24 | 38 O 25 | on O 26 | Friday O 27 | as O 28 | Leicestershire S-ORG 29 | beat O 30 | Somerset S-ORG 31 | by O 32 | an O 33 | innings O 34 | and O 35 | 39 O 36 | runs O 37 | in O 38 | two O 39 | days O 40 | to O 41 | take O 42 | over O 43 | at O 44 | the O 45 | head O 46 | of O 47 | the O 48 | county O 49 | championship O 50 | . O 51 | 52 | Their O 53 | stay O 54 | on O 55 | top O 56 | , O 57 | though O 58 | , O 59 | may O 60 | be O 61 | short-lived O 62 | as O 63 | title O 64 | rivals O 65 | Essex S-ORG 66 | , O 67 | Derbyshire S-ORG 68 | and O 69 | Surrey S-ORG 70 | all O 71 | closed O 72 | in O 73 | on O 74 | victory O 75 | while O 76 | Kent S-ORG 77 | made O 78 | up O 79 | for O 80 | lost O 81 | time O 82 | in O 83 | their O 84 | rain-affected O 85 | match O 86 | against O 87 | Nottinghamshire S-ORG 88 | . O 89 | 90 | After O 91 | bowling O 92 | Somerset S-ORG 93 | out O 94 | for O 95 | 83 O 96 | on O 97 | the O 98 | opening O 99 | morning O 100 | at O 101 | Grace B-LOC 102 | Road E-LOC 103 | , O 104 | Leicestershire S-ORG 105 | extended O 106 | their O 107 | first O 108 | innings O 109 | by O 110 | 94 O 111 | runs O 112 | before O 113 | being O 114 | bowled O 115 | out O 116 | for O 117 | 296 O 118 | with O 119 | England S-LOC 120 | discard O 121 | Andy B-PER 122 | Caddick E-PER 123 | taking O 124 | three O 125 | for O 126 | 83 O 127 | . O 128 | 129 | Trailing O 130 | by O 131 | 213 O 132 | , O 133 | Somerset S-ORG 134 | got O 135 | a O 136 | solid O 137 | start O 138 | to O 139 | their O 140 | second O 141 | innings O 142 | before O 143 | Simmons S-PER 144 | stepped O 145 | in O 146 | to O 147 | bundle O 148 | them O 149 | out O 150 | for O 151 | 174 O 152 | . O 153 | 154 | Essex S-ORG 155 | , O 156 | however O 157 | , O 158 | look O 159 | certain O 160 | to O 161 | regain O 162 | their O 163 | top O 164 | spot O 165 | after O 166 | Nasser B-PER 167 | Hussain E-PER 168 | and O 169 | Peter B-PER 170 | Such E-PER 171 | gave O 172 | them O 173 | a O 174 | firm O 175 | grip O 176 | on O 177 | their O 178 | match O 179 | against O 180 | Yorkshire S-ORG 181 | at O 182 | Headingley S-LOC 183 | . O 184 | 185 | Hussain S-PER 186 | , O 187 | considered O 188 | surplus O 189 | to O 190 | England S-LOC 191 | 's O 192 | one-day O 193 | requirements O 194 | , O 195 | struck O 196 | 158 O 197 | , O 198 | his O 199 | first O 200 | championship O 201 | century O 202 | of O 203 | the O 204 | season O 205 | , O 206 | as O 207 | Essex S-ORG 208 | reached O 209 | 372 O 210 | and O 211 | took O 212 | a O 213 | first O 214 | innings O 215 | lead O 216 | of O 217 | 82 O 218 | . 
O 219 | 220 | By O 221 | the O 222 | close O 223 | Yorkshire S-ORG 224 | had O 225 | turned O 226 | that O 227 | into O 228 | a O 229 | 37-run O 230 | advantage O 231 | but O 232 | off-spinner O 233 | Such S-PER 234 | had O 235 | scuttled O 236 | their O 237 | hopes O 238 | , O 239 | taking O 240 | four O 241 | for O 242 | 24 O 243 | in O 244 | 48 O 245 | balls O 246 | and O 247 | leaving O 248 | them O 249 | hanging O 250 | on O 251 | 119 O 252 | for O 253 | five O 254 | and O 255 | praying O 256 | for O 257 | rain O 258 | . O 259 | 260 | At O 261 | the O 262 | Oval S-LOC 263 | , O 264 | Surrey S-ORG 265 | captain O 266 | Chris B-PER 267 | Lewis E-PER 268 | , O 269 | another O 270 | man O 271 | dumped O 272 | by O 273 | England S-LOC 274 | , O 275 | continued O 276 | to O 277 | silence O 278 | his O 279 | critics O 280 | as O 281 | he O 282 | followed O 283 | his O 284 | four O 285 | for O 286 | 45 O 287 | on O 288 | Thursday O 289 | with O 290 | 80 O 291 | not O 292 | out O 293 | on O 294 | Friday O 295 | in O 296 | the O 297 | match O 298 | against O 299 | Warwickshire S-ORG 300 | . O 301 | 302 | He O 303 | was O 304 | well O 305 | backed O 306 | by O 307 | England S-LOC 308 | hopeful O 309 | Mark B-PER 310 | Butcher E-PER 311 | who O 312 | made O 313 | 70 O 314 | as O 315 | Surrey S-ORG 316 | closed O 317 | on O 318 | 429 O 319 | for O 320 | seven O 321 | , O 322 | a O 323 | lead O 324 | of O 325 | 234 O 326 | . O 327 | 328 | Derbyshire S-ORG 329 | kept O 330 | up O 331 | the O 332 | hunt O 333 | for O 334 | their O 335 | first O 336 | championship O 337 | title O 338 | since O 339 | 1936 O 340 | by O 341 | reducing O 342 | Worcestershire S-ORG 343 | to O 344 | 133 O 345 | for O 346 | five O 347 | in O 348 | their O 349 | second O 350 | innings O 351 | , O 352 | still O 353 | 100 O 354 | runs O 355 | away O 356 | from O 357 | avoiding O 358 | an O 359 | innings O 360 | defeat O 361 | . O 362 | 363 | Australian S-MISC 364 | Tom B-PER 365 | Moody E-PER 366 | took O 367 | six O 368 | for O 369 | 82 O 370 | but O 371 | Chris B-PER 372 | Adams E-PER 373 | , O 374 | 123 O 375 | , O 376 | and O 377 | Tim B-PER 378 | O'Gorman E-PER 379 | , O 380 | 109 O 381 | , O 382 | took O 383 | Derbyshire S-ORG 384 | to O 385 | 471 O 386 | and O 387 | a O 388 | first O 389 | innings O 390 | lead O 391 | of O 392 | 233 O 393 | . O 394 | 395 | After O 396 | the O 397 | frustration O 398 | of O 399 | seeing O 400 | the O 401 | opening O 402 | day O 403 | of O 404 | their O 405 | match O 406 | badly O 407 | affected O 408 | by O 409 | the O 410 | weather O 411 | , O 412 | Kent S-ORG 413 | stepped O 414 | up O 415 | a O 416 | gear O 417 | to O 418 | dismiss O 419 | Nottinghamshire S-ORG 420 | for O 421 | 214 O 422 | . O 423 | 424 | They O 425 | were O 426 | held O 427 | up O 428 | by O 429 | a O 430 | gritty O 431 | 84 O 432 | from O 433 | Paul B-PER 434 | Johnson E-PER 435 | but O 436 | ex-England S-MISC 437 | fast O 438 | bowler O 439 | Martin B-PER 440 | McCague E-PER 441 | took O 442 | four O 443 | for O 444 | 55 O 445 | . O 446 | 447 | By O 448 | stumps O 449 | Kent S-ORG 450 | had O 451 | reached O 452 | 108 O 453 | for O 454 | three O 455 | . O 456 | 457 | -DOCSTART- O 458 | 459 | CRICKET O 460 | - O 461 | ENGLISH B-MISC 462 | COUNTY I-MISC 463 | CHAMPIONSHIP E-MISC 464 | SCORES O 465 | . 
O 466 | 467 | LONDON S-LOC 468 | 1996-08-30 O 469 | 470 | Result O 471 | and O 472 | close O 473 | of O 474 | play O 475 | scores O 476 | in O 477 | English S-MISC 478 | county O 479 | championship O 480 | matches O 481 | on O 482 | Friday O 483 | : O 484 | 485 | Leicester S-LOC 486 | : O 487 | Leicestershire S-ORG 488 | beat O 489 | Somerset S-ORG 490 | by O 491 | an O 492 | innings O 493 | and O 494 | 39 O 495 | runs O 496 | . O 497 | 498 | Somerset S-ORG 499 | 83 O 500 | and O 501 | 174 O 502 | ( O 503 | P. B-PER 504 | Simmons E-PER 505 | 4-38 O 506 | ) O 507 | , O 508 | Leicestershire S-ORG 509 | 296 O 510 | . O 511 | 512 | Leicestershire S-ORG 513 | 22 O 514 | points O 515 | , O 516 | Somerset S-ORG 517 | 4 O 518 | . O 519 | 520 | Chester-le-Street S-LOC 521 | : O 522 | Glamorgan S-ORG 523 | 259 O 524 | and O 525 | 207 O 526 | ( O 527 | A. B-PER 528 | Dale E-PER 529 | 69 O 530 | , O 531 | H. B-PER 532 | Morris E-PER 533 | 69 O 534 | ; O 535 | D. B-PER 536 | Blenkiron E-PER 537 | 4-43 O 538 | ) O 539 | , O 540 | Durham S-ORG 541 | 114 O 542 | ( O 543 | S. B-PER 544 | Watkin E-PER 545 | 4-28 O 546 | ) O 547 | and O 548 | 81-3 O 549 | . O 550 | 551 | Tunbridge B-LOC 552 | Wells E-LOC 553 | : O 554 | Nottinghamshire S-ORG 555 | 214 O 556 | ( O 557 | P. B-PER 558 | Johnson E-PER 559 | 84 O 560 | ; O 561 | M. B-PER 562 | McCague E-PER 563 | 4-55 O 564 | ) O 565 | , O 566 | Kent S-ORG 567 | 108-3 O 568 | . O 569 | 570 | London S-LOC 571 | ( O 572 | The B-LOC 573 | Oval E-LOC 574 | ) O 575 | : O 576 | Warwickshire S-ORG 577 | 195 O 578 | , O 579 | Surrey S-ORG 580 | 429-7 O 581 | ( O 582 | C. B-PER 583 | Lewis E-PER 584 | 80 O 585 | not O 586 | out O 587 | , O 588 | M. B-PER 589 | Butcher E-PER 590 | 70 O 591 | , O 592 | G. B-PER 593 | Kersey E-PER 594 | 63 O 595 | , O 596 | J. B-PER 597 | Ratcliffe E-PER 598 | 63 O 599 | , O 600 | D. B-PER 601 | Bicknell E-PER 602 | 55 O 603 | ) O 604 | . O 605 | 606 | Hove S-LOC 607 | : O 608 | Sussex S-ORG 609 | 363 O 610 | ( O 611 | W. B-PER 612 | Athey E-PER 613 | 111 O 614 | , O 615 | V. B-PER 616 | Drakes E-PER 617 | 52 O 618 | ; O 619 | I. B-PER 620 | Austin E-PER 621 | 4-37 O 622 | ) O 623 | , O 624 | Lancashire S-ORG 625 | 197-8 O 626 | ( O 627 | W. B-PER 628 | Hegg E-PER 629 | 54 O 630 | ) O 631 | 632 | Portsmouth S-LOC 633 | : O 634 | Middlesex S-ORG 635 | 199 O 636 | and O 637 | 426 O 638 | ( O 639 | J. B-PER 640 | Pooley E-PER 641 | 111 O 642 | , O 643 | M. B-PER 644 | Ramprakash E-PER 645 | 108 O 646 | , O 647 | M. B-PER 648 | Gatting E-PER 649 | 83 O 650 | ) O 651 | , O 652 | Hampshire S-ORG 653 | 232 O 654 | and O 655 | 109-5 O 656 | . O 657 | 658 | Chesterfield S-LOC 659 | : O 660 | Worcestershire S-ORG 661 | 238 O 662 | and O 663 | 133-5 O 664 | , O 665 | Derbyshire S-ORG 666 | 471 O 667 | ( O 668 | J. B-PER 669 | Adams E-PER 670 | 123 O 671 | , O 672 | T.O'Gorman S-PER 673 | 109 O 674 | not O 675 | out O 676 | , O 677 | K. B-PER 678 | Barnett E-PER 679 | 87 O 680 | ; O 681 | T. B-PER 682 | Moody E-PER 683 | 6-82 O 684 | ) O 685 | 686 | Bristol S-LOC 687 | : O 688 | Gloucestershire S-ORG 689 | 183 O 690 | and O 691 | 185-6 O 692 | ( O 693 | J. B-PER 694 | Russell E-PER 695 | 56 O 696 | not O 697 | out O 698 | ) O 699 | , O 700 | Northamptonshire S-ORG 701 | 190 O 702 | ( O 703 | K. B-PER 704 | Curran E-PER 705 | 52 O 706 | ; O 707 | A. B-PER 708 | Smith E-PER 709 | 5-68 O 710 | ) O 711 | . O 712 | 713 | -DOCSTART- O 714 | 715 | CRICKET O 716 | - O 717 | 1997 O 718 | ASHES S-MISC 719 | INTINERARY O 720 | . 
O 721 | 722 | LONDON S-LOC 723 | 1996-08-30 O 724 | 725 | Australia S-LOC 726 | will O 727 | defend O 728 | the O 729 | Ashes S-MISC 730 | in O 731 | 732 | a O 733 | six-test O 734 | series O 735 | against O 736 | England S-LOC 737 | during O 738 | a O 739 | four-month O 740 | tour O 741 | 742 | starting O 743 | on O 744 | May O 745 | 13 O 746 | next O 747 | year O 748 | , O 749 | the O 750 | Test B-ORG 751 | and I-ORG 752 | County I-ORG 753 | Cricket I-ORG 754 | Board E-ORG 755 | 756 | said O 757 | on O 758 | Friday O 759 | . O 760 | 761 | Australia S-LOC 762 | will O 763 | also O 764 | play O 765 | three O 766 | one-day O 767 | internationals O 768 | and O 769 | 770 | four O 771 | one-day O 772 | warm-up O 773 | matches O 774 | at O 775 | the O 776 | start O 777 | of O 778 | the O 779 | tour O 780 | . O 781 | 782 | The O 783 | tourists O 784 | will O 785 | play O 786 | nine O 787 | first-class O 788 | matches O 789 | against O 790 | 791 | English S-MISC 792 | county O 793 | sides O 794 | and O 795 | another O 796 | against O 797 | British B-ORG 798 | Universities E-ORG 799 | , O 800 | 801 | as O 802 | well O 803 | as O 804 | one-day O 805 | matches O 806 | against O 807 | the O 808 | Minor B-ORG 809 | Counties E-ORG 810 | and O 811 | 812 | Scotland S-LOC 813 | . O 814 | 815 | Tour O 816 | itinerary O 817 | : O 818 | 819 | May O 820 | 821 | May O 822 | 13 O 823 | Arrive O 824 | in O 825 | London S-LOC 826 | 827 | May O 828 | 14 O 829 | Practice O 830 | at O 831 | Lord B-LOC 832 | 's E-LOC 833 | 834 | May O 835 | 15 O 836 | v O 837 | Duke B-ORG 838 | of I-ORG 839 | Norfolk I-ORG 840 | 's I-ORG 841 | XI E-ORG 842 | ( O 843 | at O 844 | Arundel S-LOC 845 | ) O 846 | 847 | May O 848 | 17 O 849 | v O 850 | Northampton S-ORG 851 | 852 | May O 853 | 18 O 854 | v O 855 | Worcestershire S-ORG 856 | 857 | May O 858 | 20 O 859 | v O 860 | Durham S-ORG 861 | 862 | May O 863 | 22 O 864 | First O 865 | one-day O 866 | international O 867 | ( O 868 | at O 869 | Headingley S-LOC 870 | , O 871 | 872 | Leeds S-ORG 873 | ) O 874 | 875 | May O 876 | 24 O 877 | Second O 878 | one-day O 879 | international O 880 | ( O 881 | at O 882 | The B-LOC 883 | Oval E-LOC 884 | , O 885 | 886 | London S-LOC 887 | ) O 888 | 889 | May O 890 | 25 O 891 | Third O 892 | one-day O 893 | international O 894 | ( O 895 | at O 896 | Lord B-LOC 897 | 's E-LOC 898 | , O 899 | London S-LOC 900 | ) O 901 | 902 | May O 903 | 27-29 O 904 | v O 905 | Gloucestershire S-ORG 906 | or O 907 | Sussex S-ORG 908 | or O 909 | Surrey S-ORG 910 | ( O 911 | three O 912 | 913 | days O 914 | ) O 915 | 916 | May O 917 | 31 O 918 | - O 919 | June O 920 | 2 O 921 | v O 922 | Derbyshire S-ORG 923 | ( O 924 | three O 925 | days O 926 | ) O 927 | 928 | June O 929 | 930 | June O 931 | 5-9 O 932 | First O 933 | test O 934 | match O 935 | ( O 936 | at O 937 | Edgbaston S-LOC 938 | , O 939 | Birmingham S-LOC 940 | ) O 941 | 942 | June O 943 | 11-13 O 944 | v O 945 | a O 946 | first O 947 | class O 948 | county O 949 | ( O 950 | to O 951 | be O 952 | confirmed O 953 | ) O 954 | 955 | June O 956 | 14-16 O 957 | v O 958 | Leicestershire S-ORG 959 | ( O 960 | three O 961 | days O 962 | ) O 963 | 964 | June O 965 | 19-23 O 966 | Second O 967 | test O 968 | ( O 969 | at O 970 | Lord B-LOC 971 | 's E-LOC 972 | ) O 973 | 974 | June O 975 | 25-27 O 976 | v O 977 | British B-ORG 978 | Universities E-ORG 979 | ( O 980 | at O 981 | Oxford S-LOC 982 | , O 983 | three O 984 | days O 985 | ) O 986 | 987 | June O 988 | 28-30 O 989 | v O 990 | Hampshire S-ORG 991 | ( O 992 | 
three O 993 | days O 994 | ) O 995 | 996 | July O 997 | 998 | July O 999 | 3-7 O 1000 | Third O 1001 | test O 1002 | ( O 1003 | at O 1004 | Old B-LOC 1005 | Trafford E-LOC 1006 | , O 1007 | Manchester S-LOC 1008 | ) O 1009 | 1010 | July O 1011 | 9 O 1012 | v O 1013 | Minor B-ORG 1014 | Counties I-ORG 1015 | XI E-ORG 1016 | 1017 | July O 1018 | 12 O 1019 | v O 1020 | Scotland S-LOC 1021 | 1022 | July O 1023 | 16-18 O 1024 | v O 1025 | Glamorgan S-ORG 1026 | ( O 1027 | three O 1028 | days O 1029 | ) O 1030 | 1031 | July O 1032 | 19-21 O 1033 | v O 1034 | Middlesex S-ORG 1035 | ( O 1036 | three O 1037 | days O 1038 | ) O 1039 | 1040 | July O 1041 | 24-28 O 1042 | Fourth O 1043 | test O 1044 | ( O 1045 | at O 1046 | Headingley S-LOC 1047 | ) O 1048 | 1049 | August O 1050 | 1051 | August O 1052 | 1-4 O 1053 | v O 1054 | Somerset S-ORG 1055 | ( O 1056 | four O 1057 | days O 1058 | ) O 1059 | 1060 | August O 1061 | 7-11 O 1062 | Fifth O 1063 | test O 1064 | ( O 1065 | at O 1066 | Trent B-LOC 1067 | Bridge E-LOC 1068 | , O 1069 | Nottingham S-LOC 1070 | ) O 1071 | 1072 | August O 1073 | 16-18 O 1074 | v O 1075 | Kent S-ORG 1076 | ( O 1077 | three O 1078 | days O 1079 | ) O 1080 | 1081 | August O 1082 | 21-25 O 1083 | Sixth O 1084 | test O 1085 | ( O 1086 | at O 1087 | The B-LOC 1088 | Oval E-LOC 1089 | , O 1090 | London S-LOC 1091 | ) O 1092 | . O 1093 | 1094 | -DOCSTART- O 1095 | 1096 | SOCCER O 1097 | - O 1098 | SHEARER S-PER 1099 | NAMED O 1100 | AS O 1101 | ENGLAND S-LOC 1102 | CAPTAIN O 1103 | . O 1104 | 1105 | LONDON S-LOC 1106 | 1996-08-30 O 1107 | 1108 | The O 1109 | world O 1110 | 's O 1111 | costliest O 1112 | footballer O 1113 | Alan B-PER 1114 | Shearer E-PER 1115 | was O 1116 | named O 1117 | as O 1118 | the O 1119 | new O 1120 | England S-LOC 1121 | captain O 1122 | on O 1123 | Friday O 1124 | . O 1125 | 1126 | The O 1127 | 26-year-old O 1128 | , O 1129 | who O 1130 | joined O 1131 | Newcastle S-ORG 1132 | for O 1133 | 15 O 1134 | million O 1135 | pounds O 1136 | sterling O 1137 | ( O 1138 | $ O 1139 | 23.4 O 1140 | million O 1141 | ) O 1142 | , O 1143 | takes O 1144 | over O 1145 | from O 1146 | Tony B-PER 1147 | Adams E-PER 1148 | , O 1149 | who O 1150 | led O 1151 | the O 1152 | side O 1153 | during O 1154 | the O 1155 | European S-MISC 1156 | championship O 1157 | in O 1158 | June O 1159 | , O 1160 | and O 1161 | former O 1162 | captain O 1163 | David B-PER 1164 | Platt E-PER 1165 | . O 1166 | 1167 | Adams S-PER 1168 | and O 1169 | Platt S-PER 1170 | are O 1171 | both O 1172 | injured O 1173 | and O 1174 | will O 1175 | miss O 1176 | England S-LOC 1177 | 's O 1178 | opening O 1179 | World B-MISC 1180 | Cup E-MISC 1181 | qualifier O 1182 | against O 1183 | Moldova S-LOC 1184 | on O 1185 | Sunday O 1186 | . O 1187 | 1188 | Shearer S-PER 1189 | takes O 1190 | the O 1191 | captaincy O 1192 | on O 1193 | a O 1194 | trial O 1195 | basis O 1196 | , O 1197 | but O 1198 | new O 1199 | coach O 1200 | Glenn B-PER 1201 | Hoddle E-PER 1202 | said O 1203 | he O 1204 | saw O 1205 | no O 1206 | reason O 1207 | why O 1208 | the O 1209 | former O 1210 | Blackburn S-ORG 1211 | and O 1212 | Southampton S-ORG 1213 | skipper O 1214 | should O 1215 | not O 1216 | make O 1217 | the O 1218 | post O 1219 | his O 1220 | own O 1221 | . 
O 1222 | 1223 | " O 1224 | I O 1225 | 'm O 1226 | sure O 1227 | there O 1228 | wo O 1229 | n't O 1230 | be O 1231 | a O 1232 | problem O 1233 | , O 1234 | I O 1235 | 'm O 1236 | sure O 1237 | Alan S-PER 1238 | is O 1239 | the O 1240 | man O 1241 | for O 1242 | the O 1243 | job O 1244 | , O 1245 | " O 1246 | Hoddle S-PER 1247 | said O 1248 | . O 1249 | 1250 | " O 1251 | There O 1252 | were O 1253 | three O 1254 | or O 1255 | four O 1256 | people O 1257 | who O 1258 | could O 1259 | have O 1260 | done O 1261 | it O 1262 | but O 1263 | when O 1264 | I O 1265 | spoke O 1266 | to O 1267 | Alan S-PER 1268 | he O 1269 | was O 1270 | up O 1271 | for O 1272 | it O 1273 | and O 1274 | really O 1275 | wanted O 1276 | it O 1277 | . O 1278 | 1279 | " O 1280 | In O 1281 | four O 1282 | days O 1283 | it O 1284 | 's O 1285 | very O 1286 | difficult O 1287 | to O 1288 | come O 1289 | to O 1290 | a O 1291 | 100 O 1292 | percent O 1293 | conclusion O 1294 | about O 1295 | something O 1296 | like O 1297 | this O 1298 | ... O 1299 | 1300 | but O 1301 | he O 1302 | knows O 1303 | how O 1304 | to O 1305 | conduct O 1306 | himself O 1307 | , O 1308 | his O 1309 | team O 1310 | mates O 1311 | respect O 1312 | him O 1313 | and O 1314 | he O 1315 | knows O 1316 | about O 1317 | the O 1318 | team O 1319 | situation O 1320 | even O 1321 | though O 1322 | he O 1323 | plays O 1324 | up O 1325 | front O 1326 | . O 1327 | " O 1328 | 1329 | Shearer S-PER 1330 | 's O 1331 | Euro B-MISC 1332 | 96 E-MISC 1333 | striking O 1334 | partner O 1335 | Teddy B-PER 1336 | Sheringham E-PER 1337 | withdrew O 1338 | from O 1339 | the O 1340 | squad O 1341 | with O 1342 | an O 1343 | injury O 1344 | on O 1345 | Friday O 1346 | . O 1347 | 1348 | He O 1349 | will O 1350 | probably O 1351 | be O 1352 | replaced O 1353 | by O 1354 | Shearer S-PER 1355 | 's O 1356 | Newcastle S-ORG 1357 | team O 1358 | mate O 1359 | Les B-PER 1360 | Ferdinand E-PER 1361 | . O 1362 | 1363 | -DOCSTART- O 1364 | 1365 | BASKETBALL O 1366 | - O 1367 | INTERNATIONAL O 1368 | TOURNAMENT O 1369 | RESULT O 1370 | . O 1371 | 1372 | BELGRADE S-LOC 1373 | 1996-08-30 O 1374 | 1375 | Result O 1376 | in O 1377 | an O 1378 | international O 1379 | 1380 | basketball O 1381 | tournament O 1382 | on O 1383 | Friday O 1384 | : O 1385 | 1386 | Red B-ORG 1387 | Star E-ORG 1388 | ( O 1389 | Yugoslavia S-LOC 1390 | ) O 1391 | beat O 1392 | Dinamo S-ORG 1393 | ( O 1394 | Russia S-LOC 1395 | ) O 1396 | 92-90 O 1397 | ( O 1398 | halftime O 1399 | 1400 | 47-47 O 1401 | ) O 1402 | 1403 | -DOCSTART- O 1404 | 1405 | SOCCER O 1406 | - O 1407 | ROMANIA S-LOC 1408 | BEAT O 1409 | LITHUANIA S-LOC 1410 | IN O 1411 | UNDER-21 O 1412 | MATCH O 1413 | . O 1414 | 1415 | BUCHAREST S-LOC 1416 | 1996-08-30 O 1417 | 1418 | Romania S-LOC 1419 | beat O 1420 | Lithuania S-LOC 1421 | 2-1 O 1422 | ( O 1423 | halftime O 1424 | 1-1 O 1425 | ) O 1426 | in O 1427 | their O 1428 | European S-MISC 1429 | under-21 O 1430 | soccer O 1431 | match O 1432 | on O 1433 | Friday O 1434 | . 
O 1435 | 1436 | Scorers O 1437 | : O 1438 | 1439 | Romania S-LOC 1440 | - O 1441 | Cosmin B-PER 1442 | Contra E-PER 1443 | ( O 1444 | 31st O 1445 | ) O 1446 | , O 1447 | Mihai B-PER 1448 | Tararache E-PER 1449 | ( O 1450 | 75th O 1451 | ) O 1452 | 1453 | Lithuania S-LOC 1454 | - O 1455 | Danius B-PER 1456 | Gleveckas E-PER 1457 | ( O 1458 | 13rd O 1459 | ) O 1460 | 1461 | Attendance O 1462 | : O 1463 | 200 O 1464 | 1465 | -DOCSTART- O 1466 | 1467 | SOCCER O 1468 | - O 1469 | ROTOR S-ORG 1470 | FANS O 1471 | LOCKED O 1472 | OUT O 1473 | AFTER O 1474 | VOLGOGRAD S-LOC 1475 | VIOLENCE O 1476 | . O 1477 | 1478 | MOSCOW S-LOC 1479 | 1996-08-30 O 1480 | 1481 | Rotor B-ORG 1482 | Volgograd E-ORG 1483 | must O 1484 | play O 1485 | their O 1486 | next O 1487 | home O 1488 | game O 1489 | behind O 1490 | closed O 1491 | doors O 1492 | after O 1493 | fans O 1494 | hurled O 1495 | bottles O 1496 | and O 1497 | stones O 1498 | at O 1499 | Dynamo B-ORG 1500 | Moscow E-ORG 1501 | players O 1502 | during O 1503 | a O 1504 | 1-0 O 1505 | home O 1506 | defeat O 1507 | on O 1508 | Saturday O 1509 | that O 1510 | ended O 1511 | Rotor S-ORG 1512 | 's O 1513 | brief O 1514 | spell O 1515 | as O 1516 | league O 1517 | leaders O 1518 | . O 1519 | 1520 | The O 1521 | head O 1522 | of O 1523 | the O 1524 | Russian S-MISC 1525 | league O 1526 | 's O 1527 | disciplinary O 1528 | committee O 1529 | , O 1530 | Anatoly B-PER 1531 | Gorokhovsky E-PER 1532 | , O 1533 | said O 1534 | on O 1535 | Friday O 1536 | that O 1537 | Rotor S-ORG 1538 | would O 1539 | play O 1540 | Lada B-ORG 1541 | Togliatti E-ORG 1542 | to O 1543 | empty O 1544 | stands O 1545 | on O 1546 | September O 1547 | 3 O 1548 | . O 1549 | 1550 | The O 1551 | club O 1552 | , O 1553 | who O 1554 | put O 1555 | Manchester B-ORG 1556 | United E-ORG 1557 | out O 1558 | of O 1559 | last O 1560 | year O 1561 | 's O 1562 | UEFA B-MISC 1563 | Cup E-MISC 1564 | , O 1565 | were O 1566 | fined O 1567 | $ O 1568 | 1,000 O 1569 | . O -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | -------------------------------------------------------------------------------- /utils/alphabet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Max 3 | # @Date: 2018-01-19 11:33:37 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-04-26 13:56:03 6 | 7 | 8 | """ 9 | Alphabet maps objects to integer ids. It provides two way mapping from the index to the objects. 10 | """ 11 | from __future__ import print_function 12 | import json 13 | import os 14 | import sys 15 | 16 | 17 | class Alphabet: 18 | def __init__(self, name, label=False, keep_growing=True): 19 | self.name = name 20 | self.UNKNOWN = "" 21 | self.label = label 22 | self.instance2index = {} 23 | self.instances = [] 24 | self.keep_growing = keep_growing 25 | 26 | # Index 0 is occupied by default, all else following. 27 | self.default_index = 0 28 | self.next_index = 1 29 | if not self.label: 30 | self.add(self.UNKNOWN) 31 | 32 | def clear(self, keep_growing=True): 33 | self.instance2index = {} 34 | self.instances = [] 35 | self.keep_growing = keep_growing 36 | 37 | # Index 0 is occupied by default, all else following. 
38 | self.default_index = 0 39 | self.next_index = 1 40 | 41 | def add(self, instance): 42 | if instance not in self.instance2index: 43 | self.instances.append(instance) 44 | self.instance2index[instance] = self.next_index 45 | self.next_index += 1 46 | 47 | def get_index(self, instance): 48 | try: 49 | return self.instance2index[instance] 50 | except KeyError: 51 | if self.keep_growing: 52 | index = self.next_index 53 | self.add(instance) 54 | return index 55 | else: 56 | return self.instance2index[self.UNKNOWN] 57 | 58 | def get_instance(self, index): 59 | if index == 0: 60 | if self.label: 61 | return self.instances[0] 62 | # First index is occupied by the wildcard element. 63 | return None 64 | try: 65 | return self.instances[index - 1] 66 | except IndexError: 67 | print('WARNING:Alphabet get_instance ,unknown instance, return the first label.') 68 | return self.instances[0] 69 | 70 | def size(self): 71 | # if self.label: 72 | # return len(self.instances) 73 | # else: 74 | return len(self.instances) + 1 75 | 76 | def iteritems(self): 77 | if sys.version_info[0] < 3: # If using python3, dict item access uses different syntax 78 | return self.instance2index.iteritems() 79 | else: 80 | return self.instance2index.items() 81 | 82 | def enumerate_items(self, start=1): 83 | if start < 1 or start >= self.size(): 84 | raise IndexError("Enumerate is allowed between [1 : size of the alphabet)") 85 | return zip(range(start, len(self.instances) + 1), self.instances[start - 1:]) 86 | 87 | def close(self): 88 | self.keep_growing = False 89 | 90 | def open(self): 91 | self.keep_growing = True 92 | 93 | def get_content(self): 94 | return {'instance2index': self.instance2index, 'instances': self.instances} 95 | 96 | def from_json(self, data): 97 | self.instances = data["instances"] 98 | self.instance2index = data["instance2index"] 99 | 100 | def save(self, output_directory, name=None): 101 | """ 102 | Save both alhpabet records to the given directory. 103 | :param output_directory: Directory to save model and weights. 104 | :param name: The alphabet saving name, optional. 105 | :return: 106 | """ 107 | saving_name = name if name else self.__name 108 | try: 109 | json.dump(self.get_content(), open(os.path.join(output_directory, saving_name + ".json"), 'w')) 110 | except Exception as e: 111 | print("Exception: Alphabet is not saved: " % repr(e)) 112 | 113 | def load(self, input_directory, name=None): 114 | """ 115 | Load model architecture and weights from the give directory. This allow we use old models even the structure 116 | changes. 
117 | :param input_directory: Directory to save model and weights 118 | :return: 119 | """ 120 | loading_name = name if name else self.__name 121 | self.from_json(json.load(open(os.path.join(input_directory, loading_name + ".json")))) 122 | -------------------------------------------------------------------------------- /utils/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-06-14 17:34:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-01-25 20:25:59 6 | from __future__ import print_function 7 | from __future__ import absolute_import 8 | import sys 9 | from .alphabet import Alphabet 10 | from .functions import * 11 | 12 | try: 13 | import cPickle as pickle 14 | except ImportError: 15 | import pickle as pickle 16 | 17 | 18 | START = "" 19 | UNKNOWN = "" 20 | PADDING = "" 21 | 22 | class Data: 23 | def __init__(self): 24 | self.sentence_classification = False 25 | self.MAX_SENTENCE_LENGTH = 250 26 | self.MAX_WORD_LENGTH = -1 27 | self.number_normalized = True 28 | self.norm_word_emb = False 29 | self.norm_char_emb = False 30 | self.word_alphabet = Alphabet('word') 31 | self.char_alphabet = Alphabet('character') 32 | 33 | self.feature_name = [] 34 | self.feature_alphabets = [] 35 | self.feature_num = len(self.feature_alphabets) 36 | self.feat_config = None 37 | 38 | 39 | self.label_alphabet = Alphabet('label',True) 40 | self.tagScheme = "NoSeg" ## BMES/BIO 41 | self.split_token = ' ||| ' 42 | self.seg = True 43 | 44 | ### I/O 45 | self.train_dir = None 46 | self.dev_dir = None 47 | self.test_dir = None 48 | self.raw_dir = None 49 | 50 | self.decode_dir = None 51 | self.dset_dir = None ## data vocabulary related file 52 | self.model_dir = None ## model save file 53 | self.load_model_dir = None ## model load file 54 | 55 | self.word_emb_dir = None 56 | self.char_emb_dir = None 57 | self.feature_emb_dirs = [] 58 | 59 | self.train_texts = [] 60 | self.dev_texts = [] 61 | self.test_texts = [] 62 | self.raw_texts = [] 63 | 64 | self.train_Ids = [] 65 | self.dev_Ids = [] 66 | self.test_Ids = [] 67 | self.raw_Ids = [] 68 | 69 | self.pretrain_word_embedding = None 70 | self.pretrain_char_embedding = None 71 | self.pretrain_feature_embeddings = [] 72 | 73 | self.label_size = 0 74 | self.word_alphabet_size = 0 75 | self.char_alphabet_size = 0 76 | self.label_alphabet_size = 0 77 | self.feature_alphabet_sizes = [] 78 | self.feature_emb_dims = [] 79 | self.norm_feature_embs = [] 80 | self.word_emb_dim = 50 81 | self.char_emb_dim = 30 82 | 83 | ###Networks 84 | self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/ 85 | self.use_char = True 86 | self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None 87 | self.use_crf = True 88 | self.nbest = None 89 | 90 | ## Training 91 | self.average_batch_loss = False 92 | self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam" 93 | self.status = "train" 94 | ### Hyperparameters 95 | self.HP_cnn_layer = 4 96 | self.HP_iteration = 100 97 | self.HP_batch_size = 10 98 | self.HP_char_hidden_dim = 50 99 | self.HP_hidden_dim = 200 100 | self.HP_dropout = 0.5 101 | self.HP_lstm_layer = 1 102 | self.HP_bilstm = True 103 | 104 | self.HP_gpu = False 105 | self.HP_lr = 0.015 106 | self.HP_lr_decay = 0.05 107 | self.HP_clip = None 108 | self.HP_momentum = 0 109 | self.HP_l2 = 1e-8 110 | 111 | def show_data_summary(self): 112 | 113 | print("++"*50) 114 | print("DATA SUMMARY START:") 115 | print(" I/O:") 
116 | if self.sentence_classification: 117 | print(" Start Sentence Classification task...") 118 | else: 119 | print(" Start Sequence Laebling task...") 120 | print(" Tag scheme: %s"%(self.tagScheme)) 121 | print(" Split token: %s"%(self.split_token)) 122 | print(" MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH)) 123 | print(" MAX WORD LENGTH: %s"%(self.MAX_WORD_LENGTH)) 124 | print(" Number normalized: %s"%(self.number_normalized)) 125 | print(" Word alphabet size: %s"%(self.word_alphabet_size)) 126 | print(" Char alphabet size: %s"%(self.char_alphabet_size)) 127 | print(" Label alphabet size: %s"%(self.label_alphabet_size)) 128 | print(" Word embedding dir: %s"%(self.word_emb_dir)) 129 | print(" Char embedding dir: %s"%(self.char_emb_dir)) 130 | print(" Word embedding size: %s"%(self.word_emb_dim)) 131 | print(" Char embedding size: %s"%(self.char_emb_dim)) 132 | print(" Norm word emb: %s"%(self.norm_word_emb)) 133 | print(" Norm char emb: %s"%(self.norm_char_emb)) 134 | print(" Train file directory: %s"%(self.train_dir)) 135 | print(" Dev file directory: %s"%(self.dev_dir)) 136 | print(" Test file directory: %s"%(self.test_dir)) 137 | print(" Raw file directory: %s"%(self.raw_dir)) 138 | print(" Dset file directory: %s"%(self.dset_dir)) 139 | print(" Model file directory: %s"%(self.model_dir)) 140 | print(" Loadmodel directory: %s"%(self.load_model_dir)) 141 | print(" Decode file directory: %s"%(self.decode_dir)) 142 | print(" Train instance number: %s"%(len(self.train_texts))) 143 | print(" Dev instance number: %s"%(len(self.dev_texts))) 144 | print(" Test instance number: %s"%(len(self.test_texts))) 145 | print(" Raw instance number: %s"%(len(self.raw_texts))) 146 | print(" FEATURE num: %s"%(self.feature_num)) 147 | for idx in range(self.feature_num): 148 | print(" Fe: %s alphabet size: %s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) 149 | print(" Fe: %s embedding dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) 150 | print(" Fe: %s embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) 151 | print(" Fe: %s norm emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) 152 | print(" "+"++"*20) 153 | print(" Model Network:") 154 | print(" Model use_crf: %s"%(self.use_crf)) 155 | print(" Model word extractor: %s"%(self.word_feature_extractor)) 156 | print(" Model use_char: %s"%(self.use_char)) 157 | if self.use_char: 158 | print(" Model char extractor: %s"%(self.char_feature_extractor)) 159 | print(" Model char_hidden_dim: %s"%(self.HP_char_hidden_dim)) 160 | print(" "+"++"*20) 161 | print(" Training:") 162 | print(" Optimizer: %s"%(self.optimizer)) 163 | print(" Iteration: %s"%(self.HP_iteration)) 164 | print(" BatchSize: %s"%(self.HP_batch_size)) 165 | print(" Average batch loss: %s"%(self.average_batch_loss)) 166 | 167 | print(" "+"++"*20) 168 | print(" Hyperparameters:") 169 | 170 | print(" Hyper lr: %s"%(self.HP_lr)) 171 | print(" Hyper lr_decay: %s"%(self.HP_lr_decay)) 172 | print(" Hyper HP_clip: %s"%(self.HP_clip)) 173 | print(" Hyper momentum: %s"%(self.HP_momentum)) 174 | print(" Hyper l2: %s"%(self.HP_l2)) 175 | print(" Hyper hidden_dim: %s"%(self.HP_hidden_dim)) 176 | print(" Hyper dropout: %s"%(self.HP_dropout)) 177 | print(" Hyper lstm_layer: %s"%(self.HP_lstm_layer)) 178 | print(" Hyper bilstm: %s"%(self.HP_bilstm)) 179 | print(" Hyper GPU: %s"%(self.HP_gpu)) 180 | print("DATA SUMMARY END.") 181 | print("++"*50) 182 | sys.stdout.flush() 183 | 184 | 185 | def 
initial_feature_alphabets(self): 186 | if self.sentence_classification: 187 | ## if sentence classification data format, splited by '\t' 188 | items = open(self.train_dir,'r').readline().strip('\n').split('\t') 189 | else: 190 | ## if sequence labeling data format i.e. CoNLL 2003, split by ' ' 191 | items = open(self.train_dir,'r').readline().strip('\n').split() 192 | total_column = len(items) 193 | if total_column > 2: 194 | for idx in range(1, total_column-1): 195 | feature_prefix = items[idx].split(']',1)[0]+"]" 196 | self.feature_alphabets.append(Alphabet(feature_prefix)) 197 | self.feature_name.append(feature_prefix) 198 | print("Find feature: ", feature_prefix) 199 | self.feature_num = len(self.feature_alphabets) 200 | self.pretrain_feature_embeddings = [None]*self.feature_num 201 | self.feature_emb_dims = [20]*self.feature_num 202 | self.feature_emb_dirs = [None]*self.feature_num 203 | self.norm_feature_embs = [False]*self.feature_num 204 | self.feature_alphabet_sizes = [0]*self.feature_num 205 | if self.feat_config: 206 | for idx in range(self.feature_num): 207 | if self.feature_name[idx] in self.feat_config: 208 | self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size'] 209 | self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir'] 210 | self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm'] 211 | # exit(0) 212 | 213 | 214 | def build_alphabet(self, input_file): 215 | in_lines = open(input_file,'r').readlines() 216 | for line in in_lines: 217 | if len(line) > 2: 218 | ## if sentence classification data format, splited by \t 219 | if self.sentence_classification: 220 | pairs = line.strip().split(self.split_token) 221 | sent = pairs[0] 222 | if sys.version_info[0] < 3: 223 | sent = sent.decode('utf-8') 224 | words = sent.split() 225 | for word in words: 226 | if self.number_normalized: 227 | word = normalize_word(word) 228 | self.word_alphabet.add(word) 229 | for char in word: 230 | self.char_alphabet.add(char) 231 | label = pairs[-1] 232 | self.label_alphabet.add(label) 233 | ## build feature alphabet 234 | for idx in range(self.feature_num): 235 | feat_idx = pairs[idx+1].split(']',1)[-1] 236 | self.feature_alphabets[idx].add(feat_idx) 237 | 238 | ## if sequence labeling data format i.e. 
CoNLL 2003 239 | else: 240 | pairs = line.strip().split() 241 | word = pairs[0] 242 | if sys.version_info[0] < 3: 243 | word = word.decode('utf-8') 244 | if self.number_normalized: 245 | word = normalize_word(word) 246 | label = pairs[-1] 247 | self.label_alphabet.add(label) 248 | self.word_alphabet.add(word) 249 | ## build feature alphabet 250 | for idx in range(self.feature_num): 251 | feat_idx = pairs[idx+1].split(']',1)[-1] 252 | self.feature_alphabets[idx].add(feat_idx) 253 | for char in word: 254 | self.char_alphabet.add(char) 255 | self.word_alphabet_size = self.word_alphabet.size() 256 | self.char_alphabet_size = self.char_alphabet.size() 257 | self.label_alphabet_size = self.label_alphabet.size() 258 | for idx in range(self.feature_num): 259 | self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size() 260 | startS = False 261 | startB = False 262 | for label,_ in self.label_alphabet.iteritems(): 263 | if "S-" in label.upper(): 264 | startS = True 265 | elif "B-" in label.upper(): 266 | startB = True 267 | if startB: 268 | if startS: 269 | self.tagScheme = "BMES" 270 | else: 271 | self.tagScheme = "BIO" 272 | if self.sentence_classification: 273 | self.tagScheme = "Not sequence labeling task" 274 | 275 | 276 | def fix_alphabet(self): 277 | self.word_alphabet.close() 278 | self.char_alphabet.close() 279 | self.label_alphabet.close() 280 | for idx in range(self.feature_num): 281 | self.feature_alphabets[idx].close() 282 | 283 | 284 | def build_pretrain_emb(self): 285 | if self.word_emb_dir: 286 | print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir)) 287 | self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) 288 | if self.char_emb_dir: 289 | print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir)) 290 | self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) 291 | for idx in range(self.feature_num): 292 | if self.feature_emb_dirs[idx]: 293 | print("Load pretrained feature %s embedding:, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) 294 | self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) 295 | 296 | 297 | def generate_instance(self, name): 298 | self.fix_alphabet() 299 | if name == "train": 300 | self.train_texts, self.train_Ids = read_instance(self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.sentence_classification, self.split_token) 301 | elif name == "dev": 302 | self.dev_texts, self.dev_Ids = read_instance(self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.sentence_classification, self.split_token) 303 | elif name == "test": 304 | self.test_texts, self.test_Ids = read_instance(self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.sentence_classification, self.split_token) 305 | elif name == "raw": 306 | self.raw_texts, self.raw_Ids = read_instance(self.raw_dir, 
self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.sentence_classification, self.split_token) 307 | else: 308 | print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name)) 309 | 310 | 311 | def write_decoded_results(self, predict_results, name): 312 | 313 | sent_num = len(predict_results) 314 | content_list = [] 315 | if name == 'raw': 316 | content_list = self.raw_texts 317 | elif name == 'test': 318 | content_list = self.test_texts 319 | elif name == 'dev': 320 | content_list = self.dev_texts 321 | elif name == 'train': 322 | content_list = self.train_texts 323 | else: 324 | print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !") 325 | assert(sent_num == len(content_list)) 326 | fout = open(self.decode_dir,'w') 327 | for idx in range(sent_num): 328 | if self.sentence_classification: 329 | fout.write(" ".join(content_list[idx][0])+"\t"+predict_results[idx]+ '\n') 330 | else: 331 | sent_length = len(predict_results[idx]) 332 | for idy in range(sent_length): 333 | ## content_list[idx] is a list with [word, char, label] 334 | fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') 335 | fout.write('\n') 336 | fout.close() 337 | print("Predict %s result has been written into file. %s"%(name, self.decode_dir)) 338 | 339 | 340 | def load(self,data_file): 341 | f = open(data_file, 'rb') 342 | tmp_dict = pickle.load(f) 343 | f.close() 344 | self.__dict__.update(tmp_dict) 345 | 346 | def save(self,save_file): 347 | f = open(save_file, 'wb') 348 | pickle.dump(self.__dict__, f, 2) 349 | f.close() 350 | 351 | 352 | 353 | def write_nbest_decoded_results(self, predict_results, pred_scores, name): 354 | ## predict_results : [whole_sent_num, nbest, each_sent_length] 355 | ## pred_scores: [whole_sent_num, nbest] 356 | fout = open(self.decode_dir,'w') 357 | sent_num = len(predict_results) 358 | content_list = [] 359 | if name == 'raw': 360 | content_list = self.raw_texts 361 | elif name == 'test': 362 | content_list = self.test_texts 363 | elif name == 'dev': 364 | content_list = self.dev_texts 365 | elif name == 'train': 366 | content_list = self.train_texts 367 | else: 368 | print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !") 369 | assert(sent_num == len(content_list)) 370 | assert(sent_num == len(pred_scores)) 371 | for idx in range(sent_num): 372 | sent_length = len(predict_results[idx][0]) 373 | nbest = len(predict_results[idx]) 374 | score_string = "# " 375 | for idz in range(nbest): 376 | score_string += format(pred_scores[idx][idz], '.4f')+" " 377 | fout.write(score_string.strip() + "\n") 378 | 379 | for idy in range(sent_length): 380 | try: # Will fail with python3 381 | label_string = content_list[idx][0][idy].encode('utf-8') + " " 382 | except: 383 | label_string = content_list[idx][0][idy] + " " 384 | for idz in range(nbest): 385 | label_string += predict_results[idx][idz][idy]+" " 386 | label_string = label_string.strip() + "\n" 387 | fout.write(label_string) 388 | fout.write('\n') 389 | fout.close() 390 | print("Predict %s %s-best result has been written into file. 
%s"%(name,nbest, self.decode_dir)) 391 | 392 | 393 | def read_config(self,config_file): 394 | config = config_file_to_dict(config_file) 395 | ## read data: 396 | the_item = 'train_dir' 397 | if the_item in config: 398 | self.train_dir = config[the_item] 399 | the_item = 'dev_dir' 400 | if the_item in config: 401 | self.dev_dir = config[the_item] 402 | the_item = 'test_dir' 403 | if the_item in config: 404 | self.test_dir = config[the_item] 405 | the_item = 'raw_dir' 406 | if the_item in config: 407 | self.raw_dir = config[the_item] 408 | the_item = 'decode_dir' 409 | if the_item in config: 410 | self.decode_dir = config[the_item] 411 | the_item = 'dset_dir' 412 | if the_item in config: 413 | self.dset_dir = config[the_item] 414 | the_item = 'model_dir' 415 | if the_item in config: 416 | self.model_dir = config[the_item] 417 | the_item = 'load_model_dir' 418 | if the_item in config: 419 | self.load_model_dir = config[the_item] 420 | 421 | the_item = 'word_emb_dir' 422 | if the_item in config: 423 | self.word_emb_dir = config[the_item] 424 | the_item = 'char_emb_dir' 425 | if the_item in config: 426 | self.char_emb_dir = config[the_item] 427 | 428 | 429 | the_item = 'MAX_SENTENCE_LENGTH' 430 | if the_item in config: 431 | self.MAX_SENTENCE_LENGTH = int(config[the_item]) 432 | the_item = 'MAX_WORD_LENGTH' 433 | if the_item in config: 434 | self.MAX_WORD_LENGTH = int(config[the_item]) 435 | 436 | the_item = 'norm_word_emb' 437 | if the_item in config: 438 | self.norm_word_emb = str2bool(config[the_item]) 439 | the_item = 'norm_char_emb' 440 | if the_item in config: 441 | self.norm_char_emb = str2bool(config[the_item]) 442 | the_item = 'number_normalized' 443 | if the_item in config: 444 | self.number_normalized = str2bool(config[the_item]) 445 | 446 | the_item = 'sentence_classification' 447 | if the_item in config: 448 | self.sentence_classification = str2bool(config[the_item]) 449 | the_item = 'seg' 450 | if the_item in config: 451 | self.seg = str2bool(config[the_item]) 452 | the_item = 'word_emb_dim' 453 | if the_item in config: 454 | self.word_emb_dim = int(config[the_item]) 455 | the_item = 'char_emb_dim' 456 | if the_item in config: 457 | self.char_emb_dim = int(config[the_item]) 458 | 459 | ## read network: 460 | the_item = 'use_crf' 461 | if the_item in config: 462 | self.use_crf = str2bool(config[the_item]) 463 | the_item = 'use_char' 464 | if the_item in config: 465 | self.use_char = str2bool(config[the_item]) 466 | the_item = 'word_seq_feature' 467 | if the_item in config: 468 | self.word_feature_extractor = config[the_item] 469 | the_item = 'char_seq_feature' 470 | if the_item in config: 471 | self.char_feature_extractor = config[the_item] 472 | the_item = 'nbest' 473 | if the_item in config: 474 | self.nbest = int(config[the_item]) 475 | 476 | the_item = 'feature' 477 | if the_item in config: 478 | self.feat_config = config[the_item] ## feat_config is a dict 479 | 480 | 481 | ## read training setting: 482 | the_item = 'optimizer' 483 | if the_item in config: 484 | self.optimizer = config[the_item] 485 | the_item = 'ave_batch_loss' 486 | if the_item in config: 487 | self.average_batch_loss = str2bool(config[the_item]) 488 | the_item = 'status' 489 | if the_item in config: 490 | self.status = config[the_item] 491 | 492 | ## read Hyperparameters: 493 | the_item = 'cnn_layer' 494 | if the_item in config: 495 | self.HP_cnn_layer = int(config[the_item]) 496 | the_item = 'iteration' 497 | if the_item in config: 498 | self.HP_iteration = int(config[the_item]) 499 | the_item = 
'batch_size' 500 | if the_item in config: 501 | self.HP_batch_size = int(config[the_item]) 502 | 503 | the_item = 'char_hidden_dim' 504 | if the_item in config: 505 | self.HP_char_hidden_dim = int(config[the_item]) 506 | the_item = 'hidden_dim' 507 | if the_item in config: 508 | self.HP_hidden_dim = int(config[the_item]) 509 | the_item = 'dropout' 510 | if the_item in config: 511 | self.HP_dropout = float(config[the_item]) 512 | the_item = 'lstm_layer' 513 | if the_item in config: 514 | self.HP_lstm_layer = int(config[the_item]) 515 | the_item = 'bilstm' 516 | if the_item in config: 517 | self.HP_bilstm = str2bool(config[the_item]) 518 | 519 | the_item = 'gpu' 520 | if the_item in config: 521 | self.HP_gpu = str2bool(config[the_item]) 522 | the_item = 'learning_rate' 523 | if the_item in config: 524 | self.HP_lr = float(config[the_item]) 525 | the_item = 'lr_decay' 526 | if the_item in config: 527 | self.HP_lr_decay = float(config[the_item]) 528 | the_item = 'clip' 529 | if the_item in config: 530 | self.HP_clip = float(config[the_item]) 531 | the_item = 'momentum' 532 | if the_item in config: 533 | self.HP_momentum = float(config[the_item]) 534 | the_item = 'l2' 535 | if the_item in config: 536 | self.HP_l2 = float(config[the_item]) 537 | ## no seg for sentence classification 538 | if self.sentence_classification: 539 | self.seg = False 540 | self.use_crf = False 541 | 542 | 543 | def config_file_to_dict(input_file): 544 | config = {} 545 | fins = open(input_file,'r').readlines() 546 | for line in fins: 547 | if len(line) > 0 and line[0] == "#": 548 | continue 549 | if "=" in line: 550 | pair = line.strip().split('#',1)[0].split('=',1) 551 | item = pair[0] 552 | if item=="feature": 553 | if item not in config: 554 | feat_dict = {} 555 | config[item]= feat_dict 556 | feat_dict = config[item] 557 | new_pair = pair[-1].split() 558 | feat_name = new_pair[0] 559 | one_dict = {} 560 | one_dict["emb_dir"] = None 561 | one_dict["emb_size"] = 10 562 | one_dict["emb_norm"] = False 563 | if len(new_pair) > 1: 564 | for idx in range(1,len(new_pair)): 565 | conf_pair = new_pair[idx].split('=') 566 | if conf_pair[0] == "emb_dir": 567 | one_dict["emb_dir"]=conf_pair[-1] 568 | elif conf_pair[0] == "emb_size": 569 | one_dict["emb_size"]=int(conf_pair[-1]) 570 | elif conf_pair[0] == "emb_norm": 571 | one_dict["emb_norm"]=str2bool(conf_pair[-1]) 572 | feat_dict[feat_name] = one_dict 573 | # print "feat",feat_dict 574 | else: 575 | if item in config: 576 | print("Warning: duplicated config item found: %s, updated."%(pair[0])) 577 | config[item] = pair[-1] 578 | 579 | 580 | return config 581 | 582 | 583 | def str2bool(string): 584 | if string == "True" or string == "true" or string == "TRUE": 585 | return True 586 | else: 587 | return False 588 | -------------------------------------------------------------------------------- /utils/functions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-06-15 14:23:06 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-14 12:23:52 6 | from __future__ import print_function 7 | from __future__ import absolute_import 8 | import sys 9 | import numpy as np 10 | 11 | def normalize_word(word): 12 | new_word = "" 13 | for char in word: 14 | if char.isdigit(): 15 | new_word += '0' 16 | else: 17 | new_word += char 18 | return new_word 19 | 20 | 21 | def read_instance(input_file, word_alphabet, char_alphabet, feature_alphabets, 
label_alphabet, number_normalized, max_sent_length, sentence_classification=False, split_token='\t', char_padding_size=-1, char_padding_symbol = ''): 22 | feature_num = len(feature_alphabets) 23 | in_lines = open(input_file,'r', encoding="utf8").readlines() 24 | instence_texts = [] 25 | instence_Ids = [] 26 | words = [] 27 | features = [] 28 | chars = [] 29 | labels = [] 30 | word_Ids = [] 31 | feature_Ids = [] 32 | char_Ids = [] 33 | label_Ids = [] 34 | 35 | ## if sentence classification data format, splited by \t 36 | if sentence_classification: 37 | for line in in_lines: 38 | if len(line) > 2: 39 | pairs = line.strip().split(split_token) 40 | sent = pairs[0] 41 | if sys.version_info[0] < 3: 42 | sent = sent.decode('utf-8') 43 | original_words = sent.split() 44 | for word in original_words: 45 | words.append(word) 46 | if number_normalized: 47 | word = normalize_word(word) 48 | word_Ids.append(word_alphabet.get_index(word)) 49 | ## get char 50 | char_list = [] 51 | char_Id = [] 52 | for char in word: 53 | char_list.append(char) 54 | if char_padding_size > 0: 55 | char_number = len(char_list) 56 | if char_number < char_padding_size: 57 | char_list = char_list + [char_padding_symbol]*(char_padding_size-char_number) 58 | assert(len(char_list) == char_padding_size) 59 | for char in char_list: 60 | char_Id.append(char_alphabet.get_index(char)) 61 | chars.append(char_list) 62 | char_Ids.append(char_Id) 63 | 64 | label = pairs[-1] 65 | label_Id = label_alphabet.get_index(label) 66 | ## get features 67 | feat_list = [] 68 | feat_Id = [] 69 | for idx in range(feature_num): 70 | feat_idx = pairs[idx+1].split(']',1)[-1] 71 | feat_list.append(feat_idx) 72 | feat_Id.append(feature_alphabets[idx].get_index(feat_idx)) 73 | ## combine together and return, notice the feature/label as different format with sequence labeling task 74 | if (len(words) > 0) and ((max_sent_length < 0) or (len(words) < max_sent_length)): 75 | instence_texts.append([words, feat_list, chars, label]) 76 | instence_Ids.append([word_Ids, feat_Id, char_Ids,label_Id]) 77 | words = [] 78 | features = [] 79 | chars = [] 80 | char_Ids = [] 81 | word_Ids = [] 82 | feature_Ids = [] 83 | label_Ids = [] 84 | if (len(words) > 0) and ((max_sent_length < 0) or (len(words) < max_sent_length)) : 85 | instence_texts.append([words, feat_list, chars, label]) 86 | instence_Ids.append([word_Ids, feat_Id, char_Ids,label_Id]) 87 | words = [] 88 | features = [] 89 | chars = [] 90 | char_Ids = [] 91 | word_Ids = [] 92 | feature_Ids = [] 93 | label_Ids = [] 94 | 95 | else: 96 | ### for sequence labeling data format i.e. 
CoNLL 2003 97 | for line in in_lines: 98 | if len(line) > 2: 99 | pairs = line.strip().split() 100 | word = pairs[0] 101 | if sys.version_info[0] < 3: 102 | word = word.decode('utf-8') 103 | words.append(word) 104 | if number_normalized: 105 | word = normalize_word(word) 106 | label = pairs[-1] 107 | labels.append(label) 108 | word_Ids.append(word_alphabet.get_index(word)) 109 | label_Ids.append(label_alphabet.get_index(label)) 110 | ## get features 111 | feat_list = [] 112 | feat_Id = [] 113 | for idx in range(feature_num): 114 | feat_idx = pairs[idx+1].split(']',1)[-1] 115 | feat_list.append(feat_idx) 116 | feat_Id.append(feature_alphabets[idx].get_index(feat_idx)) 117 | features.append(feat_list) 118 | feature_Ids.append(feat_Id) 119 | ## get char 120 | char_list = [] 121 | char_Id = [] 122 | for char in word: 123 | char_list.append(char) 124 | if char_padding_size > 0: 125 | char_number = len(char_list) 126 | if char_number < char_padding_size: 127 | char_list = char_list + [char_padding_symbol]*(char_padding_size-char_number) 128 | assert(len(char_list) == char_padding_size) 129 | else: 130 | ### not padding 131 | pass 132 | for char in char_list: 133 | char_Id.append(char_alphabet.get_index(char)) 134 | chars.append(char_list) 135 | char_Ids.append(char_Id) 136 | else: 137 | if (len(words) > 0) and ((max_sent_length < 0) or (len(words) < max_sent_length)) : 138 | instence_texts.append([words, features, chars, labels]) 139 | instence_Ids.append([word_Ids, feature_Ids, char_Ids,label_Ids]) 140 | words = [] 141 | features = [] 142 | chars = [] 143 | labels = [] 144 | word_Ids = [] 145 | feature_Ids = [] 146 | char_Ids = [] 147 | label_Ids = [] 148 | if (len(words) > 0) and ((max_sent_length < 0) or (len(words) < max_sent_length)) : 149 | instence_texts.append([words, features, chars, labels]) 150 | instence_Ids.append([word_Ids, feature_Ids, char_Ids,label_Ids]) 151 | words = [] 152 | features = [] 153 | chars = [] 154 | labels = [] 155 | word_Ids = [] 156 | feature_Ids = [] 157 | char_Ids = [] 158 | label_Ids = [] 159 | return instence_texts, instence_Ids 160 | 161 | 162 | def build_pretrain_embedding(embedding_path, word_alphabet, embedd_dim=100, norm=True): 163 | embedd_dict = dict() 164 | if embedding_path != None: 165 | embedd_dict, embedd_dim = load_pretrain_emb(embedding_path) 166 | alphabet_size = word_alphabet.size() 167 | scale = np.sqrt(3.0 / embedd_dim) 168 | pretrain_emb = np.empty([word_alphabet.size(), embedd_dim]) 169 | perfect_match = 0 170 | case_match = 0 171 | not_match = 0 172 | for word, index in word_alphabet.iteritems(): 173 | if word in embedd_dict: 174 | if norm: 175 | pretrain_emb[index,:] = norm2one(embedd_dict[word]) 176 | else: 177 | pretrain_emb[index,:] = embedd_dict[word] 178 | perfect_match += 1 179 | elif word.lower() in embedd_dict: 180 | if norm: 181 | pretrain_emb[index,:] = norm2one(embedd_dict[word.lower()]) 182 | else: 183 | pretrain_emb[index,:] = embedd_dict[word.lower()] 184 | case_match += 1 185 | else: 186 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedd_dim]) 187 | not_match += 1 188 | pretrained_size = len(embedd_dict) 189 | print("Embedding:\n pretrain word:%s, prefect match:%s, case_match:%s, oov:%s, oov%%:%s"%(pretrained_size, perfect_match, case_match, not_match, (not_match+0.)/alphabet_size)) 190 | return pretrain_emb, embedd_dim 191 | 192 | def norm2one(vec): 193 | root_sum_square = np.sqrt(np.sum(np.square(vec))) 194 | return vec/root_sum_square 195 | 196 | def load_pretrain_emb(embedding_path): 197 | embedd_dim 
= -1 198 | embedd_dict = dict() 199 | with open(embedding_path, 'r', encoding="utf8") as file: 200 | for line in file: 201 | line = line.strip() 202 | if len(line) == 0: 203 | continue 204 | tokens = line.split() 205 | if embedd_dim < 0: 206 | embedd_dim = len(tokens) - 1 207 | elif embedd_dim + 1 != len(tokens): 208 | ## ignore illegal embedding line 209 | continue 210 | # assert (embedd_dim + 1 == len(tokens)) 211 | embedd = np.empty([1, embedd_dim]) 212 | embedd[:] = tokens[1:] 213 | if sys.version_info[0] < 3: 214 | first_col = tokens[0].decode('utf-8') 215 | else: 216 | first_col = tokens[0] 217 | embedd_dict[first_col] = embedd 218 | return embedd_dict, embedd_dim 219 | 220 | if __name__ == '__main__': 221 | a = np.arange(9.0) 222 | print(a) 223 | print(norm2one(a)) 224 | -------------------------------------------------------------------------------- /utils/metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-02-16 09:53:19 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-17 22:46:59 6 | 7 | # from operator import add 8 | # 9 | from __future__ import print_function 10 | import sys 11 | 12 | 13 | 14 | ## input as sentence level labels 15 | def get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"): 16 | sent_num = len(golden_lists) 17 | golden_full = [] 18 | predict_full = [] 19 | right_full = [] 20 | right_tag = 0 21 | all_tag = 0 22 | for idx in range(0,sent_num): 23 | # word_list = sentence_lists[idx] 24 | golden_list = golden_lists[idx] 25 | predict_list = predict_lists[idx] 26 | for idy in range(len(golden_list)): 27 | if golden_list[idy] == predict_list[idy]: 28 | right_tag += 1 29 | all_tag += len(golden_list) 30 | if label_type == "BMES" or label_type == "BIOES": 31 | gold_matrix = get_ner_BMES(golden_list) 32 | pred_matrix = get_ner_BMES(predict_list) 33 | else: 34 | gold_matrix = get_ner_BIO(golden_list) 35 | pred_matrix = get_ner_BIO(predict_list) 36 | # print "gold", gold_matrix 37 | # print "pred", pred_matrix 38 | right_ner = list(set(gold_matrix).intersection(set(pred_matrix))) 39 | golden_full += gold_matrix 40 | predict_full += pred_matrix 41 | right_full += right_ner 42 | right_num = len(right_full) 43 | golden_num = len(golden_full) 44 | predict_num = len(predict_full) 45 | if predict_num == 0: 46 | precision = -1 47 | else: 48 | precision = (right_num+0.0)/predict_num 49 | if golden_num == 0: 50 | recall = -1 51 | else: 52 | recall = (right_num+0.0)/golden_num 53 | if (precision == -1) or (recall == -1) or (precision+recall) <= 0.: 54 | f_measure = -1 55 | else: 56 | f_measure = 2*precision*recall/(precision+recall) 57 | accuracy = (right_tag+0.0)/all_tag 58 | # print "Accuracy: ", right_tag,"/",all_tag,"=",accuracy 59 | if label_type.upper().startswith("B-"): 60 | print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num) 61 | else: 62 | print("Right token = ", right_tag, " All token = ", all_tag, " acc = ", accuracy) 63 | return accuracy, precision, recall, f_measure 64 | 65 | 66 | def reverse_style(input_string): 67 | target_position = input_string.index('[') 68 | input_len = len(input_string) 69 | output_string = input_string[target_position:input_len] + input_string[0:target_position] 70 | return output_string 71 | 72 | 73 | def get_ner_BMES(label_list): 74 | # list_len = len(word_list) 75 | # assert(list_len == len(label_list)), "word list size unmatch with label list" 
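    # Each recovered entity is encoded as the string "[start,end]TYPE" (reverse_style puts the bracketed token indices first),
    # e.g. "[3,5]PER" for a PER span covering tokens 3-5 and "[7]LOC" for a single-token entity.
    # get_ner_fmeasure above counts correct entities by intersecting the gold and predicted string lists as sets,
    # so two encodings match only when both the span boundaries and the entity type agree exactly.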
76 | list_len = len(label_list) 77 | begin_label = 'B-' 78 | end_label = 'E-' 79 | single_label = 'S-' 80 | whole_tag = '' 81 | index_tag = '' 82 | tag_list = [] 83 | stand_matrix = [] 84 | for i in range(0, list_len): 85 | # wordlabel = word_list[i] 86 | current_label = label_list[i].upper() 87 | if begin_label in current_label: 88 | if index_tag != '': 89 | tag_list.append(whole_tag + ',' + str(i-1)) 90 | whole_tag = current_label.replace(begin_label,"",1) +'[' +str(i) 91 | index_tag = current_label.replace(begin_label,"",1) 92 | 93 | elif single_label in current_label: 94 | if index_tag != '': 95 | tag_list.append(whole_tag + ',' + str(i-1)) 96 | whole_tag = current_label.replace(single_label,"",1) +'[' +str(i) 97 | tag_list.append(whole_tag) 98 | whole_tag = "" 99 | index_tag = "" 100 | elif end_label in current_label: 101 | if index_tag != '': 102 | tag_list.append(whole_tag +',' + str(i)) 103 | whole_tag = '' 104 | index_tag = '' 105 | else: 106 | continue 107 | if (whole_tag != '')&(index_tag != ''): 108 | tag_list.append(whole_tag) 109 | tag_list_len = len(tag_list) 110 | 111 | for i in range(0, tag_list_len): 112 | if len(tag_list[i]) > 0: 113 | tag_list[i] = tag_list[i]+ ']' 114 | insert_list = reverse_style(tag_list[i]) 115 | stand_matrix.append(insert_list) 116 | # print stand_matrix 117 | return stand_matrix 118 | 119 | 120 | def get_ner_BIO(label_list): 121 | # list_len = len(word_list) 122 | # assert(list_len == len(label_list)), "word list size unmatch with label list" 123 | list_len = len(label_list) 124 | begin_label = 'B-' 125 | inside_label = 'I-' 126 | whole_tag = '' 127 | index_tag = '' 128 | tag_list = [] 129 | stand_matrix = [] 130 | for i in range(0, list_len): 131 | # wordlabel = word_list[i] 132 | current_label = label_list[i].upper() 133 | if begin_label in current_label: 134 | if index_tag == '': 135 | whole_tag = current_label.replace(begin_label,"",1) +'[' +str(i) 136 | index_tag = current_label.replace(begin_label,"",1) 137 | else: 138 | tag_list.append(whole_tag + ',' + str(i-1)) 139 | whole_tag = current_label.replace(begin_label,"",1) + '[' + str(i) 140 | index_tag = current_label.replace(begin_label,"",1) 141 | 142 | elif inside_label in current_label: 143 | if current_label.replace(inside_label,"",1) == index_tag: 144 | whole_tag = whole_tag 145 | else: 146 | if (whole_tag != '')&(index_tag != ''): 147 | tag_list.append(whole_tag +',' + str(i-1)) 148 | whole_tag = '' 149 | index_tag = '' 150 | else: 151 | if (whole_tag != '')&(index_tag != ''): 152 | tag_list.append(whole_tag +',' + str(i-1)) 153 | whole_tag = '' 154 | index_tag = '' 155 | 156 | if (whole_tag != '')&(index_tag != ''): 157 | tag_list.append(whole_tag) 158 | tag_list_len = len(tag_list) 159 | 160 | for i in range(0, tag_list_len): 161 | if len(tag_list[i]) > 0: 162 | tag_list[i] = tag_list[i]+ ']' 163 | insert_list = reverse_style(tag_list[i]) 164 | stand_matrix.append(insert_list) 165 | return stand_matrix 166 | 167 | 168 | 169 | def readSentence(input_file): 170 | in_lines = open(input_file,'r').readlines() 171 | sentences = [] 172 | labels = [] 173 | sentence = [] 174 | label = [] 175 | for line in in_lines: 176 | if len(line) < 2: 177 | sentences.append(sentence) 178 | labels.append(label) 179 | sentence = [] 180 | label = [] 181 | else: 182 | pair = line.strip('\n').split(' ') 183 | sentence.append(pair[0]) 184 | label.append(pair[-1]) 185 | return sentences,labels 186 | 187 | 188 | def readTwoLabelSentence(input_file, pred_col=-1): 189 | in_lines = open(input_file,'r').readlines() 
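    # Expected file layout: one token per line with space-separated columns, the word in column 0,
    # the gold label in column 1 and the predicted label in column pred_col (default -1, i.e. the last column);
    # blank lines mark sentence boundaries and lines containing "##score##" are skipped.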
190 | sentences = [] 191 | predict_labels = [] 192 | golden_labels = [] 193 | sentence = [] 194 | predict_label = [] 195 | golden_label = [] 196 | for line in in_lines: 197 | if "##score##" in line: 198 | continue 199 | if len(line) < 2: 200 | sentences.append(sentence) 201 | golden_labels.append(golden_label) 202 | predict_labels.append(predict_label) 203 | sentence = [] 204 | golden_label = [] 205 | predict_label = [] 206 | else: 207 | pair = line.strip('\n').split(' ') 208 | sentence.append(pair[0]) 209 | golden_label.append(pair[1]) 210 | predict_label.append(pair[pred_col]) 211 | 212 | return sentences,golden_labels,predict_labels 213 | 214 | 215 | def fmeasure_from_file(golden_file, predict_file, label_type="BMES"): 216 | print("Get f measure from file:", golden_file, predict_file) 217 | print("Label format:",label_type) 218 | golden_sent,golden_labels = readSentence(golden_file) 219 | predict_sent,predict_labels = readSentence(predict_file) 220 | acc,P,R,F = get_ner_fmeasure(golden_labels, predict_labels, label_type) 221 | print("P:%s, R:%s, F:%s"%(P,R,F)) 222 | 223 | 224 | 225 | def fmeasure_from_singlefile(twolabel_file, label_type="BMES", pred_col=-1): 226 | sent,golden_labels,predict_labels = readTwoLabelSentence(twolabel_file, pred_col) 227 | acc,P,R,F = get_ner_fmeasure(golden_labels, predict_labels, label_type) 228 | print("P:%s, R:%s, F:%s"%(P,R,F)) 229 | 230 | 231 | 232 | if __name__ == '__main__': 233 | # print "sys:",len(sys.argv) 234 | if len(sys.argv) == 3: 235 | fmeasure_from_singlefile(sys.argv[1],"BMES",int(sys.argv[2])) 236 | else: 237 | fmeasure_from_singlefile(sys.argv[1],"BMES") 238 | 239 | -------------------------------------------------------------------------------- /utils/tagSchemeConverter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-11-27 16:53:36 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-01-09 21:39:10 6 | 7 | 8 | """ 9 | convert NER/Chunking tag schemes, i.e.
BIO->BIOES, BIOES->BIO, IOB->BIO, IOB->BIOES 10 | """ 11 | from __future__ import print_function 12 | 13 | import sys 14 | 15 | 16 | def BIO2BIOES(input_file, output_file): 17 | print("Convert BIO -> BIOES for file:", input_file) 18 | with open(input_file,'r') as in_file: 19 | fins = in_file.readlines() 20 | fout = open(output_file,'w') 21 | words = [] 22 | labels = [] 23 | for line in fins: 24 | if len(line) < 3: 25 | sent_len = len(words) 26 | for idx in range(sent_len): 27 | if "-" not in labels[idx]: 28 | fout.write(words[idx]+" "+labels[idx]+"\n") 29 | else: 30 | label_type = labels[idx].split('-')[-1] 31 | if "B-" in labels[idx]: 32 | if (idx == sent_len - 1) or ("I-" not in labels[idx+1]): 33 | fout.write(words[idx]+" S-"+label_type+"\n") 34 | else: 35 | fout.write(words[idx]+" B-"+label_type+"\n") 36 | elif "I-" in labels[idx]: 37 | if (idx == sent_len - 1) or ("I-" not in labels[idx+1]): 38 | fout.write(words[idx]+" E-"+label_type+"\n") 39 | else: 40 | fout.write(words[idx]+" I-"+label_type+"\n") 41 | fout.write('\n') 42 | words = [] 43 | labels = [] 44 | else: 45 | pair = line.strip('\n').split() 46 | words.append(pair[0]) 47 | labels.append(pair[-1].upper()) 48 | fout.close() 49 | print("BIOES file generated:", output_file) 50 | 51 | 52 | 53 | def BIOES2BIO(input_file, output_file): 54 | print("Convert BIOES -> BIO for file:", input_file) 55 | with open(input_file,'r') as in_file: 56 | fins = in_file.readlines() 57 | fout = open(output_file,'w') 58 | words = [] 59 | labels = [] 60 | for line in fins: 61 | if len(line) < 3: 62 | sent_len = len(words) 63 | for idx in range(sent_len): 64 | if "-" not in labels[idx]: 65 | fout.write(words[idx]+" "+labels[idx]+"\n") 66 | else: 67 | label_type = labels[idx].split('-')[-1] 68 | if "E-" in labels[idx]: 69 | fout.write(words[idx]+" I-"+label_type+"\n") 70 | elif "S-" in labels[idx]: 71 | fout.write(words[idx]+" B-"+label_type+"\n") 72 | else: 73 | fout.write(words[idx]+" "+labels[idx]+"\n") 74 | fout.write('\n') 75 | words = [] 76 | labels = [] 77 | else: 78 | pair = line.strip('\n').split() 79 | words.append(pair[0]) 80 | labels.append(pair[-1].upper()) 81 | fout.close() 82 | print("BIO file generated:", output_file) 83 | 84 | 85 | def IOB2BIO(input_file, output_file): 86 | print("Convert IOB -> BIO for file:", input_file) 87 | with open(input_file,'r') as in_file: 88 | fins = in_file.readlines() 89 | fout = open(output_file,'w') 90 | words = [] 91 | labels = [] 92 | for line in fins: 93 | if len(line) < 3: 94 | sent_len = len(words) 95 | for idx in range(sent_len): 96 | if "I-" in labels[idx]: 97 | label_type = labels[idx].split('-')[-1] 98 | if (idx == 0) or (labels[idx-1] == "O") or (label_type != labels[idx-1].split('-')[-1]): 99 | fout.write(words[idx]+" B-"+label_type+"\n") 100 | else: 101 | fout.write(words[idx]+" "+labels[idx]+"\n") 102 | else: 103 | fout.write(words[idx]+" "+labels[idx]+"\n") 104 | fout.write('\n') 105 | words = [] 106 | labels = [] 107 | else: 108 | pair = line.strip('\n').split() 109 | words.append(pair[0]) 110 | labels.append(pair[-1].upper()) 111 | fout.close() 112 | print("BIO file generated:", output_file) 113 | 114 | 115 | def choose_label(input_file, output_file): 116 | with open(input_file,'r') as in_file: 117 | fins = in_file.readlines() 118 | with open(output_file,'w') as fout: 119 | for line in fins: 120 | if len(line) < 3: 121 | fout.write(line) 122 | else: 123 | pairs = line.strip('\n').split(' ') 124 | fout.write(pairs[0]+" "+ pairs[-1]+"\n") 125 | 126 | 127 | if __name__ == '__main__': 128 | 
'''Convert NER tag schemes among IOB/BIO/BIOES. 129 | For example, to convert the IOB tag scheme to BIO, run: 130 | python tagSchemeConverter.py IOB2BIO input_iob_file output_bio_file 131 | Input data format is the standard CoNLL 2003 data format. 132 | ''' 133 | if sys.argv[1].upper() == "IOB2BIO": 134 | IOB2BIO(sys.argv[2],sys.argv[3]) 135 | elif sys.argv[1].upper() == "BIO2BIOES": 136 | BIO2BIOES(sys.argv[2],sys.argv[3]) 137 | elif sys.argv[1].upper() == "BIOES2BIO": 138 | BIOES2BIO(sys.argv[2],sys.argv[3]) 139 | elif sys.argv[1].upper() == "IOB2BIOES": 140 | IOB2BIO(sys.argv[2],"temp") 141 | BIO2BIOES("temp",sys.argv[3]) 142 | else: 143 | print("Argument error: sys.argv[1] should be one of \"IOB2BIO/BIO2BIOES/BIOES2BIO/IOB2BIOES\"") 144 | --------------------------------------------------------------------------------
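A minimal usage sketch of the two utility modules above; the file paths are placeholders for this illustration, not files shipped with the repository:

    # hypothetical driver script: convert tag schemes, then score a prediction file against gold labels
    from utils.tagSchemeConverter import IOB2BIO, BIO2BIOES
    from utils.metric import readSentence, get_ner_fmeasure

    # normalize the tag scheme in two steps: IOB -> BIO -> BIOES
    IOB2BIO("train.iob", "train.bio")        # relabel I- tags that open an entity as B-
    BIO2BIOES("train.bio", "train.bioes")    # add E-/S- tags for entity ends and singletons

    # score predictions: both files use CoNLL columns with the label in the last column
    gold_sents, gold_labels = readSentence("test.gold.bmes")
    pred_sents, pred_labels = readSentence("test.pred.bmes")
    acc, p, r, f = get_ner_fmeasure(gold_labels, pred_labels, label_type="BMES")
    print("acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (acc, p, r, f))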