├── .gitignore ├── LICENCE ├── README.md ├── demo.clf.config ├── demo.decode.config ├── demo.train.config ├── main.py ├── main_parse.py ├── model ├── __init__.py ├── charbigru.py ├── charbilstm.py ├── charcnn.py ├── crf.py ├── sentclassifier.py ├── seqlabel.py ├── wordrep.py └── wordsequence.py ├── readme ├── Configuration.md ├── Extension.md ├── architecture.png ├── hyperparameter_tuning.md ├── logo.png ├── nbest.png └── speed.png ├── sample_data ├── dev.bmes ├── dev.cappos.bmes ├── raw.bmes ├── sample.word.emb ├── test.bmes ├── test.cappos.bmes ├── train.bmes └── train.cappos.bmes └── utils ├── __init__.py ├── alphabet.py ├── data.py ├── functions.py ├── metric.py └── tagSchemeConverter.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | __pycache__ 3 | *.dset 4 | *.model 5 | *.txt 6 | demo.clf.* 7 | sent.* 8 | *.out 9 | *.log 10 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![NCRF++ Logo](https://github.com/jiesutd/NCRFpp/blob/master/readme/logo.png) 2 | 3 | 4 | # NCRF++: An Open-source Neural Sequence Labeling Toolkit 5 | 6 | 7 | * [1. Introduction](#Introduction) 8 | * [2. Requirement](#Requirement) 9 | * [3. Advantages](#Advantages) 10 | * [4. Usage](#Usage) 11 | * [5. Data Format](#Data-Format) 12 | * [6. Performance](#Performance) 13 | * [7. 
Add Handcrafted Features](#Add-Handcrafted-Features) 14 | * [8. Speed](#Speed) 15 | * [9. N best Decoding](#N-best-Decoding) 16 | * [10. Reproduce Paper Results and Hyperparameter Tuning](#Reproduce-Paper-Results-and-Hyperparameter-Tuning) 17 | * [11. Report Issue or Problem](#Report-Issue-or-Problem) 18 | * [12. Cite](#Cite) 19 | * [13. Future Plan](#Future-Plan) 20 | * [14. Update](#Update) 21 | 22 | ## Introduction 23 | 24 | Sequence labeling models are widely used in many NLP tasks, such as Named Entity Recognition (NER), part-of-speech (POS) tagging and word segmentation. State-of-the-art sequence labeling models mostly utilize a CRF structure over input word features. An LSTM (or bidirectional LSTM) is a popular deep-learning-based feature extractor for sequence labeling, and a CNN can also be used because it is faster to compute. In addition, features within a word are useful for representing it; they can be captured by a character LSTM, a character CNN, or handcrafted neural features. 25 | 26 | NCRF++ is a PyTorch-based framework with flexible choices of input features and output structures. The design of neural sequence labeling models with NCRF++ is fully configurable through a configuration file and does not require any coding. NCRF++ can be regarded as a neural network version of [CRF++](http://taku910.github.io/crfpp/), the well-known statistical CRF framework. 27 | 28 | This framework was accepted by [ACL 2018](https://arxiv.org/abs/1806.05626) as a demonstration paper, and the detailed experiment report and analysis using NCRF++ was accepted at [COLING 2018](https://arxiv.org/abs/1806.04470) as a best paper. 29 | 30 | NCRF++ supports different structure combinations on three levels: character sequence representation, word sequence representation and inference layer. 31 | 32 | * Character sequence representation: character LSTM, character GRU, character CNN and handcrafted word features. 33 | * Word sequence representation: word LSTM, word GRU, word CNN. 34 | * Inference layer: Softmax, CRF. 35 | 36 | Welcome to star this repository! 37 | 38 | ## Requirement 39 | 40 | Python: 2 or 3 41 | PyTorch: 1.0 42 | 43 | [The PyTorch 0.3 compatible version is here.](https://github.com/jiesutd/NCRFpp/tree/PyTorch0.3) 44 | 45 | 46 | ## Advantages 47 | 48 | * Fully configurable: all the neural model structures can be set with a configuration file. 49 | * State-of-the-art system performance: models built with NCRF++ give results comparable to or better than state-of-the-art models. 50 | * Flexible with features: users can define their own features and pretrained feature embeddings. 51 | * Fast running speed: NCRF++ uses fully batched operations, making the system efficient with the help of a GPU (>1000 sents/s for training and >2000 sents/s for decoding). 52 | * N best output: NCRF++ supports `nbest` decoding (with sequence probabilities). 53 | 54 | 55 | ## Usage 56 | 57 | NCRF++ supports designing the neural network structure through a configuration file. The program can run in two modes: ***training*** and ***decoding***. (Sample configurations and data are included in this repository.) 58 | 59 | In ***training*** mode: 60 | `python main.py --config demo.train.config` 61 | 62 | In ***decoding*** mode: 63 | `python main.py --config demo.decode.config` 64 | 65 | The configuration file controls the network structure, I/O, training settings and hyperparameters.
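As a quick orientation, a minimal training configuration might look like the sketch below. It is distilled from the `demo.train.config` file shipped in this repository; the directories are placeholders that you should point at your own data and embeddings.

```
### I/O ###
train_dir=sample_data/train.bmes
dev_dir=sample_data/dev.bmes
test_dir=sample_data/test.bmes
model_dir=sample_data/lstmcrf
word_emb_dir=sample_data/sample.word.emb

### NetworkConfiguration ###
use_crf=True
use_char=True
word_seq_feature=LSTM
char_seq_feature=CNN

### TrainingSetting ###
status=train
optimizer=SGD
iteration=1
batch_size=10

### Hyperparameters ###
hidden_dim=200
learning_rate=0.015
```

Passing such a file to `python main.py --config` trains the Char CNN + Word LSTM + CRF model on the bundled sample data.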
66 | 67 | ***Detailed configuration options and explanations are listed [here](readme/Configuration.md).*** 68 | 69 | NCRF++ is designed in three layers (shown below): a character sequence layer, a word sequence layer and an inference layer. By using the configuration file, most state-of-the-art models can be easily replicated ***without coding***. On the other hand, users can extend each layer by designing their own modules (for example, they may want to design neural structures other than CNN/LSTM/GRU). Our layer-wise design makes module extension convenient; instructions for extending modules can be found [here](readme/Extension.md). 70 | 71 | ![alt text](readme/architecture.png "Layer-wise design") 72 | 73 | 74 | ## Data Format 75 | 76 | * You can refer to the data format in [sample_data](sample_data). 77 | * NCRF++ supports both the BIO and BIOES(BMES) tag schemes. 78 | * Notice that the IOB format (***different*** from BIO) is currently not supported, because this tag scheme is outdated and works worse than the other schemes [Reimers and Gurevych, 2017](https://arxiv.org/pdf/1707.06799.pdf). 79 | * The difference among these three tag schemes is explained in this [paper](https://arxiv.org/pdf/1707.06799.pdf). 80 | * I have written a [script](utils/tagSchemeConverter.py) which converts tag schemes among IOB/BIO/BIOES. Welcome to give it a try. 81 | 82 | 83 | ## Performance 84 | 85 | Results on the CoNLL 2003 English NER task are better than or comparable with state-of-the-art results using the same structures. 86 | 87 | CharLSTM+WordLSTM+CRF: 91.20 vs. 90.94 of [Lample et al., NAACL16](http://www.aclweb.org/anthology/N/N16/N16-1030.pdf); 88 | 89 | CharCNN+WordLSTM+CRF: 91.35 vs. 91.21 of [Ma et al., ACL16](http://www.aclweb.org/anthology/P/P16/P16-1101.pdf). 90 | 91 | By default, `LSTM` denotes a bidirectional LSTM. 92 | 93 | |ID| Model | Nochar | CharLSTM |CharCNN 94 | |---|--------- | --- | --- | ------ 95 | |1| WordLSTM | 88.57 | 90.84 | 90.73 96 | |2| WordLSTM+CRF | 89.45 | **91.20** | **91.35** 97 | |3| WordCNN | 88.56| 90.46 | 90.30 98 | |4| WordCNN+CRF | 88.90 | 90.70 | 90.43 99 | 100 | We have compared twelve neural sequence labeling models (`{charLSTM, charCNN, None} x {wordLSTM, wordCNN} x {softmax, CRF}`) on three benchmarks (POS, Chunking, NER) with statistical experiments; detailed results and comparisons can be found in our COLING 2018 paper [Design Challenges and Misconceptions in Neural Sequence Labeling](https://arxiv.org/abs/1806.04470). 101 | 102 | 103 | ## Add Handcrafted Features 104 | 105 | NCRF++ has integrated several state-of-the-art neural character sequence feature extractors: CNN ([Ma et al., ACL16](http://www.aclweb.org/anthology/P/P16/P16-1101.pdf)), LSTM ([Lample et al., NAACL16](http://www.aclweb.org/anthology/N/N16/N16-1030.pdf)) and GRU ([Yang et al., ICLR17](https://arxiv.org/pdf/1703.06345.pdf)). In addition, handcrafted features have been proven important in sequence labeling tasks. NCRF++ allows users to define their own features, such as capitalization, POS tags or any other features (grey circles in the figure above). Users can configure these self-defined features through the configuration file (feature embedding size, pretrained feature embeddings, etc.). A sample of the input data format is given at [train.cappos.bmes](sample_data/train.cappos.bmes), which includes two human-defined features `[POS]` and `[Cap]`. (`[POS]` and `[Cap]` are just examples; you can give your features any names you want, as long as you follow the format `[xx]` and configure features with the same names in the configuration file.)
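For illustration, a fragment of such feature-augmented data might look like the sketch below: one token per line, the word first, each handcrafted feature written as `[FeatureName]value`, the gold label in the last column, and a blank line between sentences. The words, feature values and tags shown here are made up; see [train.cappos.bmes](sample_data/train.cappos.bmes) for the actual sample file.

```
EU [Cap]1 [POS]NNP S-ORG
rejects [Cap]0 [POS]VBZ O
German [Cap]1 [POS]JJ S-MISC
call [Cap]0 [POS]NN O

Peter [Cap]1 [POS]NNP B-PER
Blackburn [Cap]1 [POS]NNP E-PER
```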
106 | Users can configure each feature in the configuration file with lines such as 107 | 108 | ```Python 109 | feature=[POS] emb_size=20 emb_dir=%your_pretrained_POS_embedding 110 | feature=[Cap] emb_size=20 emb_dir=%your_pretrained_Cap_embedding 111 | ``` 112 | 113 | Features without pretrained embeddings will be randomly initialized. 114 | 115 | 116 | ## Speed 117 | 118 | NCRF++ is implemented with fully batched calculations, making it quite efficient for both model training and decoding. With the help of a GPU (Nvidia GTX 1080) and a large batch size, an LSTM-CRF model built with NCRF++ can reach 1000 sents/s in training and 2000 sents/s in decoding. 119 | 120 | ![alt text](readme/speed.png "System speed on NER data") 121 | 122 | 123 | ## N best Decoding 124 | 125 | A traditional CRF decodes only the single label sequence with the largest probability (i.e. the 1-best output). NCRF++ offers a wider choice: it can decode the `n` label sequences with the top `n` probabilities (i.e. n-best output). N-best decoding has been supported by several popular **statistical** CRF frameworks; however, to the best of our knowledge, NCRF++ is the first toolkit to support n-best decoding in **neural** CRF models. 126 | 127 | In our implementation, with nbest=10, the CharCNN+WordLSTM+CRF model built with NCRF++ gives a 97.47% oracle F1 score (F1 = 91.35% when nbest=1) on the CoNLL 2003 NER task. 128 | 129 | ![alt text](readme/nbest.png "N best decoding oracle result") 130 | 131 | 132 | ## Reproduce Paper Results and Hyperparameter Tuning 133 | 134 | To reproduce the results in our COLING 2018 paper, you only need to change `iteration=1` to `iteration=100` in the configuration file `demo.train.config` and set your file directories in the same file. The default configuration file describes the `Char CNN + Word LSTM + CRF` model; you can build your own model by modifying the configuration accordingly. The parameters in this demo configuration file are the same as in our paper. (Note that the `Word CNN` related models need slightly different parameters; details can be found in our COLING paper.) 135 | 136 | If you want to apply this framework to new tasks or datasets, here are some tuning [tips](readme/hyperparameter_tuning.md) by @Victor0118. 137 | 138 | 139 | ## Report Issue or Problem 140 | 141 | If you want to report an issue or ask a question, please attach the following materials if possible. With this information, I can give fast and accurate feedback.
142 | * `log file` 143 | * `config file` 144 | * `sample data` 145 | 146 | 147 | ## Cite 148 | 149 | If you use NCRF++ in your paper, please cite our [ACL demo paper](https://arxiv.org/abs/1806.05626): 150 | 151 | @inproceedings{yang2018ncrf, 152 | title={NCRF++: An Open-source Neural Sequence Labeling Toolkit}, 153 | author={Yang, Jie and Zhang, Yue}, 154 | booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics}, 155 | url = {http://aclweb.org/anthology/P18-4013}, 156 | year={2018} 157 | } 158 | 159 | 160 | If you use the experimental results and analysis of NCRF++, please cite our [COLING paper](https://arxiv.org/abs/1806.04470): 161 | 162 | @inproceedings{yang2018design, 163 | title={Design Challenges and Misconceptions in Neural Sequence Labeling}, 164 | author={Yang, Jie and Liang, Shuailong and Zhang, Yue}, 165 | booktitle={Proceedings of the 27th International Conference on Computational Linguistics (COLING)}, 166 | url = {http://aclweb.org/anthology/C18-1327}, 167 | year={2018} 168 | } 169 | 170 | ## Future Plan 171 | 172 | * Document classification (in progress) 173 | * Support API usage 174 | * Upload trained models for word segmentation/POS tagging/NER 175 | * Enable loading pretrained ELMo parameters 176 | * Add a BERT feature extraction layer 177 | 178 | 179 | 180 | ## Update 181 | 182 | * 2018-Dec-17, NCRF++ v0.2, support PyTorch 1.0 183 | * 2018-Mar-30, NCRF++ v0.1, initial version 184 | * 2018-Jan-06, add result comparison. 185 | * 2018-Jan-02, support character feature selection. 186 | * 2017-Dec-06, init version 187 | 188 | -------------------------------------------------------------------------------- /demo.clf.config: -------------------------------------------------------------------------------- 1 | ### use # to comment out a configuration item 2 | 3 | sentence_classification=True 4 | 5 | ### I/O ### 6 | train_dir=../data/Sentclf/SST1/stsa.fine.train.clf 7 | dev_dir=../data/Sentclf/SST1/stsa.fine.dev.clf 8 | test_dir=../data/Sentclf/SST1/stsa.fine.test.clf 9 | model_dir=sample_data/clf 10 | word_emb_dir=../data/glove.840B.300d.txt 11 | 12 | 13 | #raw_dir= 14 | #decode_dir= 15 | #dset_dir= 16 | #load_model_dir= 17 | #char_emb_dir= 18 | 19 | norm_word_emb=False 20 | norm_char_emb=False 21 | number_normalized=True 22 | seg=False 23 | word_emb_dim=50 24 | char_emb_dim=30 25 | 26 | ###NetworkConfiguration### 27 | use_crf=False 28 | use_char=False 29 | word_seq_feature=LSTM 30 | char_seq_feature=CNN 31 | #feature=[POS] emb_size=20 32 | #feature=[Cap] emb_size=20 33 | #nbest=1 34 | 35 | ###TrainingSetting### 36 | status=train 37 | optimizer=SGD 38 | iteration=50 39 | batch_size=10 40 | ave_batch_loss=False 41 | 42 | ###Hyperparameters### 43 | cnn_layer=4 44 | char_hidden_dim=50 45 | hidden_dim=400 46 | dropout=0 47 | lstm_layer=1 48 | bilstm=True 49 | learning_rate=0.2 50 | lr_decay=0.05 51 | momentum=0 52 | l2=1e-8 53 | #gpu 54 | #clip= 55 | -------------------------------------------------------------------------------- /demo.decode.config: -------------------------------------------------------------------------------- 1 | ### Decode ### 2 | status=decode 3 | raw_dir=sample_data/raw.bmes 4 | nbest=10 5 | decode_dir=sample_data/raw.out 6 | dset_dir=sample_data/lstmcrf.dset 7 | load_model_dir=sample_data/lstmcrf.0.model -------------------------------------------------------------------------------- /demo.train.config: -------------------------------------------------------------------------------- 1 | ### use # to comment out a configuration 
item 2 | 3 | ### I/O ### 4 | train_dir=sample_data/train.bmes 5 | dev_dir=sample_data/dev.bmes 6 | test_dir=sample_data/test.bmes 7 | model_dir=sample_data/lstmcrf 8 | word_emb_dir=sample_data/sample.word.emb 9 | 10 | #raw_dir= 11 | #decode_dir= 12 | #dset_dir= 13 | #load_model_dir= 14 | #char_emb_dir= 15 | 16 | norm_word_emb=False 17 | norm_char_emb=False 18 | number_normalized=True 19 | seg=True 20 | word_emb_dim=50 21 | char_emb_dim=30 22 | 23 | ###NetworkConfiguration### 24 | use_crf=True 25 | use_char=True 26 | word_seq_feature=LSTM 27 | char_seq_feature=CNN 28 | #feature=[POS] emb_size=20 29 | #feature=[Cap] emb_size=20 30 | #nbest=1 31 | 32 | ###TrainingSetting### 33 | status=train 34 | optimizer=SGD 35 | iteration=1 36 | batch_size=10 37 | ave_batch_loss=False 38 | 39 | ###Hyperparameters### 40 | cnn_layer=4 41 | char_hidden_dim=50 42 | hidden_dim=200 43 | dropout=0.5 44 | lstm_layer=1 45 | bilstm=True 46 | learning_rate=0.015 47 | lr_decay=0.05 48 | momentum=0 49 | l2=1e-8 50 | #gpu 51 | #clip= 52 | -------------------------------------------------------------------------------- /main_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-06-15 14:11:08 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-13 10:58:43 6 | 7 | from __future__ import print_function 8 | import time 9 | import sys 10 | import argparse 11 | import random 12 | import copy 13 | import torch 14 | import gc 15 | import torch.autograd as autograd 16 | import torch.nn as nn 17 | import torch.nn.functional as F 18 | import torch.optim as optim 19 | import numpy as np 20 | from utils.metric import get_ner_fmeasure 21 | from model.seqlabel import SeqLabel 22 | from utils.data import Data 23 | 24 | try: 25 | import cPickle as pickle 26 | except ImportError: 27 | import pickle as pickle 28 | 29 | seed_num = 42 30 | random.seed(seed_num) 31 | torch.manual_seed(seed_num) 32 | np.random.seed(seed_num) 33 | 34 | 35 | def data_initialization(data): 36 | data.initial_feature_alphabets() 37 | data.build_alphabet(data.train_dir) 38 | data.build_alphabet(data.dev_dir) 39 | data.build_alphabet(data.test_dir) 40 | data.fix_alphabet() 41 | 42 | 43 | def predict_check(pred_variable, gold_variable, mask_variable): 44 | """ 45 | input: 46 | pred_variable (batch_size, sent_len): pred tag result, in numpy format 47 | gold_variable (batch_size, sent_len): gold result variable 48 | mask_variable (batch_size, sent_len): mask variable 49 | """ 50 | pred = pred_variable.cpu().data.numpy() 51 | gold = gold_variable.cpu().data.numpy() 52 | mask = mask_variable.cpu().data.numpy() 53 | overlaped = (pred == gold) 54 | right_token = np.sum(overlaped * mask) 55 | total_token = mask.sum() 56 | # print("right: %s, total: %s"%(right_token, total_token)) 57 | return right_token, total_token 58 | 59 | 60 | def recover_label(pred_variable, gold_variable, mask_variable, label_alphabet, word_recover): 61 | """ 62 | input: 63 | pred_variable (batch_size, sent_len): pred tag result 64 | gold_variable (batch_size, sent_len): gold result variable 65 | mask_variable (batch_size, sent_len): mask variable 66 | """ 67 | 68 | pred_variable = pred_variable[word_recover] 69 | gold_variable = gold_variable[word_recover] 70 | mask_variable = mask_variable[word_recover] 71 | batch_size = gold_variable.size(0) 72 | seq_len = gold_variable.size(1) 73 | mask = mask_variable.cpu().data.numpy() 74 | pred_tag = 
pred_variable.cpu().data.numpy() 75 | gold_tag = gold_variable.cpu().data.numpy() 76 | batch_size = mask.shape[0] 77 | pred_label = [] 78 | gold_label = [] 79 | for idx in range(batch_size): 80 | pred = [label_alphabet.get_instance(pred_tag[idx][idy]) for idy in range(seq_len) if mask[idx][idy] != 0] 81 | gold = [label_alphabet.get_instance(gold_tag[idx][idy]) for idy in range(seq_len) if mask[idx][idy] != 0] 82 | # print("p:",pred, pred_tag.tolist()) 83 | # print("g:", gold, gold_tag.tolist()) 84 | assert(len(pred)==len(gold)) 85 | pred_label.append(pred) 86 | gold_label.append(gold) 87 | return pred_label, gold_label 88 | 89 | 90 | def recover_nbest_label(pred_variable, mask_variable, label_alphabet, word_recover): 91 | """ 92 | input: 93 | pred_variable (batch_size, sent_len, nbest): pred tag result 94 | mask_variable (batch_size, sent_len): mask variable 95 | word_recover (batch_size) 96 | output: 97 | nbest_pred_label list: [batch_size, nbest, each_seq_len] 98 | """ 99 | # print("word recover:", word_recover.size()) 100 | # exit(0) 101 | pred_variable = pred_variable[word_recover] 102 | mask_variable = mask_variable[word_recover] 103 | batch_size = pred_variable.size(0) 104 | seq_len = pred_variable.size(1) 105 | print(pred_variable.size()) 106 | nbest = pred_variable.size(2) 107 | mask = mask_variable.cpu().data.numpy() 108 | pred_tag = pred_variable.cpu().data.numpy() 109 | batch_size = mask.shape[0] 110 | pred_label = [] 111 | for idx in range(batch_size): 112 | pred = [] 113 | for idz in range(nbest): 114 | each_pred = [label_alphabet.get_instance(pred_tag[idx][idy][idz]) for idy in range(seq_len) if mask[idx][idy] != 0] 115 | pred.append(each_pred) 116 | pred_label.append(pred) 117 | return pred_label 118 | 119 | 120 | 121 | # def save_data_setting(data, save_file): 122 | # new_data = copy.deepcopy(data) 123 | # ## remove input instances 124 | # new_data.train_texts = [] 125 | # new_data.dev_texts = [] 126 | # new_data.test_texts = [] 127 | # new_data.raw_texts = [] 128 | 129 | # new_data.train_Ids = [] 130 | # new_data.dev_Ids = [] 131 | # new_data.test_Ids = [] 132 | # new_data.raw_Ids = [] 133 | # ## save data settings 134 | # with open(save_file, 'w') as fp: 135 | # pickle.dump(new_data, fp) 136 | # print("Data setting saved to file: ", save_file) 137 | 138 | 139 | # def load_data_setting(save_file): 140 | # with open(save_file, 'r') as fp: 141 | # data = pickle.load(fp) 142 | # print("Data setting loaded from file: ", save_file) 143 | # data.show_data_summary() 144 | # return data 145 | 146 | def lr_decay(optimizer, epoch, decay_rate, init_lr): 147 | lr = init_lr/(1+decay_rate*epoch) 148 | print(" Learning rate is set as:", lr) 149 | for param_group in optimizer.param_groups: 150 | param_group['lr'] = lr 151 | return optimizer 152 | 153 | 154 | 155 | def evaluate(data, model, name, nbest=None): 156 | if name == "train": 157 | instances = data.train_Ids 158 | elif name == "dev": 159 | instances = data.dev_Ids 160 | elif name == 'test': 161 | instances = data.test_Ids 162 | elif name == 'raw': 163 | instances = data.raw_Ids 164 | else: 165 | print("Error: wrong evaluate name,", name) 166 | right_token = 0 167 | whole_token = 0 168 | nbest_pred_results = [] 169 | pred_scores = [] 170 | pred_results = [] 171 | gold_results = [] 172 | ## set model in eval model 173 | model.eval() 174 | batch_size = data.HP_batch_size 175 | start_time = time.time() 176 | train_num = len(instances) 177 | total_batch = train_num//batch_size+1 178 | for batch_id in range(total_batch): 179 | start = 
batch_id*batch_size 180 | end = (batch_id+1)*batch_size 181 | if end > train_num: 182 | end = train_num 183 | instance = instances[start:end] 184 | if not instance: 185 | continue 186 | batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu, True) 187 | if nbest: 188 | scores, nbest_tag_seq = model.decode_nbest(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask, nbest) 189 | nbest_pred_result = recover_nbest_label(nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover) 190 | nbest_pred_results += nbest_pred_result 191 | pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist() 192 | ## select the best sequence to evalurate 193 | tag_seq = nbest_tag_seq[:,:,0] 194 | else: 195 | tag_seq = model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask) 196 | # print("tag:",tag_seq) 197 | pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover) 198 | pred_results += pred_label 199 | gold_results += gold_label 200 | decode_time = time.time() - start_time 201 | speed = len(instances)/decode_time 202 | acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme) 203 | if nbest: 204 | return speed, acc, p, r, f, nbest_pred_results, pred_scores 205 | return speed, acc, p, r, f, pred_results, pred_scores 206 | 207 | 208 | def batchify_with_label(input_batch_list, gpu, volatile_flag=False): 209 | """ 210 | input: list of words, chars and labels, various length. [[words,chars, labels],[words,chars,labels],...] 211 | words: word ids for one sentence. (batch_size, sent_len) 212 | chars: char ids for on sentences, various length. 
(batch_size, sent_len, each_word_length) 213 | output: 214 | zero padding for word and char, with their batch length 215 | word_seq_tensor: (batch_size, max_sent_len) Variable 216 | word_seq_lengths: (batch_size,1) Tensor 217 | char_seq_tensor: (batch_size*max_sent_len, max_word_len) Variable 218 | char_seq_lengths: (batch_size*max_sent_len,1) Tensor 219 | char_seq_recover: (batch_size*max_sent_len,1) recover char sequence order 220 | label_seq_tensor: (batch_size, max_sent_len) 221 | mask: (batch_size, max_sent_len) 222 | """ 223 | batch_size = len(input_batch_list) 224 | words = [sent[0] for sent in input_batch_list] 225 | features = [np.asarray(sent[1]) for sent in input_batch_list] 226 | feature_num = len(features[0][0]) 227 | chars = [sent[2] for sent in input_batch_list] 228 | labels = [sent[3] for sent in input_batch_list] 229 | word_seq_lengths = torch.LongTensor(map(len, words)) 230 | max_seq_len = word_seq_lengths.max() 231 | word_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len)), volatile = volatile_flag).long() 232 | label_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile = volatile_flag).long() 233 | feature_seq_tensors = [] 234 | for idx in range(feature_num): 235 | feature_seq_tensors.append(autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile = volatile_flag).long()) 236 | mask = autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile = volatile_flag).bool() 237 | for idx, (seq, label, seqlen) in enumerate(zip(words, labels, word_seq_lengths)): 238 | word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq) 239 | label_seq_tensor[idx, :seqlen] = torch.LongTensor(label) 240 | mask[idx, :seqlen] = torch.Tensor([1]*seqlen) 241 | for idy in range(feature_num): 242 | feature_seq_tensors[idy][idx,:seqlen] = torch.LongTensor(features[idx][:,idy]) 243 | word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True) 244 | word_seq_tensor = word_seq_tensor[word_perm_idx] 245 | for idx in range(feature_num): 246 | feature_seq_tensors[idx] = feature_seq_tensors[idx][word_perm_idx] 247 | 248 | label_seq_tensor = label_seq_tensor[word_perm_idx] 249 | mask = mask[word_perm_idx] 250 | ### deal with char 251 | # pad_chars (batch_size, max_seq_len) 252 | pad_chars = [chars[idx] + [[0]] * (max_seq_len-len(chars[idx])) for idx in range(len(chars))] 253 | length_list = [map(len, pad_char) for pad_char in pad_chars] 254 | max_word_len = max(map(max, length_list)) 255 | char_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len, max_word_len)), volatile = volatile_flag).long() 256 | char_seq_lengths = torch.LongTensor(length_list) 257 | for idx, (seq, seqlen) in enumerate(zip(pad_chars, char_seq_lengths)): 258 | for idy, (word, wordlen) in enumerate(zip(seq, seqlen)): 259 | # print len(word), wordlen 260 | char_seq_tensor[idx, idy, :wordlen] = torch.LongTensor(word) 261 | 262 | char_seq_tensor = char_seq_tensor[word_perm_idx].view(batch_size*max_seq_len,-1) 263 | char_seq_lengths = char_seq_lengths[word_perm_idx].view(batch_size*max_seq_len,) 264 | char_seq_lengths, char_perm_idx = char_seq_lengths.sort(0, descending=True) 265 | char_seq_tensor = char_seq_tensor[char_perm_idx] 266 | _, char_seq_recover = char_perm_idx.sort(0, descending=False) 267 | _, word_seq_recover = word_perm_idx.sort(0, descending=False) 268 | if gpu: 269 | word_seq_tensor = word_seq_tensor.cuda() 270 | for idx in range(feature_num): 271 | feature_seq_tensors[idx] = feature_seq_tensors[idx].cuda() 272 | word_seq_lengths = 
word_seq_lengths.cuda() 273 | word_seq_recover = word_seq_recover.cuda() 274 | label_seq_tensor = label_seq_tensor.cuda() 275 | char_seq_tensor = char_seq_tensor.cuda() 276 | char_seq_recover = char_seq_recover.cuda() 277 | mask = mask.cuda() 278 | return word_seq_tensor,feature_seq_tensors, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, label_seq_tensor, mask 279 | 280 | 281 | def train(data): 282 | print("Training model...") 283 | data.show_data_summary() 284 | save_data_name = data.model_dir +".dset" 285 | data.save(save_data_name) 286 | model = SeqLabel(data) 287 | loss_function = nn.NLLLoss() 288 | if data.optimizer.lower() == "sgd": 289 | optimizer = optim.SGD(model.parameters(), lr=data.HP_lr, momentum=data.HP_momentum,weight_decay=data.HP_l2) 290 | elif data.optimizer.lower() == "adagrad": 291 | optimizer = optim.Adagrad(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) 292 | elif data.optimizer.lower() == "adadelta": 293 | optimizer = optim.Adadelta(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) 294 | elif data.optimizer.lower() == "rmsprop": 295 | optimizer = optim.RMSprop(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) 296 | elif data.optimizer.lower() == "adam": 297 | optimizer = optim.Adam(model.parameters(), lr=data.HP_lr, weight_decay=data.HP_l2) 298 | else: 299 | print("Optimizer illegal: %s"%(data.optimizer)) 300 | exit(0) 301 | best_dev = -10 302 | # data.HP_iteration = 1 303 | ## start training 304 | for idx in range(data.HP_iteration): 305 | epoch_start = time.time() 306 | temp_start = epoch_start 307 | print("Epoch: %s/%s" %(idx,data.HP_iteration)) 308 | if data.optimizer == "SGD": 309 | optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr) 310 | instance_count = 0 311 | sample_id = 0 312 | sample_loss = 0 313 | total_loss = 0 314 | right_token = 0 315 | whole_token = 0 316 | random.shuffle(data.train_Ids) 317 | ## set model in train model 318 | model.train() 319 | model.zero_grad() 320 | batch_size = data.HP_batch_size 321 | batch_id = 0 322 | train_num = len(data.train_Ids) 323 | total_batch = train_num//batch_size+1 324 | for batch_id in range(total_batch): 325 | start = batch_id*batch_size 326 | end = (batch_id+1)*batch_size 327 | if end >train_num: 328 | end = train_num 329 | instance = data.train_Ids[start:end] 330 | if not instance: 331 | continue 332 | batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask = batchify_with_label(instance, data.HP_gpu) 333 | instance_count += 1 334 | loss, tag_seq = model.neg_log_likelihood_loss(batch_word,batch_features, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask) 335 | right, whole = predict_check(tag_seq, batch_label, mask) 336 | right_token += right 337 | whole_token += whole 338 | sample_loss += loss.data[0] 339 | total_loss += loss.data[0] 340 | if end%500 == 0: 341 | temp_time = time.time() 342 | temp_cost = temp_time - temp_start 343 | temp_start = temp_time 344 | print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token)) 345 | sys.stdout.flush() 346 | sample_loss = 0 347 | loss.backward() 348 | optimizer.step() 349 | model.zero_grad() 350 | temp_time = time.time() 351 | temp_cost = temp_time - temp_start 352 | print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, 
whole_token,(right_token+0.)/whole_token)) 353 | epoch_finish = time.time() 354 | epoch_cost = epoch_finish - epoch_start 355 | print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss)) 356 | # continue 357 | speed, acc, p, r, f, _,_ = evaluate(data, model, "dev") 358 | dev_finish = time.time() 359 | dev_cost = dev_finish - epoch_finish 360 | 361 | if data.seg: 362 | current_score = f 363 | print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f)) 364 | else: 365 | current_score = acc 366 | print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc)) 367 | 368 | if current_score > best_dev: 369 | if data.seg: 370 | print("Exceed previous best f score:", best_dev) 371 | else: 372 | print("Exceed previous best acc score:", best_dev) 373 | model_name = data.model_dir +'.'+ str(idx) + ".model" 374 | print("Save current best model in file:", model_name) 375 | torch.save(model.state_dict(), model_name) 376 | best_dev = current_score 377 | # ## decode test 378 | speed, acc, p, r, f, _,_ = evaluate(data, model, "test") 379 | test_finish = time.time() 380 | test_cost = test_finish - dev_finish 381 | if data.seg: 382 | print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f)) 383 | else: 384 | print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc)) 385 | gc.collect() 386 | 387 | 388 | def load_model_decode(data, name): 389 | print("Load Model from file: ", data.model_dir) 390 | model = SeqLabel(data) 391 | ## load model need consider if the model trained in GPU and load in CPU, or vice versa 392 | # if not gpu: 393 | # model.load_state_dict(torch.load(model_dir)) 394 | # # model.load_state_dict(torch.load(model_dir), map_location=lambda storage, loc: storage) 395 | # # model = torch.load(model_dir, map_location=lambda storage, loc: storage) 396 | # else: 397 | # model.load_state_dict(torch.load(model_dir)) 398 | # # model = torch.load(model_dir) 399 | model.load_state_dict(torch.load(data.load_model_dir)) 400 | 401 | print("Decode %s data, nbest: %s ..."%(name, data.nbest)) 402 | start_time = time.time() 403 | speed, acc, p, r, f, pred_results, pred_scores = evaluate(data, model, name, data.nbest) 404 | end_time = time.time() 405 | time_cost = end_time - start_time 406 | if data.seg: 407 | print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(name, time_cost, speed, acc, p, r, f)) 408 | else: 409 | print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, time_cost, speed, acc)) 410 | return pred_results, pred_scores 411 | 412 | 413 | 414 | 415 | if __name__ == '__main__': 416 | parser = argparse.ArgumentParser(description='Tuning with NCRF++') 417 | parser.add_argument('--wordemb', help='Embedding for words', default='None') 418 | parser.add_argument('--charemb', help='Embedding for chars', default='None') 419 | parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train') 420 | parser.add_argument('--savemodel', default="data/model/saved_model.lstmcrf.") 421 | parser.add_argument('--savedset', help='Dir of saved data setting') 422 | parser.add_argument('--train', default="data/conll03/train.bmes") 423 | parser.add_argument('--dev', default="data/conll03/dev.bmes" ) 424 | parser.add_argument('--test', default="data/conll03/test.bmes") 425 | parser.add_argument('--seg', default="True") 426 | 
parser.add_argument('--raw') 427 | parser.add_argument('--loadmodel') 428 | parser.add_argument('--output') 429 | args = parser.parse_args() 430 | data = Data() 431 | 432 | data.train_dir = args.train 433 | data.dev_dir = args.dev 434 | data.test_dir = args.test 435 | data.model_dir = args.savemodel 436 | data.dset_dir = args.savedset 437 | print("dset directory:",data.dset_dir) 438 | status = args.status.lower() 439 | save_model_dir = args.savemodel 440 | data.HP_gpu = torch.cuda.is_available() 441 | print("Seed num:",seed_num) 442 | data.number_normalized = True 443 | data.word_emb_dir = "../data/glove.6B.100d.txt" 444 | 445 | if status == 'train': 446 | print("MODEL: train") 447 | data_initialization(data) 448 | data.use_char = True 449 | data.HP_batch_size = 10 450 | data.HP_lr = 0.015 451 | data.char_seq_feature = "CNN" 452 | data.generate_instance('train') 453 | data.generate_instance('dev') 454 | data.generate_instance('test') 455 | data.build_pretrain_emb() 456 | train(data) 457 | elif status == 'decode': 458 | print("MODEL: decode") 459 | data.load(data.dset_dir) 460 | data.raw_dir = args.raw 461 | data.decode_dir = args.output 462 | data.load_model_dir = args.loadmodel 463 | data.show_data_summary() 464 | data.generate_instance('raw') 465 | print("nbest: %s"%(data.nbest)) 466 | decode_results, pred_scores = load_model_decode(data, 'raw') 467 | if data.nbest: 468 | data.write_nbest_decoded_results(decode_results, pred_scores, 'raw') 469 | else: 470 | data.write_decoded_results(decode_results, 'raw') 471 | else: 472 | print("Invalid argument! Please use valid arguments! (train/test/decode)") 473 | 474 | 475 | 476 | 477 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | -------------------------------------------------------------------------------- /model/charbigru.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-10-18 11:12:13 6 | from __future__ import print_function 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 10 | import numpy as np 11 | 12 | class CharBiGRU(nn.Module): 13 | def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu, bidirect_flag = True): 14 | super(CharBiGRU, self).__init__() 15 | print("build char sequence feature extractor: GRU ...") 16 | self.gpu = gpu 17 | self.hidden_dim = hidden_dim 18 | if bidirect_flag: 19 | self.hidden_dim = hidden_dim // 2 20 | self.char_drop = nn.Dropout(dropout) 21 | self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim) 22 | if pretrain_char_embedding is not None: 23 | self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding)) 24 | else: 25 | self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim))) 26 | self.char_lstm = nn.GRU(embedding_dim, self.hidden_dim, num_layers=1, batch_first=True, bidirectional=bidirect_flag) 27 | if self.gpu: 28 | self.char_drop = self.char_drop.cuda() 29 | self.char_embeddings = self.char_embeddings.cuda() 30 | self.char_lstm = self.char_lstm.cuda() 31 | 32 | 33 | def random_embedding(self, vocab_size, embedding_dim): 34 | pretrain_emb 
= np.empty([vocab_size, embedding_dim]) 35 | scale = np.sqrt(3.0 / embedding_dim) 36 | for index in range(vocab_size): 37 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim]) 38 | return pretrain_emb 39 | 40 | 41 | def get_last_hiddens(self, input, seq_lengths): 42 | """ 43 | input: 44 | input: Variable(batch_size, word_length) 45 | seq_lengths: numpy array (batch_size, 1) 46 | output: 47 | Variable(batch_size, char_hidden_dim) 48 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 49 | """ 50 | batch_size = input.size(0) 51 | char_embeds = self.char_drop(self.char_embeddings(input)) 52 | char_hidden = None 53 | pack_input = pack_padded_sequence(char_embeds, seq_lengths, True) 54 | char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden) 55 | # char_rnn_out, _ = pad_packed_sequence(char_rnn_out) 56 | return char_hidden.transpose(1,0).contiguous().view(batch_size,-1) 57 | 58 | def get_all_hiddens(self, input, seq_lengths): 59 | """ 60 | input: 61 | input: Variable(batch_size, word_length) 62 | seq_lengths: numpy array (batch_size, 1) 63 | output: 64 | Variable(batch_size, word_length, char_hidden_dim) 65 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 66 | """ 67 | batch_size = input.size(0) 68 | char_embeds = self.char_drop(self.char_embeddings(input)) 69 | char_hidden = None 70 | pack_input = pack_padded_sequence(char_embeds, seq_lengths, True) 71 | char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden) 72 | char_rnn_out, _ = pad_packed_sequence(char_rnn_out) 73 | return char_rnn_out.transpose(1,0) 74 | 75 | 76 | def forward(self, input, seq_lengths): 77 | return self.get_all_hiddens(input, seq_lengths) 78 | -------------------------------------------------------------------------------- /model/charbilstm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-10-18 11:19:37 6 | from __future__ import print_function 7 | import torch 8 | import torch.nn as nn 9 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 10 | import numpy as np 11 | 12 | class CharBiLSTM(nn.Module): 13 | def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu, bidirect_flag = True): 14 | super(CharBiLSTM, self).__init__() 15 | print("build char sequence feature extractor: LSTM ...") 16 | self.gpu = gpu 17 | self.hidden_dim = hidden_dim 18 | if bidirect_flag: 19 | self.hidden_dim = hidden_dim // 2 20 | self.char_drop = nn.Dropout(dropout) 21 | self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim) 22 | if pretrain_char_embedding is not None: 23 | self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding)) 24 | else: 25 | self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim))) 26 | self.char_lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=1, batch_first=True, bidirectional=bidirect_flag) 27 | if self.gpu: 28 | self.char_drop = self.char_drop.cuda() 29 | self.char_embeddings = self.char_embeddings.cuda() 30 | self.char_lstm = self.char_lstm.cuda() 31 | 32 | 33 | def random_embedding(self, vocab_size, embedding_dim): 34 | pretrain_emb = np.empty([vocab_size, embedding_dim]) 35 | scale = np.sqrt(3.0 / embedding_dim) 36 | for index 
in range(vocab_size): 37 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim]) 38 | return pretrain_emb 39 | 40 | 41 | def get_last_hiddens(self, input, seq_lengths): 42 | """ 43 | input: 44 | input: Variable(batch_size, word_length) 45 | seq_lengths: numpy array (batch_size, 1) 46 | output: 47 | Variable(batch_size, char_hidden_dim) 48 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 49 | """ 50 | batch_size = input.size(0) 51 | char_embeds = self.char_drop(self.char_embeddings(input)) 52 | char_hidden = None 53 | pack_input = pack_padded_sequence(char_embeds, seq_lengths, True) 54 | char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden) 55 | ## char_hidden = (h_t, c_t) 56 | # char_hidden[0] = h_t = (2, batch_size, lstm_dimension) 57 | # char_rnn_out, _ = pad_packed_sequence(char_rnn_out) 58 | return char_hidden[0].transpose(1,0).contiguous().view(batch_size,-1) 59 | 60 | def get_all_hiddens(self, input, seq_lengths): 61 | """ 62 | input: 63 | input: Variable(batch_size, word_length) 64 | seq_lengths: numpy array (batch_size, 1) 65 | output: 66 | Variable(batch_size, word_length, char_hidden_dim) 67 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 68 | """ 69 | batch_size = input.size(0) 70 | char_embeds = self.char_drop(self.char_embeddings(input)) 71 | char_hidden = None 72 | pack_input = pack_padded_sequence(char_embeds, seq_lengths, True) 73 | char_rnn_out, char_hidden = self.char_lstm(pack_input, char_hidden) 74 | char_rnn_out, _ = pad_packed_sequence(char_rnn_out) 75 | return char_rnn_out.transpose(1,0) 76 | 77 | 78 | def forward(self, input, seq_lengths): 79 | return self.get_all_hiddens(input, seq_lengths) 80 | -------------------------------------------------------------------------------- /model/charcnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-01-18 21:06:06 6 | from __future__ import print_function 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | import numpy as np 11 | 12 | class CharCNN(nn.Module): 13 | def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu): 14 | super(CharCNN, self).__init__() 15 | print("build char sequence feature extractor: CNN ...") 16 | self.gpu = gpu 17 | self.hidden_dim = hidden_dim 18 | self.char_drop = nn.Dropout(dropout) 19 | self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim) 20 | if pretrain_char_embedding is not None: 21 | self.char_embeddings.weight.data.copy_(torch.from_numpy(pretrain_char_embedding)) 22 | else: 23 | self.char_embeddings.weight.data.copy_(torch.from_numpy(self.random_embedding(alphabet_size, embedding_dim))) 24 | self.char_cnn = nn.Conv1d(embedding_dim, self.hidden_dim, kernel_size=3, padding=1) 25 | if self.gpu: 26 | self.char_drop = self.char_drop.cuda() 27 | self.char_embeddings = self.char_embeddings.cuda() 28 | self.char_cnn = self.char_cnn.cuda() 29 | 30 | 31 | def random_embedding(self, vocab_size, embedding_dim): 32 | pretrain_emb = np.empty([vocab_size, embedding_dim]) 33 | scale = np.sqrt(3.0 / embedding_dim) 34 | for index in range(vocab_size): 35 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim]) 36 | return pretrain_emb 37 | 38 | 39 | def 
get_last_hiddens(self, input, seq_lengths): 40 | """ 41 | input: 42 | input: Variable(batch_size, word_length) 43 | seq_lengths: numpy array (batch_size, 1) 44 | output: 45 | Variable(batch_size, char_hidden_dim) 46 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 47 | """ 48 | batch_size = input.size(0) 49 | char_embeds = self.char_drop(self.char_embeddings(input)) 50 | char_embeds = char_embeds.transpose(2,1).contiguous() 51 | char_cnn_out = self.char_cnn(char_embeds) 52 | char_cnn_out = F.max_pool1d(char_cnn_out, char_cnn_out.size(2)).view(batch_size, -1) 53 | return char_cnn_out 54 | 55 | def get_all_hiddens(self, input, seq_lengths): 56 | """ 57 | input: 58 | input: Variable(batch_size, word_length) 59 | seq_lengths: numpy array (batch_size, 1) 60 | output: 61 | Variable(batch_size, word_length, char_hidden_dim) 62 | Note it only accepts ordered (length) variable, length size is recorded in seq_lengths 63 | """ 64 | batch_size = input.size(0) 65 | char_embeds = self.char_drop(self.char_embeddings(input)) 66 | char_embeds = char_embeds.transpose(2,1).contiguous() 67 | char_cnn_out = self.char_cnn(char_embeds).transpose(2,1).contiguous() 68 | return char_cnn_out 69 | 70 | 71 | 72 | def forward(self, input, seq_lengths): 73 | return self.get_all_hiddens(input, seq_lengths) 74 | -------------------------------------------------------------------------------- /model/crf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-12-04 23:19:38 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-12-16 22:15:56 6 | from __future__ import print_function 7 | import torch 8 | import torch.autograd as autograd 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | START_TAG = -2 12 | STOP_TAG = -1 13 | 14 | 15 | # Compute log sum exp in a numerically stable way for the forward algorithm 16 | def log_sum_exp(vec, m_size): 17 | """ 18 | calculate log of exp sum 19 | args: 20 | vec (batch_size, vanishing_dim, hidden_dim) : input tensor 21 | m_size : hidden_dim 22 | return: 23 | batch_size, hidden_dim 24 | """ 25 | _, idx = torch.max(vec, 1) # B * 1 * M 26 | max_score = torch.gather(vec, 1, idx.view(-1, 1, m_size)).view(-1, 1, m_size) # B * M 27 | return max_score.view(-1, m_size) + torch.log(torch.sum(torch.exp(vec - max_score.expand_as(vec)), 1)).view(-1, m_size) # B * M 28 | 29 | class CRF(nn.Module): 30 | 31 | def __init__(self, tagset_size, gpu): 32 | super(CRF, self).__init__() 33 | print("build CRF...") 34 | self.gpu = gpu 35 | # Matrix of transition parameters. Entry i,j is the score of transitioning from i to j. 
36 | self.tagset_size = tagset_size 37 | # # We add 2 here, because of START_TAG and STOP_TAG 38 | # # transitions (f_tag_size, t_tag_size), transition value from f_tag to t_tag 39 | init_transitions = torch.zeros(self.tagset_size+2, self.tagset_size+2) 40 | init_transitions[:,START_TAG] = -10000.0 41 | init_transitions[STOP_TAG,:] = -10000.0 42 | init_transitions[:,0] = -10000.0 43 | init_transitions[0,:] = -10000.0 44 | if self.gpu: 45 | init_transitions = init_transitions.cuda() 46 | self.transitions = nn.Parameter(init_transitions) 47 | 48 | # self.transitions = nn.Parameter(torch.Tensor(self.tagset_size+2, self.tagset_size+2)) 49 | # self.transitions.data.zero_() 50 | 51 | def _calculate_PZ(self, feats, mask): 52 | """ 53 | input: 54 | feats: (batch, seq_len, self.tag_size+2) 55 | masks: (batch, seq_len) 56 | """ 57 | batch_size = feats.size(0) 58 | seq_len = feats.size(1) 59 | tag_size = feats.size(2) 60 | # print feats.view(seq_len, tag_size) 61 | assert(tag_size == self.tagset_size+2) 62 | mask = mask.transpose(1,0).contiguous() 63 | ins_num = seq_len * batch_size 64 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 65 | feats = feats.transpose(1,0).contiguous().view(ins_num,1, tag_size).expand(ins_num, tag_size, tag_size) 66 | ## need to consider start 67 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 68 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 69 | # build iter 70 | seq_iter = enumerate(scores) 71 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 72 | # only need start from start_tag 73 | partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size, 1) # bat_size * to_target_size 74 | 75 | ## add start score (from start to all tag, duplicate to batch_size) 76 | # partition = partition + self.transitions[START_TAG,:].view(1, tag_size, 1).expand(batch_size, tag_size, 1) 77 | # iter over last scores 78 | for idx, cur_values in seq_iter: 79 | # previous to_target is current from_target 80 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 81 | # cur_values: bat_size * from_target * to_target 82 | 83 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 84 | cur_partition = log_sum_exp(cur_values, tag_size) 85 | # print cur_partition.data 86 | 87 | # (bat_size * from_target * to_target) -> (bat_size * to_target) 88 | # partition = utils.switch(partition, cur_partition, mask[idx].view(bat_size, 1).expand(bat_size, self.tagset_size)).view(bat_size, -1) 89 | mask_idx = mask[idx, :].view(batch_size, 1).expand(batch_size, tag_size) 90 | 91 | ## effective updated partition part, only keep the partition value of mask value = 1 92 | masked_cur_partition = cur_partition.masked_select(mask_idx) 93 | ## let mask_idx broadcastable, to disable warning 94 | mask_idx = mask_idx.contiguous().view(batch_size, tag_size, 1) 95 | 96 | ## replace the partition where the maskvalue=1, other partition value keeps the same 97 | partition.masked_scatter_(mask_idx, masked_cur_partition) 98 | # until the last state, add transition score for all partition (and do log_sum_exp) then select the value in STOP_TAG 99 | cur_values = self.transitions.view(1,tag_size, tag_size).expand(batch_size, tag_size, tag_size) + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 100 | cur_partition = log_sum_exp(cur_values, 
tag_size) 101 | final_partition = cur_partition[:, STOP_TAG] 102 | return final_partition.sum(), scores 103 | 104 | 105 | def _viterbi_decode(self, feats, mask): 106 | """ 107 | input: 108 | feats: (batch, seq_len, self.tag_size+2) 109 | mask: (batch, seq_len) 110 | output: 111 | decode_idx: (batch, seq_len) decoded sequence 112 | path_score: (batch, 1) corresponding score for each sequence (to be implementated) 113 | """ 114 | batch_size = feats.size(0) 115 | seq_len = feats.size(1) 116 | tag_size = feats.size(2) 117 | assert(tag_size == self.tagset_size+2) 118 | ## calculate sentence length for each sentence 119 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 120 | ## mask to (seq_len, batch_size) 121 | mask = mask.transpose(1,0).contiguous() 122 | ins_num = seq_len * batch_size 123 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 124 | feats = feats.transpose(1,0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 125 | ## need to consider start 126 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 127 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 128 | 129 | # build iter 130 | seq_iter = enumerate(scores) 131 | ## record the position of best score 132 | back_points = list() 133 | partition_history = list() 134 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 135 | # mask = 1 + (-1)*mask 136 | mask = (1 - mask.long()).bool() 137 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 138 | # only need start from start_tag 139 | partition = inivalues[:, START_TAG, :].clone().view(batch_size, tag_size) # bat_size * to_target_size 140 | # print "init part:",partition.size() 141 | partition_history.append(partition) 142 | # iter over last scores 143 | for idx, cur_values in seq_iter: 144 | # previous to_target is current from_target 145 | # partition: previous results log(exp(from_target)), #(batch_size * from_target) 146 | # cur_values: batch_size * from_target * to_target 147 | cur_values = cur_values + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 148 | ## forscores, cur_bp = torch.max(cur_values[:,:-2,:], 1) # do not consider START_TAG/STOP_TAG 149 | # print "cur value:", cur_values.size() 150 | partition, cur_bp = torch.max(cur_values, 1) 151 | # print "partsize:",partition.size() 152 | # exit(0) 153 | # print partition 154 | # print cur_bp 155 | # print "one best, ",idx 156 | partition_history.append(partition) 157 | ## cur_bp: (batch_size, tag_size) max source score position in current tag 158 | ## set padded label as 0, which will be filtered in post processing 159 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1).expand(batch_size, tag_size), 0) 160 | back_points.append(cur_bp) 161 | # exit(0) 162 | ### add score to final STOP_TAG 163 | partition_history = torch.cat(partition_history, 0).view(seq_len, batch_size, -1).transpose(1,0).contiguous() ## (batch_size, seq_len. 
tag_size) 164 | ### get the last position for each setences, and select the last partitions using gather() 165 | last_position = length_mask.view(batch_size,1,1).expand(batch_size, 1, tag_size) -1 166 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size,tag_size,1) 167 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 168 | last_values = last_partition.expand(batch_size, tag_size, tag_size) + self.transitions.view(1,tag_size, tag_size).expand(batch_size, tag_size, tag_size) 169 | _, last_bp = torch.max(last_values, 1) 170 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size)).long() 171 | if self.gpu: 172 | pad_zero = pad_zero.cuda() 173 | back_points.append(pad_zero) 174 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size) 175 | 176 | ## select end ids in STOP_TAG 177 | pointer = last_bp[:, STOP_TAG] 178 | insert_last = pointer.contiguous().view(batch_size,1,1).expand(batch_size,1, tag_size) 179 | back_points = back_points.transpose(1,0).contiguous() 180 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 181 | # print "lp:",last_position 182 | # print "il:",insert_last 183 | back_points.scatter_(1, last_position, insert_last) 184 | # print "bp:",back_points 185 | # exit(0) 186 | back_points = back_points.transpose(1,0).contiguous() 187 | ## decode from the end, padded position ids are 0, which will be filtered if following evaluation 188 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size)) 189 | if self.gpu: 190 | decode_idx = decode_idx.cuda() 191 | decode_idx[-1] = pointer.detach() 192 | for idx in range(len(back_points)-2, -1, -1): 193 | pointer = torch.gather(back_points[idx], 1, pointer.contiguous().view(batch_size, 1)) 194 | decode_idx[idx] = pointer.detach().view(batch_size) 195 | path_score = None 196 | decode_idx = decode_idx.transpose(1,0) 197 | return path_score, decode_idx 198 | 199 | 200 | 201 | def forward(self, feats): 202 | path_score, best_path = self._viterbi_decode(feats) 203 | return path_score, best_path 204 | 205 | 206 | def _score_sentence(self, scores, mask, tags): 207 | """ 208 | input: 209 | scores: variable (seq_len, batch, tag_size, tag_size) 210 | mask: (batch, seq_len) 211 | tags: tensor (batch, seq_len) 212 | output: 213 | score: sum of score for gold sequences within whole batch 214 | """ 215 | # Gives the score of a provided tag sequence 216 | batch_size = scores.size(1) 217 | seq_len = scores.size(0) 218 | tag_size = scores.size(2) 219 | ## convert tag value into a new format, recorded label bigram information to index 220 | new_tags = autograd.Variable(torch.LongTensor(batch_size, seq_len)) 221 | if self.gpu: 222 | new_tags = new_tags.cuda() 223 | for idx in range(seq_len): 224 | if idx == 0: 225 | ## start -> first score 226 | new_tags[:,0] = (tag_size - 2)*tag_size + tags[:,0] 227 | 228 | else: 229 | new_tags[:,idx] = tags[:,idx-1]*tag_size + tags[:,idx] 230 | 231 | ## transition for label to STOP_TAG 232 | end_transition = self.transitions[:,STOP_TAG].contiguous().view(1, tag_size).expand(batch_size, tag_size) 233 | ## length for batch, last word position = length - 1 234 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 235 | ## index the label id of last word 236 | end_ids = torch.gather(tags, 1, length_mask - 1) 237 | 238 | ## index the transition score for end_id to STOP_TAG 239 | end_energy = torch.gather(end_transition, 1, end_ids) 
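        # new_tags encodes each gold bigram (previous_tag, current_tag) as the single index
        # previous_tag * tag_size + current_tag, so the gather over the flattened
        # (tag_size * tag_size) score matrix below picks out emission + transition for every position;
        # end_energy above adds the transition from the last gold label of each sentence to STOP_TAG.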
240 | 241 | ## convert tag as (seq_len, batch_size, 1) 242 | new_tags = new_tags.transpose(1,0).contiguous().view(seq_len, batch_size, 1) 243 | ### need convert tags id to search from 400 positions of scores 244 | tg_energy = torch.gather(scores.view(seq_len, batch_size, -1), 2, new_tags).view(seq_len, batch_size) # seq_len * bat_size 245 | ## mask transpose to (seq_len, batch_size) 246 | tg_energy = tg_energy.masked_select(mask.transpose(1,0)) 247 | 248 | # ## calculate the score from START_TAG to first label 249 | # start_transition = self.transitions[START_TAG,:].view(1, tag_size).expand(batch_size, tag_size) 250 | # start_energy = torch.gather(start_transition, 1, tags[0,:]) 251 | 252 | ## add all score together 253 | # gold_score = start_energy.sum() + tg_energy.sum() + end_energy.sum() 254 | gold_score = tg_energy.sum() + end_energy.sum() 255 | return gold_score 256 | 257 | def neg_log_likelihood_loss(self, feats, mask, tags): 258 | # nonegative log likelihood 259 | batch_size = feats.size(0) 260 | forward_score, scores = self._calculate_PZ(feats, mask) 261 | gold_score = self._score_sentence(scores, mask, tags) 262 | # print "batch, f:", forward_score.data[0], " g:", gold_score.data[0], " dis:", forward_score.data[0] - gold_score.data[0] 263 | # exit(0) 264 | return forward_score - gold_score 265 | 266 | 267 | 268 | def _viterbi_decode_nbest(self, feats, mask, nbest): 269 | """ 270 | input: 271 | feats: (batch, seq_len, self.tag_size+2) 272 | mask: (batch, seq_len) 273 | output: 274 | decode_idx: (batch, nbest, seq_len) decoded sequence 275 | path_score: (batch, nbest) corresponding score for each sequence (to be implementated) 276 | nbest decode for sentence with one token is not well supported, to be optimized 277 | """ 278 | batch_size = feats.size(0) 279 | seq_len = feats.size(1) 280 | tag_size = feats.size(2) 281 | assert(tag_size == self.tagset_size+2) 282 | ## calculate sentence length for each sentence 283 | length_mask = torch.sum(mask.long(), dim = 1).view(batch_size,1).long() 284 | ## mask to (seq_len, batch_size) 285 | mask = mask.transpose(1,0).contiguous() 286 | ins_num = seq_len * batch_size 287 | ## be careful the view shape, it is .view(ins_num, 1, tag_size) but not .view(ins_num, tag_size, 1) 288 | feats = feats.transpose(1,0).contiguous().view(ins_num, 1, tag_size).expand(ins_num, tag_size, tag_size) 289 | ## need to consider start 290 | scores = feats + self.transitions.view(1,tag_size,tag_size).expand(ins_num, tag_size, tag_size) 291 | scores = scores.view(seq_len, batch_size, tag_size, tag_size) 292 | 293 | # build iter 294 | seq_iter = enumerate(scores) 295 | ## record the position of best score 296 | back_points = list() 297 | partition_history = list() 298 | ## reverse mask (bug for mask = 1- mask, use this as alternative choice) 299 | # mask = 1 + (-1)*mask 300 | mask = (1 - mask.long()).bool() 301 | _, inivalues = next(seq_iter) # bat_size * from_target_size * to_target_size 302 | # only need start from start_tag 303 | partition = inivalues[:, START_TAG, :].clone() # bat_size * to_target_size 304 | ## initial partition [batch_size, tag_size] 305 | partition_history.append(partition.view(batch_size, tag_size, 1).expand(batch_size, tag_size, nbest)) 306 | # iter over last scores 307 | for idx, cur_values in seq_iter: 308 | if idx == 1: 309 | cur_values = cur_values.view(batch_size, tag_size, tag_size) + partition.contiguous().view(batch_size, tag_size, 1).expand(batch_size, tag_size, tag_size) 310 | else: 311 | # previous to_target is current from_target 312 
| # partition: previous results log(exp(from_target)), #(batch_size * nbest * from_target) 313 | # cur_values: batch_size * from_target * to_target 314 | cur_values = cur_values.view(batch_size, tag_size, 1, tag_size).expand(batch_size, tag_size, nbest, tag_size) + partition.contiguous().view(batch_size, tag_size, nbest, 1).expand(batch_size, tag_size, nbest, tag_size) 315 | ## compare all nbest and all from target 316 | cur_values = cur_values.view(batch_size, tag_size*nbest, tag_size) 317 | # print "cur size:",cur_values.size() 318 | partition, cur_bp = torch.topk(cur_values, nbest, 1) 319 | ## cur_bp/partition: [batch_size, nbest, tag_size], id should be normize through nbest in following backtrace step 320 | # print partition[:,0,:] 321 | # print cur_bp[:,0,:] 322 | # print "nbest, ",idx 323 | if idx == 1: 324 | cur_bp = cur_bp*nbest 325 | partition = partition.transpose(2,1) 326 | cur_bp = cur_bp.transpose(2,1) 327 | 328 | # print partition 329 | # exit(0) 330 | #partition: (batch_size * to_target * nbest) 331 | #cur_bp: (batch_size * to_target * nbest) Notice the cur_bp number is the whole position of tag_size*nbest, need to convert when decode 332 | partition_history.append(partition) 333 | ## cur_bp: (batch_size,nbest, tag_size) topn source score position in current tag 334 | ## set padded label as 0, which will be filtered in post processing 335 | ## mask[idx] ? mask[idx-1] 336 | cur_bp.masked_fill_(mask[idx].view(batch_size, 1, 1).expand(batch_size, tag_size, nbest), 0) 337 | # print cur_bp[0] 338 | back_points.append(cur_bp) 339 | ### add score to final STOP_TAG 340 | partition_history = torch.cat(partition_history,0).view(seq_len, batch_size, tag_size, nbest).transpose(1,0).contiguous() ## (batch_size, seq_len, nbest, tag_size) 341 | ### get the last position for each setences, and select the last partitions using gather() 342 | last_position = length_mask.view(batch_size,1,1,1).expand(batch_size, 1, tag_size, nbest) - 1 343 | last_partition = torch.gather(partition_history, 1, last_position).view(batch_size, tag_size, nbest, 1) 344 | ### calculate the score from last partition to end state (and then select the STOP_TAG from it) 345 | last_values = last_partition.expand(batch_size, tag_size, nbest, tag_size) + self.transitions.view(1, tag_size, 1, tag_size).expand(batch_size, tag_size, nbest, tag_size) 346 | last_values = last_values.view(batch_size, tag_size*nbest, tag_size) 347 | end_partition, end_bp = torch.topk(last_values, nbest, 1) 348 | ## end_partition: (batch, nbest, tag_size) 349 | end_bp = end_bp.transpose(2,1) 350 | # end_bp: (batch, tag_size, nbest) 351 | pad_zero = autograd.Variable(torch.zeros(batch_size, tag_size, nbest)).long() 352 | if self.gpu: 353 | pad_zero = pad_zero.cuda() 354 | back_points.append(pad_zero) 355 | back_points = torch.cat(back_points).view(seq_len, batch_size, tag_size, nbest) 356 | 357 | ## select end ids in STOP_TAG 358 | pointer = end_bp[:, STOP_TAG, :] ## (batch_size, nbest) 359 | insert_last = pointer.contiguous().view(batch_size, 1, 1, nbest).expand(batch_size, 1, tag_size, nbest) 360 | back_points = back_points.transpose(1,0).contiguous() 361 | ## move the end ids(expand to tag_size) to the corresponding position of back_points to replace the 0 values 362 | # print "lp:",last_position 363 | # print "il:",insert_last[0] 364 | # exit(0) 365 | ## copy the ids of last position:insert_last to back_points, though the last_position index 366 | ## last_position includes the length of batch sentences 367 | # print "old:", 
back_points[9,0,:,:] 368 | back_points.scatter_(1, last_position, insert_last) 369 | ## back_points: [batch_size, seq_length, tag_size, nbest] 370 | # print "new:", back_points[9,0,:,:] 371 | # exit(0) 372 | # print pointer[2] 373 | ''' 374 | back_points: in simple demonstratration 375 | x,x,x,x,x,x,x,x,x,7 376 | x,x,x,x,x,4,0,0,0,0 377 | x,x,6,0,0,0,0,0,0,0 378 | ''' 379 | 380 | back_points = back_points.transpose(1,0).contiguous() 381 | # print back_points[0] 382 | ## back_points: (seq_len, batch, tag_size, nbest) 383 | ## decode from the end, padded position ids are 0, which will be filtered in following evaluation 384 | decode_idx = autograd.Variable(torch.LongTensor(seq_len, batch_size, nbest)) 385 | if self.gpu: 386 | decode_idx = decode_idx.cuda() 387 | decode_idx[-1] = pointer.data/nbest 388 | # print "pointer-1:",pointer[2] 389 | # exit(0) 390 | # use old mask, let 0 means has token 391 | for idx in range(len(back_points)-2, -1, -1): 392 | # print "pointer: ",idx, pointer[3] 393 | # print "back:",back_points[idx][3] 394 | # print "mask:",mask[idx+1,3] 395 | new_pointer = torch.gather(back_points[idx].view(batch_size, tag_size*nbest), 1, pointer.contiguous().view(batch_size,nbest)) 396 | decode_idx[idx] = new_pointer.data/nbest 397 | # # use new pointer to remember the last end nbest ids for non longest 398 | pointer = new_pointer + pointer.contiguous().view(batch_size,nbest)*mask[idx].view(batch_size,1).expand(batch_size, nbest).long() 399 | 400 | # exit(0) 401 | path_score = None 402 | decode_idx = decode_idx.transpose(1,0) 403 | ## decode_idx: [batch, seq_len, nbest] 404 | # print decode_idx[:,:,0] 405 | # print "nbest:",nbest 406 | # print "diff:", decode_idx[:,:,0]- decode_idx[:,:,4] 407 | # print decode_idx[:,0,:] 408 | # exit(0) 409 | 410 | ### calculate probability for each sequence 411 | scores = end_partition[:, :, STOP_TAG] 412 | ## scores: [batch_size, nbest] 413 | max_scores,_ = torch.max(scores, 1) 414 | minus_scores = scores - max_scores.view(batch_size,1).expand(batch_size, nbest) 415 | path_score = F.softmax(minus_scores, 1) 416 | ## path_score: [batch_size, nbest] 417 | # exit(0) 418 | return path_score, decode_idx 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | -------------------------------------------------------------------------------- /model/sentclassifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2019-01-01 21:11:50 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-13 12:30:56 6 | 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from .wordsequence import WordSequence 13 | 14 | class SentClassifier(nn.Module): 15 | def __init__(self, data): 16 | super(SentClassifier, self).__init__() 17 | print("build sentence classification network...") 18 | print("use_char: ", data.use_char) 19 | if data.use_char: 20 | print("char feature extractor: ", data.char_feature_extractor) 21 | print("word feature extractor: ", data.word_feature_extractor) 22 | 23 | self.gpu = data.HP_gpu 24 | self.average_batch = data.average_batch_loss 25 | label_size = data.label_alphabet_size 26 | self.word_hidden = WordSequence(data) 27 | 28 | 29 | 30 | def calculate_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, 
char_seq_lengths, char_seq_recover, batch_label, mask): 31 | outs = self.word_hidden.sentence_representation(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 32 | batch_size = word_inputs.size(0) 33 | # loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='sum') 34 | outs = outs.view(batch_size, -1) 35 | # print("a",outs) 36 | # score = F.log_softmax(outs, 1) 37 | # print(score.size(), batch_label.view(batch_size).size()) 38 | # print(score) 39 | # print(batch_label) 40 | # exit(0) 41 | total_loss = F.cross_entropy(outs, batch_label.view(batch_size)) 42 | # total_loss = loss_function(score, batch_label.view(batch_size)) 43 | 44 | _, tag_seq = torch.max(outs, 1) 45 | if self.average_batch: 46 | total_loss = total_loss / batch_size 47 | return total_loss, tag_seq 48 | 49 | 50 | def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): 51 | outs = self.word_hidden.sentence_representation(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 52 | batch_size = word_inputs.size(0) 53 | outs = outs.view(batch_size, -1) 54 | _, tag_seq = torch.max(outs, 1) 55 | # if a == 0: 56 | # print(tag_seq) 57 | return tag_seq 58 | 59 | 60 | -------------------------------------------------------------------------------- /model/seqlabel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-13 11:49:38 6 | 7 | from __future__ import print_function 8 | from __future__ import absolute_import 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from .wordsequence import WordSequence 13 | from .crf import CRF 14 | 15 | class SeqLabel(nn.Module): 16 | def __init__(self, data): 17 | super(SeqLabel, self).__init__() 18 | self.use_crf = data.use_crf 19 | print("build sequence labeling network...") 20 | print("use_char: ", data.use_char) 21 | if data.use_char: 22 | print("char feature extractor: ", data.char_feature_extractor) 23 | print("word feature extractor: ", data.word_feature_extractor) 24 | print("use crf: ", self.use_crf) 25 | 26 | self.gpu = data.HP_gpu 27 | self.average_batch = data.average_batch_loss 28 | ## add two more label for downlayer lstm, use original label size for CRF 29 | label_size = data.label_alphabet_size 30 | data.label_alphabet_size += 2 31 | self.word_hidden = WordSequence(data) 32 | if self.use_crf: 33 | self.crf = CRF(label_size, self.gpu) 34 | 35 | 36 | def calculate_loss(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, batch_label, mask): 37 | outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 38 | batch_size = word_inputs.size(0) 39 | seq_len = word_inputs.size(1) 40 | if self.use_crf: 41 | total_loss = self.crf.neg_log_likelihood_loss(outs, mask, batch_label) 42 | scores, tag_seq = self.crf._viterbi_decode(outs, mask) 43 | else: 44 | loss_function = nn.NLLLoss(ignore_index=0, size_average=False) 45 | outs = outs.view(batch_size * seq_len, -1) 46 | score = F.log_softmax(outs, 1) 47 | total_loss = loss_function(score, batch_label.view(batch_size * seq_len)) 48 | _, tag_seq = torch.max(score, 1) 49 | tag_seq = tag_seq.view(batch_size, seq_len) 50 | if self.average_batch: 
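            # ave_batch_loss=True in the config: normalize the summed loss by the batch size so the
            # gradient scale stays roughly independent of the batch size.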
51 | total_loss = total_loss / batch_size 52 | return total_loss, tag_seq 53 | 54 | 55 | def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask): 56 | outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 57 | batch_size = word_inputs.size(0) 58 | seq_len = word_inputs.size(1) 59 | if self.use_crf: 60 | scores, tag_seq = self.crf._viterbi_decode(outs, mask) 61 | else: 62 | outs = outs.view(batch_size * seq_len, -1) 63 | _, tag_seq = torch.max(outs, 1) 64 | tag_seq = tag_seq.view(batch_size, seq_len) 65 | ## filter padded position with zero 66 | tag_seq = mask.long() * tag_seq 67 | return tag_seq 68 | 69 | 70 | # def get_lstm_features(self, word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 71 | # return self.word_hidden(word_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 72 | 73 | 74 | def decode_nbest(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, nbest): 75 | if not self.use_crf: 76 | print("Nbest output is currently supported only for CRF! Exit...") 77 | exit(0) 78 | outs = self.word_hidden(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 79 | batch_size = word_inputs.size(0) 80 | seq_len = word_inputs.size(1) 81 | scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest) 82 | return scores, tag_seq 83 | 84 | -------------------------------------------------------------------------------- /model/wordrep.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-01 15:52:01 6 | from __future__ import print_function 7 | from __future__ import absolute_import 8 | import torch 9 | import torch.nn as nn 10 | import numpy as np 11 | from .charbilstm import CharBiLSTM 12 | from .charbigru import CharBiGRU 13 | from .charcnn import CharCNN 14 | 15 | class WordRep(nn.Module): 16 | def __init__(self, data): 17 | super(WordRep, self).__init__() 18 | print("build word representation...") 19 | self.gpu = data.HP_gpu 20 | self.use_char = data.use_char 21 | self.batch_size = data.HP_batch_size 22 | self.char_hidden_dim = 0 23 | self.char_all_feature = False 24 | self.sentence_classification = data.sentence_classification 25 | if self.use_char: 26 | self.char_hidden_dim = data.HP_char_hidden_dim 27 | self.char_embedding_dim = data.char_emb_dim 28 | if data.char_feature_extractor == "CNN": 29 | self.char_feature = CharCNN(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, data.HP_dropout, self.gpu) 30 | elif data.char_feature_extractor == "LSTM": 31 | self.char_feature = CharBiLSTM(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, data.HP_dropout, self.gpu) 32 | elif data.char_feature_extractor == "GRU": 33 | self.char_feature = CharBiGRU(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, data.HP_dropout, self.gpu) 34 | elif data.char_feature_extractor == "ALL": 35 | self.char_all_feature = True 36 | self.char_feature = CharCNN(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, 
data.HP_dropout, self.gpu) 37 | self.char_feature_extra = CharBiLSTM(data.char_alphabet.size(), data.pretrain_char_embedding, self.char_embedding_dim, self.char_hidden_dim, data.HP_dropout, self.gpu) 38 | else: 39 | print("Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM/GRU/ALL).") 40 | exit(0) 41 | self.embedding_dim = data.word_emb_dim 42 | self.drop = nn.Dropout(data.HP_dropout) 43 | self.word_embedding = nn.Embedding(data.word_alphabet.size(), self.embedding_dim) 44 | if data.pretrain_word_embedding is not None: 45 | self.word_embedding.weight.data.copy_(torch.from_numpy(data.pretrain_word_embedding)) 46 | else: 47 | self.word_embedding.weight.data.copy_(torch.from_numpy(self.random_embedding(data.word_alphabet.size(), self.embedding_dim))) 48 | 49 | self.feature_num = data.feature_num 50 | self.feature_embedding_dims = data.feature_emb_dims 51 | self.feature_embeddings = nn.ModuleList() 52 | for idx in range(self.feature_num): 53 | self.feature_embeddings.append(nn.Embedding(data.feature_alphabets[idx].size(), self.feature_embedding_dims[idx])) 54 | for idx in range(self.feature_num): 55 | if data.pretrain_feature_embeddings[idx] is not None: 56 | self.feature_embeddings[idx].weight.data.copy_(torch.from_numpy(data.pretrain_feature_embeddings[idx])) 57 | else: 58 | self.feature_embeddings[idx].weight.data.copy_(torch.from_numpy(self.random_embedding(data.feature_alphabets[idx].size(), self.feature_embedding_dims[idx]))) 59 | 60 | if self.gpu: 61 | self.drop = self.drop.cuda() 62 | self.word_embedding = self.word_embedding.cuda() 63 | for idx in range(self.feature_num): 64 | self.feature_embeddings[idx] = self.feature_embeddings[idx].cuda() 65 | 66 | 67 | 68 | def random_embedding(self, vocab_size, embedding_dim): 69 | pretrain_emb = np.empty([vocab_size, embedding_dim]) 70 | scale = np.sqrt(3.0 / embedding_dim) 71 | for index in range(vocab_size): 72 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedding_dim]) 73 | return pretrain_emb 74 | 75 | 76 | def forward(self, word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 77 | """ 78 | input: 79 | word_inputs: (batch_size, sent_len) 80 | features: list [(batch_size, sent_len), (batch_len, sent_len),...] 
81 | word_seq_lengths: list of batch_size, (batch_size,1) 82 | char_inputs: (batch_size*sent_len, word_length) 83 | char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1) 84 | char_seq_recover: variable which records the char order information, used to recover char order 85 | output: 86 | Variable(batch_size, sent_len, hidden_dim) 87 | """ 88 | batch_size = word_inputs.size(0) 89 | sent_len = word_inputs.size(1) 90 | 91 | word_embs = self.word_embedding(word_inputs) 92 | 93 | word_list = [word_embs] 94 | if not self.sentence_classification: 95 | for idx in range(self.feature_num): 96 | word_list.append(self.feature_embeddings[idx](feature_inputs[idx])) 97 | if self.use_char: 98 | ## calculate char lstm last hidden 99 | # print("charinput:", char_inputs) 100 | # exit(0) 101 | char_features = self.char_feature.get_last_hiddens(char_inputs, char_seq_lengths.cpu().numpy()) 102 | char_features = char_features[char_seq_recover] 103 | char_features = char_features.view(batch_size,sent_len,-1) 104 | ## concat word and char together 105 | word_list.append(char_features) 106 | word_embs = torch.cat([word_embs, char_features], 2) 107 | if self.char_all_feature: 108 | char_features_extra = self.char_feature_extra.get_last_hiddens(char_inputs, char_seq_lengths.cpu().numpy()) 109 | char_features_extra = char_features_extra[char_seq_recover] 110 | char_features_extra = char_features_extra.view(batch_size,sent_len,-1) 111 | ## concat word and char together 112 | word_list.append(char_features_extra) 113 | word_embs = torch.cat(word_list, 2) 114 | # if a == 0: 115 | # print("inputs", word_inputs) 116 | # print("embeddings:", word_embs) 117 | word_represent = self.drop(word_embs) 118 | return word_represent 119 | -------------------------------------------------------------------------------- /model/wordsequence.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-10-17 16:47:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-01 15:59:26 6 | from __future__ import print_function 7 | from __future__ import absolute_import 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 12 | from .wordrep import WordRep 13 | 14 | class WordSequence(nn.Module): 15 | def __init__(self, data): 16 | super(WordSequence, self).__init__() 17 | print("build word sequence feature extractor: %s..."%(data.word_feature_extractor)) 18 | self.gpu = data.HP_gpu 19 | self.use_char = data.use_char 20 | # self.batch_size = data.HP_batch_size 21 | # self.hidden_dim = data.HP_hidden_dim 22 | self.droplstm = nn.Dropout(data.HP_dropout) 23 | self.bilstm_flag = data.HP_bilstm 24 | self.lstm_layer = data.HP_lstm_layer 25 | self.wordrep = WordRep(data) 26 | self.input_size = data.word_emb_dim 27 | self.feature_num = data.feature_num 28 | if self.use_char: 29 | self.input_size += data.HP_char_hidden_dim 30 | if data.char_feature_extractor == "ALL": 31 | self.input_size += data.HP_char_hidden_dim 32 | for idx in range(self.feature_num): 33 | self.input_size += data.feature_emb_dims[idx] 34 | # The LSTM takes word embeddings as inputs, and outputs hidden states 35 | # with dimensionality hidden_dim. 
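        # When the word-level RNN is bidirectional, each direction is given HP_hidden_dim // 2 units,
        # so the concatenated forward/backward outputs still match HP_hidden_dim, the input size
        # expected by the hidden2tag linear layer defined below.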
36 | if self.bilstm_flag: 37 | lstm_hidden = data.HP_hidden_dim // 2 38 | else: 39 | lstm_hidden = data.HP_hidden_dim 40 | 41 | self.word_feature_extractor = data.word_feature_extractor 42 | if self.word_feature_extractor == "GRU": 43 | self.lstm = nn.GRU(self.input_size, lstm_hidden, num_layers=self.lstm_layer, batch_first=True, bidirectional=self.bilstm_flag) 44 | elif self.word_feature_extractor == "LSTM": 45 | self.lstm = nn.LSTM(self.input_size, lstm_hidden, num_layers=self.lstm_layer, batch_first=True, bidirectional=self.bilstm_flag) 46 | elif self.word_feature_extractor == "CNN": 47 | # cnn_hidden = data.HP_hidden_dim 48 | self.word2cnn = nn.Linear(self.input_size, data.HP_hidden_dim) 49 | self.cnn_layer = data.HP_cnn_layer 50 | print("CNN layer: ", self.cnn_layer) 51 | self.cnn_list = nn.ModuleList() 52 | self.cnn_drop_list = nn.ModuleList() 53 | self.cnn_batchnorm_list = nn.ModuleList() 54 | kernel = 3 55 | pad_size = int((kernel-1)/2) 56 | for idx in range(self.cnn_layer): 57 | self.cnn_list.append(nn.Conv1d(data.HP_hidden_dim, data.HP_hidden_dim, kernel_size=kernel, padding=pad_size)) 58 | self.cnn_drop_list.append(nn.Dropout(data.HP_dropout)) 59 | self.cnn_batchnorm_list.append(nn.BatchNorm1d(data.HP_hidden_dim)) 60 | # The linear layer that maps from hidden state space to tag space 61 | self.hidden2tag = nn.Linear(data.HP_hidden_dim, data.label_alphabet_size) 62 | 63 | if self.gpu: 64 | self.droplstm = self.droplstm.cuda() 65 | self.hidden2tag = self.hidden2tag.cuda() 66 | if self.word_feature_extractor == "CNN": 67 | self.word2cnn = self.word2cnn.cuda() 68 | for idx in range(self.cnn_layer): 69 | self.cnn_list[idx] = self.cnn_list[idx].cuda() 70 | self.cnn_drop_list[idx] = self.cnn_drop_list[idx].cuda() 71 | self.cnn_batchnorm_list[idx] = self.cnn_batchnorm_list[idx].cuda() 72 | else: 73 | self.lstm = self.lstm.cuda() 74 | 75 | 76 | def forward(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 77 | """ 78 | input: 79 | word_inputs: (batch_size, sent_len) 80 | feature_inputs: [(batch_size, sent_len), ...] 
list of variables 81 | word_seq_lengths: list of batch_size, (batch_size,1) 82 | char_inputs: (batch_size*sent_len, word_length) 83 | char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1) 84 | char_seq_recover: variable which records the char order information, used to recover char order 85 | output: 86 | Variable(batch_size, sent_len, hidden_dim) 87 | """ 88 | 89 | word_represent = self.wordrep(word_inputs,feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 90 | ## word_embs (batch_size, seq_len, embed_size) 91 | if self.word_feature_extractor == "CNN": 92 | batch_size = word_inputs.size(0) 93 | word_in = torch.tanh(self.word2cnn(word_represent)).transpose(2,1).contiguous() 94 | for idx in range(self.cnn_layer): 95 | if idx == 0: 96 | cnn_feature = F.relu(self.cnn_list[idx](word_in)) 97 | else: 98 | cnn_feature = F.relu(self.cnn_list[idx](cnn_feature)) 99 | cnn_feature = self.cnn_drop_list[idx](cnn_feature) 100 | if batch_size > 1: 101 | cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature) 102 | feature_out = cnn_feature.transpose(2,1).contiguous() 103 | else: 104 | packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True) 105 | hidden = None 106 | lstm_out, hidden = self.lstm(packed_words, hidden) 107 | lstm_out, _ = pad_packed_sequence(lstm_out) 108 | ## lstm_out (seq_len, seq_len, hidden_size) 109 | feature_out = self.droplstm(lstm_out.transpose(1,0)) 110 | ## feature_out (batch_size, seq_len, hidden_size) 111 | outputs = self.hidden2tag(feature_out) 112 | return outputs 113 | 114 | def sentence_representation(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover): 115 | """ 116 | input: 117 | word_inputs: (batch_size, sent_len) 118 | feature_inputs: [(batch_size, ), ...] 
list of variables 119 | word_seq_lengths: list of batch_size, (batch_size,1) 120 | char_inputs: (batch_size*sent_len, word_length) 121 | char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1) 122 | char_seq_recover: variable which records the char order information, used to recover char order 123 | output: 124 | Variable(batch_size, sent_len, hidden_dim) 125 | """ 126 | 127 | word_represent = self.wordrep(word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover) 128 | ## word_embs (batch_size, seq_len, embed_size) 129 | batch_size = word_inputs.size(0) 130 | if self.word_feature_extractor == "CNN": 131 | word_in = torch.tanh(self.word2cnn(word_represent)).transpose(2,1).contiguous() 132 | for idx in range(self.cnn_layer): 133 | if idx == 0: 134 | cnn_feature = F.relu(self.cnn_list[idx](word_in)) 135 | else: 136 | cnn_feature = F.relu(self.cnn_list[idx](cnn_feature)) 137 | cnn_feature = self.cnn_drop_list[idx](cnn_feature) 138 | if batch_size > 1: 139 | cnn_feature = self.cnn_batchnorm_list[idx](cnn_feature) 140 | feature_out = F.max_pool1d(cnn_feature, cnn_feature.size(2)).view(batch_size, -1) 141 | else: 142 | packed_words = pack_padded_sequence(word_represent, word_seq_lengths.cpu().numpy(), True) 143 | hidden = None 144 | lstm_out, hidden = self.lstm(packed_words, hidden) 145 | ## lstm_out (seq_len, seq_len, hidden_size) 146 | ## feature_out (batch_size, hidden_size) 147 | feature_out = hidden[0].transpose(1,0).contiguous().view(batch_size,-1) 148 | 149 | feature_list = [feature_out] 150 | for idx in range(self.feature_num): 151 | feature_list.append(self.feature_embeddings[idx](feature_inputs[idx])) 152 | final_feature = torch.cat(feature_list, 1) 153 | outputs = self.hidden2tag(self.droplstm(final_feature)) 154 | ## outputs: (batch_size, label_alphabet_size) 155 | return outputs 156 | -------------------------------------------------------------------------------- /readme/Configuration.md: -------------------------------------------------------------------------------- 1 | ### I/O ### 2 | ```Python 3 | train_dir=xx #string (necessary in training). Set training file directory. 4 | dev_dir=xx #string (necessary in training). Set dev file directory. 5 | test_dir=xx #string . Set test file directory. 6 | model_dir=xx #string (optional). Set saved model file directory. 7 | word_emb_dir=xx #string (optional). Set pretrained word embedding file directory. 8 | 9 | raw_dir=xx #string (optional). Set input raw file directory. 10 | decode_dir=xx #string (necessary in decoding). Set decoded file directory. 11 | dset_dir=xx #string (necessary). Set saved model file directory. 12 | load_model_dir=xx #string (necessary in decoding). Set loaded model file directory. (when decoding) 13 | char_emb_dir=xx #string (optional). Set pretrained character embedding file directory. 14 | 15 | norm_word_emb=False #boolen. If normalize the pretrained word embedding. 16 | norm_char_emb=False #boolen. If normalize the pretrained character embedding. 17 | number_normalized=True #boolen. If normalize the digit into `0` for input files. 18 | seg=True #boolen. If task is segmentation like, tasks with token accuracy evaluation (e.g. POS, CCG) is False; tasks with F-value evaluation(e.g. Word Segmentation, NER, Chunking) is True . 19 | word_emb_dim=50 #int. Word embedding dimension, if model use pretrained word embedding, word_emb_dim will be reset as the same dimension as pretrained embedidng. 20 | char_emb_dim=30 #int. 
Character embedding dimension; if the model uses a pretrained character embedding, char_emb_dim will be reset to the same dimension as the pretrained embedding.
21 | ```
22 | 
23 | ### NetworkConfiguration ###
24 | ```Python
25 | use_crf=True #boolean (necessary in training). Whether to use the CRF layer. If set to False, Softmax is used in the inference layer.
26 | use_char=True #boolean (necessary in training). Whether to use the character sequence layer.
27 | word_seq_feature=XX #string (necessary in training): CNN/LSTM/GRU. Neural structure selection for the word sequence layer.
28 | char_seq_feature=CNN #string (necessary in training): CNN/LSTM/GRU. Neural structure selection for the character sequence layer; it is only used when use_char=True.
29 | feature=[POS] emb_size=20 emb_dir=xx #feature configuration. It includes the feature prefix [POS], the pretrained feature embedding file and the embedding size.
30 | feature=[Cap] emb_size=20 emb_dir=xx #feature configuration. Another feature [Cap].
31 | nbest=1 #int (necessary in decoding). Set the nbest size during decoding.
32 | ```
33 | 
34 | ### TrainingSetting ###
35 | ```Python
36 | status=train #string: train or decode. Set whether the program runs in training or decoding mode.
37 | optimizer=SGD #string: SGD/Adagrad/AdaDelta/RMSprop/Adam. Optimizer selection.
38 | iteration=1 #int. Set the number of training iterations.
39 | batch_size=10 #int. Set the batch size for training or decoding.
40 | ave_batch_loss=False #boolean. Whether to average the batched loss during training.
41 | ```
42 | 
43 | ### Hyperparameters ###
44 | ```Python
45 | cnn_layer=4 #int. CNN layer number for the word sequence layer.
46 | char_hidden_dim=50 #int. Character hidden vector dimension for the character sequence layer.
47 | hidden_dim=200 #int. Word hidden vector dimension for the word sequence layer.
48 | dropout=0.5 #float. Dropout probability.
49 | lstm_layer=1 #int. LSTM layer number for the word sequence layer.
50 | bilstm=True #boolean. Whether to use a bidirectional LSTM for the word sequence layer.
51 | learning_rate=0.015 #float. Learning rate.
52 | lr_decay=0.05 #float. Learning rate decay rate; only works when optimizer=SGD.
53 | momentum=0 #float. Momentum.
54 | l2=1e-8 #float. L2 regularization.
55 | #gpu=True #boolean. Whether to use the GPU; generally this depends on the hardware environment.
56 | #clip= #float. Clip gradients that are larger than the set value.
57 | ```
58 | 
-------------------------------------------------------------------------------- /readme/Extension.md: --------------------------------------------------------------------------------
1 | ### Module Extension ###
2 | 
3 | If you want to extend the character sequence layer, please refer to the file [charbilstm.py](model/charbilstm.py).
4 | 
5 | If you want to extend the word sequence layer, please refer to the file [wordsequence.py](model/wordsequence.py).
6 | 
7 | More details will be updated soon. 
-------------------------------------------------------------------------------- /readme/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiesutd/NCRFpp/105a53a321eca9c1280037c473967858e01aaa43/readme/architecture.png
-------------------------------------------------------------------------------- /readme/hyperparameter_tuning.md: --------------------------------------------------------------------------------
1 | ## Hyperparameter tuning on the CoNLL 2003 English NER task
2 | 
3 | 1. If you use a large batch size (e.g.
batch_size > 100), you'd better set `avg_batch_loss=True` to get a stable training process. For small batch size, `avg_batch_loss=True` will converge faster and sometimes gives better performance (e.g. CoNLL 2003 NER). 4 | 2. You can get better performance on the CoNLL 2003 English dataset if you use 100-d pretrained word vectors [here](https://nlp.stanford.edu/projects/glove/) instead of 50-d pretrained word vectors. 5 | 3. If you want to write a script to tune hyperparameters, you can use the `main_parse.py` to set hyperparameters in command line arguements. 6 | 4. Model performance is sensitive with `lr` which needs to be carefully tuned under different structures: 7 | * Word level LSTM models (e.g. char LSTM + word LSTM + CRF) would prefer a `lr` around 0.015. 8 | * Word level CNN models (e.g. char LSTM + word CNN + CRF) would prefer a `lr` around 0.005 and with more iterations. 9 | * You can refer the COLING paper "[Design Challenges and Misconceptions in Neural Sequence Labeling](https://arxiv.org/pdf/1806.04470.pdf)" for more hyperparameter settings. 10 | -------------------------------------------------------------------------------- /readme/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiesutd/NCRFpp/105a53a321eca9c1280037c473967858e01aaa43/readme/logo.png -------------------------------------------------------------------------------- /readme/nbest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiesutd/NCRFpp/105a53a321eca9c1280037c473967858e01aaa43/readme/nbest.png -------------------------------------------------------------------------------- /readme/speed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiesutd/NCRFpp/105a53a321eca9c1280037c473967858e01aaa43/readme/speed.png -------------------------------------------------------------------------------- /sample_data/dev.bmes: -------------------------------------------------------------------------------- 1 | CRICKET O 2 | - O 3 | LEICESTERSHIRE S-ORG 4 | TAKE O 5 | OVER O 6 | AT O 7 | TOP O 8 | AFTER O 9 | INNINGS O 10 | VICTORY O 11 | . O 12 | 13 | LONDON S-LOC 14 | 1996-08-30 O 15 | 16 | West B-MISC 17 | Indian E-MISC 18 | all-rounder O 19 | Phil B-PER 20 | Simmons E-PER 21 | took O 22 | four O 23 | for O 24 | 38 O 25 | on O 26 | Friday O 27 | as O 28 | Leicestershire S-ORG 29 | beat O 30 | Somerset S-ORG 31 | by O 32 | an O 33 | innings O 34 | and O 35 | 39 O 36 | runs O 37 | in O 38 | two O 39 | days O 40 | to O 41 | take O 42 | over O 43 | at O 44 | the O 45 | head O 46 | of O 47 | the O 48 | county O 49 | championship O 50 | . O 51 | 52 | Their O 53 | stay O 54 | on O 55 | top O 56 | , O 57 | though O 58 | , O 59 | may O 60 | be O 61 | short-lived O 62 | as O 63 | title O 64 | rivals O 65 | Essex S-ORG 66 | , O 67 | Derbyshire S-ORG 68 | and O 69 | Surrey S-ORG 70 | all O 71 | closed O 72 | in O 73 | on O 74 | victory O 75 | while O 76 | Kent S-ORG 77 | made O 78 | up O 79 | for O 80 | lost O 81 | time O 82 | in O 83 | their O 84 | rain-affected O 85 | match O 86 | against O 87 | Nottinghamshire S-ORG 88 | . 
O 89 | 90 | After O 91 | bowling O 92 | Somerset S-ORG 93 | out O 94 | for O 95 | 83 O 96 | on O 97 | the O 98 | opening O 99 | morning O 100 | at O 101 | Grace B-LOC 102 | Road E-LOC 103 | , O 104 | Leicestershire S-ORG 105 | extended O 106 | their O 107 | first O 108 | innings O 109 | by O 110 | 94 O 111 | runs O 112 | before O 113 | being O 114 | bowled O 115 | out O 116 | for O 117 | 296 O 118 | with O 119 | England S-LOC 120 | discard O 121 | Andy B-PER 122 | Caddick E-PER 123 | taking O 124 | three O 125 | for O 126 | 83 O 127 | . O 128 | 129 | Trailing O 130 | by O 131 | 213 O 132 | , O 133 | Somerset S-ORG 134 | got O 135 | a O 136 | solid O 137 | start O 138 | to O 139 | their O 140 | second O 141 | innings O 142 | before O 143 | Simmons S-PER 144 | stepped O 145 | in O 146 | to O 147 | bundle O 148 | them O 149 | out O 150 | for O 151 | 174 O 152 | . O 153 | 154 | Essex S-ORG 155 | , O 156 | however O 157 | , O 158 | look O 159 | certain O 160 | to O 161 | regain O 162 | their O 163 | top O 164 | spot O 165 | after O 166 | Nasser B-PER 167 | Hussain E-PER 168 | and O 169 | Peter B-PER 170 | Such E-PER 171 | gave O 172 | them O 173 | a O 174 | firm O 175 | grip O 176 | on O 177 | their O 178 | match O 179 | against O 180 | Yorkshire S-ORG 181 | at O 182 | Headingley S-LOC 183 | . O 184 | 185 | Hussain S-PER 186 | , O 187 | considered O 188 | surplus O 189 | to O 190 | England S-LOC 191 | 's O 192 | one-day O 193 | requirements O 194 | , O 195 | struck O 196 | 158 O 197 | , O 198 | his O 199 | first O 200 | championship O 201 | century O 202 | of O 203 | the O 204 | season O 205 | , O 206 | as O 207 | Essex S-ORG 208 | reached O 209 | 372 O 210 | and O 211 | took O 212 | a O 213 | first O 214 | innings O 215 | lead O 216 | of O 217 | 82 O 218 | . O 219 | 220 | By O 221 | the O 222 | close O 223 | Yorkshire S-ORG 224 | had O 225 | turned O 226 | that O 227 | into O 228 | a O 229 | 37-run O 230 | advantage O 231 | but O 232 | off-spinner O 233 | Such S-PER 234 | had O 235 | scuttled O 236 | their O 237 | hopes O 238 | , O 239 | taking O 240 | four O 241 | for O 242 | 24 O 243 | in O 244 | 48 O 245 | balls O 246 | and O 247 | leaving O 248 | them O 249 | hanging O 250 | on O 251 | 119 O 252 | for O 253 | five O 254 | and O 255 | praying O 256 | for O 257 | rain O 258 | . O 259 | 260 | At O 261 | the O 262 | Oval S-LOC 263 | , O 264 | Surrey S-ORG 265 | captain O 266 | Chris B-PER 267 | Lewis E-PER 268 | , O 269 | another O 270 | man O 271 | dumped O 272 | by O 273 | England S-LOC 274 | , O 275 | continued O 276 | to O 277 | silence O 278 | his O 279 | critics O 280 | as O 281 | he O 282 | followed O 283 | his O 284 | four O 285 | for O 286 | 45 O 287 | on O 288 | Thursday O 289 | with O 290 | 80 O 291 | not O 292 | out O 293 | on O 294 | Friday O 295 | in O 296 | the O 297 | match O 298 | against O 299 | Warwickshire S-ORG 300 | . O 301 | 302 | He O 303 | was O 304 | well O 305 | backed O 306 | by O 307 | England S-LOC 308 | hopeful O 309 | Mark B-PER 310 | Butcher E-PER 311 | who O 312 | made O 313 | 70 O 314 | as O 315 | Surrey S-ORG 316 | closed O 317 | on O 318 | 429 O 319 | for O 320 | seven O 321 | , O 322 | a O 323 | lead O 324 | of O 325 | 234 O 326 | . 
O 327 | 328 | Derbyshire S-ORG 329 | kept O 330 | up O 331 | the O 332 | hunt O 333 | for O 334 | their O 335 | first O 336 | championship O 337 | title O 338 | since O 339 | 1936 O 340 | by O 341 | reducing O 342 | Worcestershire S-ORG 343 | to O 344 | 133 O 345 | for O 346 | five O 347 | in O 348 | their O 349 | second O 350 | innings O 351 | , O 352 | still O 353 | 100 O 354 | runs O 355 | away O 356 | from O 357 | avoiding O 358 | an O 359 | innings O 360 | defeat O 361 | . O 362 | 363 | Australian S-MISC 364 | Tom B-PER 365 | Moody E-PER 366 | took O 367 | six O 368 | for O 369 | 82 O 370 | but O 371 | Chris B-PER 372 | Adams E-PER 373 | , O 374 | 123 O 375 | , O 376 | and O 377 | Tim B-PER 378 | O'Gorman E-PER 379 | , O 380 | 109 O 381 | , O 382 | took O 383 | Derbyshire S-ORG 384 | to O 385 | 471 O 386 | and O 387 | a O 388 | first O 389 | innings O 390 | lead O 391 | of O 392 | 233 O 393 | . O 394 | 395 | After O 396 | the O 397 | frustration O 398 | of O 399 | seeing O 400 | the O 401 | opening O 402 | day O 403 | of O 404 | their O 405 | match O 406 | badly O 407 | affected O 408 | by O 409 | the O 410 | weather O 411 | , O 412 | Kent S-ORG 413 | stepped O 414 | up O 415 | a O 416 | gear O 417 | to O 418 | dismiss O 419 | Nottinghamshire S-ORG 420 | for O 421 | 214 O 422 | . O 423 | 424 | They O 425 | were O 426 | held O 427 | up O 428 | by O 429 | a O 430 | gritty O 431 | 84 O 432 | from O 433 | Paul B-PER 434 | Johnson E-PER 435 | but O 436 | ex-England S-MISC 437 | fast O 438 | bowler O 439 | Martin B-PER 440 | McCague E-PER 441 | took O 442 | four O 443 | for O 444 | 55 O 445 | . O 446 | 447 | By O 448 | stumps O 449 | Kent S-ORG 450 | had O 451 | reached O 452 | 108 O 453 | for O 454 | three O 455 | . O 456 | 457 | -DOCSTART- O 458 | 459 | CRICKET O 460 | - O 461 | ENGLISH B-MISC 462 | COUNTY I-MISC 463 | CHAMPIONSHIP E-MISC 464 | SCORES O 465 | . O 466 | 467 | LONDON S-LOC 468 | 1996-08-30 O 469 | 470 | Result O 471 | and O 472 | close O 473 | of O 474 | play O 475 | scores O 476 | in O 477 | English S-MISC 478 | county O 479 | championship O 480 | matches O 481 | on O 482 | Friday O 483 | : O 484 | 485 | Leicester S-LOC 486 | : O 487 | Leicestershire S-ORG 488 | beat O 489 | Somerset S-ORG 490 | by O 491 | an O 492 | innings O 493 | and O 494 | 39 O 495 | runs O 496 | . O 497 | 498 | Somerset S-ORG 499 | 83 O 500 | and O 501 | 174 O 502 | ( O 503 | P. B-PER 504 | Simmons E-PER 505 | 4-38 O 506 | ) O 507 | , O 508 | Leicestershire S-ORG 509 | 296 O 510 | . O 511 | 512 | Leicestershire S-ORG 513 | 22 O 514 | points O 515 | , O 516 | Somerset S-ORG 517 | 4 O 518 | . O 519 | 520 | Chester-le-Street S-LOC 521 | : O 522 | Glamorgan S-ORG 523 | 259 O 524 | and O 525 | 207 O 526 | ( O 527 | A. B-PER 528 | Dale E-PER 529 | 69 O 530 | , O 531 | H. B-PER 532 | Morris E-PER 533 | 69 O 534 | ; O 535 | D. B-PER 536 | Blenkiron E-PER 537 | 4-43 O 538 | ) O 539 | , O 540 | Durham S-ORG 541 | 114 O 542 | ( O 543 | S. B-PER 544 | Watkin E-PER 545 | 4-28 O 546 | ) O 547 | and O 548 | 81-3 O 549 | . O 550 | 551 | Tunbridge B-LOC 552 | Wells E-LOC 553 | : O 554 | Nottinghamshire S-ORG 555 | 214 O 556 | ( O 557 | P. B-PER 558 | Johnson E-PER 559 | 84 O 560 | ; O 561 | M. B-PER 562 | McCague E-PER 563 | 4-55 O 564 | ) O 565 | , O 566 | Kent S-ORG 567 | 108-3 O 568 | . O 569 | 570 | London S-LOC 571 | ( O 572 | The B-LOC 573 | Oval E-LOC 574 | ) O 575 | : O 576 | Warwickshire S-ORG 577 | 195 O 578 | , O 579 | Surrey S-ORG 580 | 429-7 O 581 | ( O 582 | C. 
B-PER 583 | Lewis E-PER 584 | 80 O 585 | not O 586 | out O 587 | , O 588 | M. B-PER 589 | Butcher E-PER 590 | 70 O 591 | , O 592 | G. B-PER 593 | Kersey E-PER 594 | 63 O 595 | , O 596 | J. B-PER 597 | Ratcliffe E-PER 598 | 63 O 599 | , O 600 | D. B-PER 601 | Bicknell E-PER 602 | 55 O 603 | ) O 604 | . O 605 | 606 | Hove S-LOC 607 | : O 608 | Sussex S-ORG 609 | 363 O 610 | ( O 611 | W. B-PER 612 | Athey E-PER 613 | 111 O 614 | , O 615 | V. B-PER 616 | Drakes E-PER 617 | 52 O 618 | ; O 619 | I. B-PER 620 | Austin E-PER 621 | 4-37 O 622 | ) O 623 | , O 624 | Lancashire S-ORG 625 | 197-8 O 626 | ( O 627 | W. B-PER 628 | Hegg E-PER 629 | 54 O 630 | ) O 631 | 632 | Portsmouth S-LOC 633 | : O 634 | Middlesex S-ORG 635 | 199 O 636 | and O 637 | 426 O 638 | ( O 639 | J. B-PER 640 | Pooley E-PER 641 | 111 O 642 | , O 643 | M. B-PER 644 | Ramprakash E-PER 645 | 108 O 646 | , O 647 | M. B-PER 648 | Gatting E-PER 649 | 83 O 650 | ) O 651 | , O 652 | Hampshire S-ORG 653 | 232 O 654 | and O 655 | 109-5 O 656 | . O 657 | 658 | Chesterfield S-LOC 659 | : O 660 | Worcestershire S-ORG 661 | 238 O 662 | and O 663 | 133-5 O 664 | , O 665 | Derbyshire S-ORG 666 | 471 O 667 | ( O 668 | J. B-PER 669 | Adams E-PER 670 | 123 O 671 | , O 672 | T.O'Gorman S-PER 673 | 109 O 674 | not O 675 | out O 676 | , O 677 | K. B-PER 678 | Barnett E-PER 679 | 87 O 680 | ; O 681 | T. B-PER 682 | Moody E-PER 683 | 6-82 O 684 | ) O 685 | 686 | Bristol S-LOC 687 | : O 688 | Gloucestershire S-ORG 689 | 183 O 690 | and O 691 | 185-6 O 692 | ( O 693 | J. B-PER 694 | Russell E-PER 695 | 56 O 696 | not O 697 | out O 698 | ) O 699 | , O 700 | Northamptonshire S-ORG 701 | 190 O 702 | ( O 703 | K. B-PER 704 | Curran E-PER 705 | 52 O 706 | ; O 707 | A. B-PER 708 | Smith E-PER 709 | 5-68 O 710 | ) O 711 | . O 712 | 713 | -DOCSTART- O 714 | 715 | CRICKET O 716 | - O 717 | 1997 O 718 | ASHES S-MISC 719 | INTINERARY O 720 | . O 721 | 722 | LONDON S-LOC 723 | 1996-08-30 O 724 | 725 | Australia S-LOC 726 | will O 727 | defend O 728 | the O 729 | Ashes S-MISC 730 | in O 731 | 732 | a O 733 | six-test O 734 | series O 735 | against O 736 | England S-LOC 737 | during O 738 | a O 739 | four-month O 740 | tour O 741 | 742 | starting O 743 | on O 744 | May O 745 | 13 O 746 | next O 747 | year O 748 | , O 749 | the O 750 | Test B-ORG 751 | and I-ORG 752 | County I-ORG 753 | Cricket I-ORG 754 | Board E-ORG 755 | 756 | said O 757 | on O 758 | Friday O 759 | . O 760 | 761 | Australia S-LOC 762 | will O 763 | also O 764 | play O 765 | three O 766 | one-day O 767 | internationals O 768 | and O 769 | 770 | four O 771 | one-day O 772 | warm-up O 773 | matches O 774 | at O 775 | the O 776 | start O 777 | of O 778 | the O 779 | tour O 780 | . O 781 | 782 | The O 783 | tourists O 784 | will O 785 | play O 786 | nine O 787 | first-class O 788 | matches O 789 | against O 790 | 791 | English S-MISC 792 | county O 793 | sides O 794 | and O 795 | another O 796 | against O 797 | British B-ORG 798 | Universities E-ORG 799 | , O 800 | 801 | as O 802 | well O 803 | as O 804 | one-day O 805 | matches O 806 | against O 807 | the O 808 | Minor B-ORG 809 | Counties E-ORG 810 | and O 811 | 812 | Scotland S-LOC 813 | . 
O 814 | 815 | Tour O 816 | itinerary O 817 | : O 818 | 819 | May O 820 | 821 | May O 822 | 13 O 823 | Arrive O 824 | in O 825 | London S-LOC 826 | 827 | May O 828 | 14 O 829 | Practice O 830 | at O 831 | Lord B-LOC 832 | 's E-LOC 833 | 834 | May O 835 | 15 O 836 | v O 837 | Duke B-ORG 838 | of I-ORG 839 | Norfolk I-ORG 840 | 's I-ORG 841 | XI E-ORG 842 | ( O 843 | at O 844 | Arundel S-LOC 845 | ) O 846 | 847 | May O 848 | 17 O 849 | v O 850 | Northampton S-ORG 851 | 852 | May O 853 | 18 O 854 | v O 855 | Worcestershire S-ORG 856 | 857 | May O 858 | 20 O 859 | v O 860 | Durham S-ORG 861 | 862 | May O 863 | 22 O 864 | First O 865 | one-day O 866 | international O 867 | ( O 868 | at O 869 | Headingley S-LOC 870 | , O 871 | 872 | Leeds S-ORG 873 | ) O 874 | 875 | May O 876 | 24 O 877 | Second O 878 | one-day O 879 | international O 880 | ( O 881 | at O 882 | The B-LOC 883 | Oval E-LOC 884 | , O 885 | 886 | London S-LOC 887 | ) O 888 | 889 | May O 890 | 25 O 891 | Third O 892 | one-day O 893 | international O 894 | ( O 895 | at O 896 | Lord B-LOC 897 | 's E-LOC 898 | , O 899 | London S-LOC 900 | ) O 901 | 902 | May O 903 | 27-29 O 904 | v O 905 | Gloucestershire S-ORG 906 | or O 907 | Sussex S-ORG 908 | or O 909 | Surrey S-ORG 910 | ( O 911 | three O 912 | 913 | days O 914 | ) O 915 | 916 | May O 917 | 31 O 918 | - O 919 | June O 920 | 2 O 921 | v O 922 | Derbyshire S-ORG 923 | ( O 924 | three O 925 | days O 926 | ) O 927 | 928 | June O 929 | 930 | June O 931 | 5-9 O 932 | First O 933 | test O 934 | match O 935 | ( O 936 | at O 937 | Edgbaston S-LOC 938 | , O 939 | Birmingham S-LOC 940 | ) O 941 | 942 | June O 943 | 11-13 O 944 | v O 945 | a O 946 | first O 947 | class O 948 | county O 949 | ( O 950 | to O 951 | be O 952 | confirmed O 953 | ) O 954 | 955 | June O 956 | 14-16 O 957 | v O 958 | Leicestershire S-ORG 959 | ( O 960 | three O 961 | days O 962 | ) O 963 | 964 | June O 965 | 19-23 O 966 | Second O 967 | test O 968 | ( O 969 | at O 970 | Lord B-LOC 971 | 's E-LOC 972 | ) O 973 | 974 | June O 975 | 25-27 O 976 | v O 977 | British B-ORG 978 | Universities E-ORG 979 | ( O 980 | at O 981 | Oxford S-LOC 982 | , O 983 | three O 984 | days O 985 | ) O 986 | 987 | June O 988 | 28-30 O 989 | v O 990 | Hampshire S-ORG 991 | ( O 992 | three O 993 | days O 994 | ) O 995 | 996 | July O 997 | 998 | July O 999 | 3-7 O 1000 | Third O 1001 | test O 1002 | ( O 1003 | at O 1004 | Old B-LOC 1005 | Trafford E-LOC 1006 | , O 1007 | Manchester S-LOC 1008 | ) O 1009 | 1010 | July O 1011 | 9 O 1012 | v O 1013 | Minor B-ORG 1014 | Counties I-ORG 1015 | XI E-ORG 1016 | 1017 | July O 1018 | 12 O 1019 | v O 1020 | Scotland S-LOC 1021 | 1022 | July O 1023 | 16-18 O 1024 | v O 1025 | Glamorgan S-ORG 1026 | ( O 1027 | three O 1028 | days O 1029 | ) O 1030 | 1031 | July O 1032 | 19-21 O 1033 | v O 1034 | Middlesex S-ORG 1035 | ( O 1036 | three O 1037 | days O 1038 | ) O 1039 | 1040 | July O 1041 | 24-28 O 1042 | Fourth O 1043 | test O 1044 | ( O 1045 | at O 1046 | Headingley S-LOC 1047 | ) O 1048 | 1049 | August O 1050 | 1051 | August O 1052 | 1-4 O 1053 | v O 1054 | Somerset S-ORG 1055 | ( O 1056 | four O 1057 | days O 1058 | ) O 1059 | 1060 | August O 1061 | 7-11 O 1062 | Fifth O 1063 | test O 1064 | ( O 1065 | at O 1066 | Trent B-LOC 1067 | Bridge E-LOC 1068 | , O 1069 | Nottingham S-LOC 1070 | ) O 1071 | 1072 | August O 1073 | 16-18 O 1074 | v O 1075 | Kent S-ORG 1076 | ( O 1077 | three O 1078 | days O 1079 | ) O 1080 | 1081 | August O 1082 | 21-25 O 1083 | Sixth O 1084 | test O 1085 | ( O 1086 | at O 1087 | The 
B-LOC 1088 | Oval E-LOC 1089 | , O 1090 | London S-LOC 1091 | ) O 1092 | . O 1093 | 1094 | -DOCSTART- O 1095 | 1096 | SOCCER O 1097 | - O 1098 | SHEARER S-PER 1099 | NAMED O 1100 | AS O 1101 | ENGLAND S-LOC 1102 | CAPTAIN O 1103 | . O 1104 | 1105 | LONDON S-LOC 1106 | 1996-08-30 O 1107 | 1108 | The O 1109 | world O 1110 | 's O 1111 | costliest O 1112 | footballer O 1113 | Alan B-PER 1114 | Shearer E-PER 1115 | was O 1116 | named O 1117 | as O 1118 | the O 1119 | new O 1120 | England S-LOC 1121 | captain O 1122 | on O 1123 | Friday O 1124 | . O 1125 | 1126 | The O 1127 | 26-year-old O 1128 | , O 1129 | who O 1130 | joined O 1131 | Newcastle S-ORG 1132 | for O 1133 | 15 O 1134 | million O 1135 | pounds O 1136 | sterling O 1137 | ( O 1138 | $ O 1139 | 23.4 O 1140 | million O 1141 | ) O 1142 | , O 1143 | takes O 1144 | over O 1145 | from O 1146 | Tony B-PER 1147 | Adams E-PER 1148 | , O 1149 | who O 1150 | led O 1151 | the O 1152 | side O 1153 | during O 1154 | the O 1155 | European S-MISC 1156 | championship O 1157 | in O 1158 | June O 1159 | , O 1160 | and O 1161 | former O 1162 | captain O 1163 | David B-PER 1164 | Platt E-PER 1165 | . O 1166 | 1167 | Adams S-PER 1168 | and O 1169 | Platt S-PER 1170 | are O 1171 | both O 1172 | injured O 1173 | and O 1174 | will O 1175 | miss O 1176 | England S-LOC 1177 | 's O 1178 | opening O 1179 | World B-MISC 1180 | Cup E-MISC 1181 | qualifier O 1182 | against O 1183 | Moldova S-LOC 1184 | on O 1185 | Sunday O 1186 | . O 1187 | 1188 | Shearer S-PER 1189 | takes O 1190 | the O 1191 | captaincy O 1192 | on O 1193 | a O 1194 | trial O 1195 | basis O 1196 | , O 1197 | but O 1198 | new O 1199 | coach O 1200 | Glenn B-PER 1201 | Hoddle E-PER 1202 | said O 1203 | he O 1204 | saw O 1205 | no O 1206 | reason O 1207 | why O 1208 | the O 1209 | former O 1210 | Blackburn S-ORG 1211 | and O 1212 | Southampton S-ORG 1213 | skipper O 1214 | should O 1215 | not O 1216 | make O 1217 | the O 1218 | post O 1219 | his O 1220 | own O 1221 | . O 1222 | 1223 | " O 1224 | I O 1225 | 'm O 1226 | sure O 1227 | there O 1228 | wo O 1229 | n't O 1230 | be O 1231 | a O 1232 | problem O 1233 | , O 1234 | I O 1235 | 'm O 1236 | sure O 1237 | Alan S-PER 1238 | is O 1239 | the O 1240 | man O 1241 | for O 1242 | the O 1243 | job O 1244 | , O 1245 | " O 1246 | Hoddle S-PER 1247 | said O 1248 | . O 1249 | 1250 | " O 1251 | There O 1252 | were O 1253 | three O 1254 | or O 1255 | four O 1256 | people O 1257 | who O 1258 | could O 1259 | have O 1260 | done O 1261 | it O 1262 | but O 1263 | when O 1264 | I O 1265 | spoke O 1266 | to O 1267 | Alan S-PER 1268 | he O 1269 | was O 1270 | up O 1271 | for O 1272 | it O 1273 | and O 1274 | really O 1275 | wanted O 1276 | it O 1277 | . O 1278 | 1279 | " O 1280 | In O 1281 | four O 1282 | days O 1283 | it O 1284 | 's O 1285 | very O 1286 | difficult O 1287 | to O 1288 | come O 1289 | to O 1290 | a O 1291 | 100 O 1292 | percent O 1293 | conclusion O 1294 | about O 1295 | something O 1296 | like O 1297 | this O 1298 | ... O 1299 | 1300 | but O 1301 | he O 1302 | knows O 1303 | how O 1304 | to O 1305 | conduct O 1306 | himself O 1307 | , O 1308 | his O 1309 | team O 1310 | mates O 1311 | respect O 1312 | him O 1313 | and O 1314 | he O 1315 | knows O 1316 | about O 1317 | the O 1318 | team O 1319 | situation O 1320 | even O 1321 | though O 1322 | he O 1323 | plays O 1324 | up O 1325 | front O 1326 | . 
O 1327 | " O 1328 | 1329 | Shearer S-PER 1330 | 's O 1331 | Euro B-MISC 1332 | 96 E-MISC 1333 | striking O 1334 | partner O 1335 | Teddy B-PER 1336 | Sheringham E-PER 1337 | withdrew O 1338 | from O 1339 | the O 1340 | squad O 1341 | with O 1342 | an O 1343 | injury O 1344 | on O 1345 | Friday O 1346 | . O 1347 | 1348 | He O 1349 | will O 1350 | probably O 1351 | be O 1352 | replaced O 1353 | by O 1354 | Shearer S-PER 1355 | 's O 1356 | Newcastle S-ORG 1357 | team O 1358 | mate O 1359 | Les B-PER 1360 | Ferdinand E-PER 1361 | . O 1362 | 1363 | -DOCSTART- O 1364 | 1365 | BASKETBALL O 1366 | - O 1367 | INTERNATIONAL O 1368 | TOURNAMENT O 1369 | RESULT O 1370 | . O 1371 | 1372 | BELGRADE S-LOC 1373 | 1996-08-30 O 1374 | 1375 | Result O 1376 | in O 1377 | an O 1378 | international O 1379 | 1380 | basketball O 1381 | tournament O 1382 | on O 1383 | Friday O 1384 | : O 1385 | 1386 | Red B-ORG 1387 | Star E-ORG 1388 | ( O 1389 | Yugoslavia S-LOC 1390 | ) O 1391 | beat O 1392 | Dinamo S-ORG 1393 | ( O 1394 | Russia S-LOC 1395 | ) O 1396 | 92-90 O 1397 | ( O 1398 | halftime O 1399 | 1400 | 47-47 O 1401 | ) O 1402 | 1403 | -DOCSTART- O 1404 | 1405 | SOCCER O 1406 | - O 1407 | ROMANIA S-LOC 1408 | BEAT O 1409 | LITHUANIA S-LOC 1410 | IN O 1411 | UNDER-21 O 1412 | MATCH O 1413 | . O 1414 | 1415 | BUCHAREST S-LOC 1416 | 1996-08-30 O 1417 | 1418 | Romania S-LOC 1419 | beat O 1420 | Lithuania S-LOC 1421 | 2-1 O 1422 | ( O 1423 | halftime O 1424 | 1-1 O 1425 | ) O 1426 | in O 1427 | their O 1428 | European S-MISC 1429 | under-21 O 1430 | soccer O 1431 | match O 1432 | on O 1433 | Friday O 1434 | . O 1435 | 1436 | Scorers O 1437 | : O 1438 | 1439 | Romania S-LOC 1440 | - O 1441 | Cosmin B-PER 1442 | Contra E-PER 1443 | ( O 1444 | 31st O 1445 | ) O 1446 | , O 1447 | Mihai B-PER 1448 | Tararache E-PER 1449 | ( O 1450 | 75th O 1451 | ) O 1452 | 1453 | Lithuania S-LOC 1454 | - O 1455 | Danius B-PER 1456 | Gleveckas E-PER 1457 | ( O 1458 | 13rd O 1459 | ) O 1460 | 1461 | Attendance O 1462 | : O 1463 | 200 O 1464 | 1465 | -DOCSTART- O 1466 | 1467 | SOCCER O 1468 | - O 1469 | ROTOR S-ORG 1470 | FANS O 1471 | LOCKED O 1472 | OUT O 1473 | AFTER O 1474 | VOLGOGRAD S-LOC 1475 | VIOLENCE O 1476 | . O 1477 | 1478 | MOSCOW S-LOC 1479 | 1996-08-30 O 1480 | 1481 | Rotor B-ORG 1482 | Volgograd E-ORG 1483 | must O 1484 | play O 1485 | their O 1486 | next O 1487 | home O 1488 | game O 1489 | behind O 1490 | closed O 1491 | doors O 1492 | after O 1493 | fans O 1494 | hurled O 1495 | bottles O 1496 | and O 1497 | stones O 1498 | at O 1499 | Dynamo B-ORG 1500 | Moscow E-ORG 1501 | players O 1502 | during O 1503 | a O 1504 | 1-0 O 1505 | home O 1506 | defeat O 1507 | on O 1508 | Saturday O 1509 | that O 1510 | ended O 1511 | Rotor S-ORG 1512 | 's O 1513 | brief O 1514 | spell O 1515 | as O 1516 | league O 1517 | leaders O 1518 | . O 1519 | 1520 | The O 1521 | head O 1522 | of O 1523 | the O 1524 | Russian S-MISC 1525 | league O 1526 | 's O 1527 | disciplinary O 1528 | committee O 1529 | , O 1530 | Anatoly B-PER 1531 | Gorokhovsky E-PER 1532 | , O 1533 | said O 1534 | on O 1535 | Friday O 1536 | that O 1537 | Rotor S-ORG 1538 | would O 1539 | play O 1540 | Lada B-ORG 1541 | Togliatti E-ORG 1542 | to O 1543 | empty O 1544 | stands O 1545 | on O 1546 | September O 1547 | 3 O 1548 | . 
O 1549 | 1550 | The O 1551 | club O 1552 | , O 1553 | who O 1554 | put O 1555 | Manchester B-ORG 1556 | United E-ORG 1557 | out O 1558 | of O 1559 | last O 1560 | year O 1561 | 's O 1562 | UEFA B-MISC 1563 | Cup E-MISC 1564 | , O 1565 | were O 1566 | fined O 1567 | $ O 1568 | 1,000 O 1569 | . O -------------------------------------------------------------------------------- /sample_data/raw.bmes: -------------------------------------------------------------------------------- 1 | CRICKET O 2 | - O 3 | LEICESTERSHIRE S-ORG 4 | TAKE O 5 | OVER O 6 | AT O 7 | TOP O 8 | AFTER O 9 | INNINGS O 10 | VICTORY O 11 | . O 12 | 13 | LONDON S-LOC 14 | 1996-08-30 O 15 | 16 | West B-MISC 17 | Indian E-MISC 18 | all-rounder O 19 | Phil B-PER 20 | Simmons E-PER 21 | took O 22 | four O 23 | for O 24 | 38 O 25 | on O 26 | Friday O 27 | as O 28 | Leicestershire S-ORG 29 | beat O 30 | Somerset S-ORG 31 | by O 32 | an O 33 | innings O 34 | and O 35 | 39 O 36 | runs O 37 | in O 38 | two O 39 | days O 40 | to O 41 | take O 42 | over O 43 | at O 44 | the O 45 | head O 46 | of O 47 | the O 48 | county O 49 | championship O 50 | . O 51 | 52 | Their O 53 | stay O 54 | on O 55 | top O 56 | , O 57 | though O 58 | , O 59 | may O 60 | be O 61 | short-lived O 62 | as O 63 | title O 64 | rivals O 65 | Essex S-ORG 66 | , O 67 | Derbyshire S-ORG 68 | and O 69 | Surrey S-ORG 70 | all O 71 | closed O 72 | in O 73 | on O 74 | victory O 75 | while O 76 | Kent S-ORG 77 | made O 78 | up O 79 | for O 80 | lost O 81 | time O 82 | in O 83 | their O 84 | rain-affected O 85 | match O 86 | against O 87 | Nottinghamshire S-ORG 88 | . O 89 | 90 | After O 91 | bowling O 92 | Somerset S-ORG 93 | out O 94 | for O 95 | 83 O 96 | on O 97 | the O 98 | opening O 99 | morning O 100 | at O 101 | Grace B-LOC 102 | Road E-LOC 103 | , O 104 | Leicestershire S-ORG 105 | extended O 106 | their O 107 | first O 108 | innings O 109 | by O 110 | 94 O 111 | runs O 112 | before O 113 | being O 114 | bowled O 115 | out O 116 | for O 117 | 296 O 118 | with O 119 | England S-LOC 120 | discard O 121 | Andy B-PER 122 | Caddick E-PER 123 | taking O 124 | three O 125 | for O 126 | 83 O 127 | . O 128 | 129 | Trailing O 130 | by O 131 | 213 O 132 | , O 133 | Somerset S-ORG 134 | got O 135 | a O 136 | solid O 137 | start O 138 | to O 139 | their O 140 | second O 141 | innings O 142 | before O 143 | Simmons S-PER 144 | stepped O 145 | in O 146 | to O 147 | bundle O 148 | them O 149 | out O 150 | for O 151 | 174 O 152 | . O 153 | 154 | Essex S-ORG 155 | , O 156 | however O 157 | , O 158 | look O 159 | certain O 160 | to O 161 | regain O 162 | their O 163 | top O 164 | spot O 165 | after O 166 | Nasser B-PER 167 | Hussain E-PER 168 | and O 169 | Peter B-PER 170 | Such E-PER 171 | gave O 172 | them O 173 | a O 174 | firm O 175 | grip O 176 | on O 177 | their O 178 | match O 179 | against O 180 | Yorkshire S-ORG 181 | at O 182 | Headingley S-LOC 183 | . O 184 | 185 | Hussain S-PER 186 | , O 187 | considered O 188 | surplus O 189 | to O 190 | England S-LOC 191 | 's O 192 | one-day O 193 | requirements O 194 | , O 195 | struck O 196 | 158 O 197 | , O 198 | his O 199 | first O 200 | championship O 201 | century O 202 | of O 203 | the O 204 | season O 205 | , O 206 | as O 207 | Essex S-ORG 208 | reached O 209 | 372 O 210 | and O 211 | took O 212 | a O 213 | first O 214 | innings O 215 | lead O 216 | of O 217 | 82 O 218 | . 
O 219 | 220 | By O 221 | the O 222 | close O 223 | Yorkshire S-ORG 224 | had O 225 | turned O 226 | that O 227 | into O 228 | a O 229 | 37-run O 230 | advantage O 231 | but O 232 | off-spinner O 233 | Such S-PER 234 | had O 235 | scuttled O 236 | their O 237 | hopes O 238 | , O 239 | taking O 240 | four O 241 | for O 242 | 24 O 243 | in O 244 | 48 O 245 | balls O 246 | and O 247 | leaving O 248 | them O 249 | hanging O 250 | on O 251 | 119 O 252 | for O 253 | five O 254 | and O 255 | praying O 256 | for O 257 | rain O 258 | . O 259 | 260 | At O 261 | the O 262 | Oval S-LOC 263 | , O 264 | Surrey S-ORG 265 | captain O 266 | Chris B-PER 267 | Lewis E-PER 268 | , O 269 | another O 270 | man O 271 | dumped O 272 | by O 273 | England S-LOC 274 | , O 275 | continued O 276 | to O 277 | silence O 278 | his O 279 | critics O 280 | as O 281 | he O 282 | followed O 283 | his O 284 | four O 285 | for O 286 | 45 O 287 | on O 288 | Thursday O 289 | with O 290 | 80 O 291 | not O 292 | out O 293 | on O 294 | Friday O 295 | in O 296 | the O 297 | match O 298 | against O 299 | Warwickshire S-ORG 300 | . O 301 | 302 | He O 303 | was O 304 | well O 305 | backed O 306 | by O 307 | England S-LOC 308 | hopeful O 309 | Mark B-PER 310 | Butcher E-PER 311 | who O 312 | made O 313 | 70 O 314 | as O 315 | Surrey S-ORG 316 | closed O 317 | on O 318 | 429 O 319 | for O 320 | seven O 321 | , O 322 | a O 323 | lead O 324 | of O 325 | 234 O 326 | . O 327 | 328 | Derbyshire S-ORG 329 | kept O 330 | up O 331 | the O 332 | hunt O 333 | for O 334 | their O 335 | first O 336 | championship O 337 | title O 338 | since O 339 | 1936 O 340 | by O 341 | reducing O 342 | Worcestershire S-ORG 343 | to O 344 | 133 O 345 | for O 346 | five O 347 | in O 348 | their O 349 | second O 350 | innings O 351 | , O 352 | still O 353 | 100 O 354 | runs O 355 | away O 356 | from O 357 | avoiding O 358 | an O 359 | innings O 360 | defeat O 361 | . O 362 | 363 | Australian S-MISC 364 | Tom B-PER 365 | Moody E-PER 366 | took O 367 | six O 368 | for O 369 | 82 O 370 | but O 371 | Chris B-PER 372 | Adams E-PER 373 | , O 374 | 123 O 375 | , O 376 | and O 377 | Tim B-PER 378 | O'Gorman E-PER 379 | , O 380 | 109 O 381 | , O 382 | took O 383 | Derbyshire S-ORG 384 | to O 385 | 471 O 386 | and O 387 | a O 388 | first O 389 | innings O 390 | lead O 391 | of O 392 | 233 O 393 | . O 394 | 395 | After O 396 | the O 397 | frustration O 398 | of O 399 | seeing O 400 | the O 401 | opening O 402 | day O 403 | of O 404 | their O 405 | match O 406 | badly O 407 | affected O 408 | by O 409 | the O 410 | weather O 411 | , O 412 | Kent S-ORG 413 | stepped O 414 | up O 415 | a O 416 | gear O 417 | to O 418 | dismiss O 419 | Nottinghamshire S-ORG 420 | for O 421 | 214 O 422 | . O 423 | 424 | They O 425 | were O 426 | held O 427 | up O 428 | by O 429 | a O 430 | gritty O 431 | 84 O 432 | from O 433 | Paul B-PER 434 | Johnson E-PER 435 | but O 436 | ex-England S-MISC 437 | fast O 438 | bowler O 439 | Martin B-PER 440 | McCague E-PER 441 | took O 442 | four O 443 | for O 444 | 55 O 445 | . O 446 | 447 | By O 448 | stumps O 449 | Kent S-ORG 450 | had O 451 | reached O 452 | 108 O 453 | for O 454 | three O 455 | . O 456 | 457 | -DOCSTART- O 458 | 459 | CRICKET O 460 | - O 461 | ENGLISH B-MISC 462 | COUNTY I-MISC 463 | CHAMPIONSHIP E-MISC 464 | SCORES O 465 | . 
O 466 | 467 | LONDON S-LOC 468 | 1996-08-30 O 469 | 470 | Result O 471 | and O 472 | close O 473 | of O 474 | play O 475 | scores O 476 | in O 477 | English S-MISC 478 | county O 479 | championship O 480 | matches O 481 | on O 482 | Friday O 483 | : O 484 | 485 | Leicester S-LOC 486 | : O 487 | Leicestershire S-ORG 488 | beat O 489 | Somerset S-ORG 490 | by O 491 | an O 492 | innings O 493 | and O 494 | 39 O 495 | runs O 496 | . O 497 | 498 | Somerset S-ORG 499 | 83 O 500 | and O 501 | 174 O 502 | ( O 503 | P. B-PER 504 | Simmons E-PER 505 | 4-38 O 506 | ) O 507 | , O 508 | Leicestershire S-ORG 509 | 296 O 510 | . O 511 | 512 | Leicestershire S-ORG 513 | 22 O 514 | points O 515 | , O 516 | Somerset S-ORG 517 | 4 O 518 | . O 519 | 520 | Chester-le-Street S-LOC 521 | : O 522 | Glamorgan S-ORG 523 | 259 O 524 | and O 525 | 207 O 526 | ( O 527 | A. B-PER 528 | Dale E-PER 529 | 69 O 530 | , O 531 | H. B-PER 532 | Morris E-PER 533 | 69 O 534 | ; O 535 | D. B-PER 536 | Blenkiron E-PER 537 | 4-43 O 538 | ) O 539 | , O 540 | Durham S-ORG 541 | 114 O 542 | ( O 543 | S. B-PER 544 | Watkin E-PER 545 | 4-28 O 546 | ) O 547 | and O 548 | 81-3 O 549 | . O 550 | 551 | Tunbridge B-LOC 552 | Wells E-LOC 553 | : O 554 | Nottinghamshire S-ORG 555 | 214 O 556 | ( O 557 | P. B-PER 558 | Johnson E-PER 559 | 84 O 560 | ; O 561 | M. B-PER 562 | McCague E-PER 563 | 4-55 O 564 | ) O 565 | , O 566 | Kent S-ORG 567 | 108-3 O 568 | . O 569 | 570 | London S-LOC 571 | ( O 572 | The B-LOC 573 | Oval E-LOC 574 | ) O 575 | : O 576 | Warwickshire S-ORG 577 | 195 O 578 | , O 579 | Surrey S-ORG 580 | 429-7 O 581 | ( O 582 | C. B-PER 583 | Lewis E-PER 584 | 80 O 585 | not O 586 | out O 587 | , O 588 | M. B-PER 589 | Butcher E-PER 590 | 70 O 591 | , O 592 | G. B-PER 593 | Kersey E-PER 594 | 63 O 595 | , O 596 | J. B-PER 597 | Ratcliffe E-PER 598 | 63 O 599 | , O 600 | D. B-PER 601 | Bicknell E-PER 602 | 55 O 603 | ) O 604 | . O 605 | 606 | Hove S-LOC 607 | : O 608 | Sussex S-ORG 609 | 363 O 610 | ( O 611 | W. B-PER 612 | Athey E-PER 613 | 111 O 614 | , O 615 | V. B-PER 616 | Drakes E-PER 617 | 52 O 618 | ; O 619 | I. B-PER 620 | Austin E-PER 621 | 4-37 O 622 | ) O 623 | , O 624 | Lancashire S-ORG 625 | 197-8 O 626 | ( O 627 | W. B-PER 628 | Hegg E-PER 629 | 54 O 630 | ) O 631 | 632 | Portsmouth S-LOC 633 | : O 634 | Middlesex S-ORG 635 | 199 O 636 | and O 637 | 426 O 638 | ( O 639 | J. B-PER 640 | Pooley E-PER 641 | 111 O 642 | , O 643 | M. B-PER 644 | Ramprakash E-PER 645 | 108 O 646 | , O 647 | M. B-PER 648 | Gatting E-PER 649 | 83 O 650 | ) O 651 | , O 652 | Hampshire S-ORG 653 | 232 O 654 | and O 655 | 109-5 O 656 | . O 657 | 658 | Chesterfield S-LOC 659 | : O 660 | Worcestershire S-ORG 661 | 238 O 662 | and O 663 | 133-5 O 664 | , O 665 | Derbyshire S-ORG 666 | 471 O 667 | ( O 668 | J. B-PER 669 | Adams E-PER 670 | 123 O 671 | , O 672 | T.O'Gorman S-PER 673 | 109 O 674 | not O 675 | out O 676 | , O 677 | K. B-PER 678 | Barnett E-PER 679 | 87 O 680 | ; O 681 | T. B-PER 682 | Moody E-PER 683 | 6-82 O 684 | ) O 685 | 686 | Bristol S-LOC 687 | : O 688 | Gloucestershire S-ORG 689 | 183 O 690 | and O 691 | 185-6 O 692 | ( O 693 | J. B-PER 694 | Russell E-PER 695 | 56 O 696 | not O 697 | out O 698 | ) O 699 | , O 700 | Northamptonshire S-ORG 701 | 190 O 702 | ( O 703 | K. B-PER 704 | Curran E-PER 705 | 52 O 706 | ; O 707 | A. B-PER 708 | Smith E-PER 709 | 5-68 O 710 | ) O 711 | . O 712 | 713 | -DOCSTART- O 714 | 715 | CRICKET O 716 | - O 717 | 1997 O 718 | ASHES S-MISC 719 | INTINERARY O 720 | . 
O 721 | 722 | LONDON S-LOC 723 | 1996-08-30 O 724 | 725 | Australia S-LOC 726 | will O 727 | defend O 728 | the O 729 | Ashes S-MISC 730 | in O 731 | 732 | a O 733 | six-test O 734 | series O 735 | against O 736 | England S-LOC 737 | during O 738 | a O 739 | four-month O 740 | tour O 741 | 742 | starting O 743 | on O 744 | May O 745 | 13 O 746 | next O 747 | year O 748 | , O 749 | the O 750 | Test B-ORG 751 | and I-ORG 752 | County I-ORG 753 | Cricket I-ORG 754 | Board E-ORG 755 | 756 | said O 757 | on O 758 | Friday O 759 | . O 760 | 761 | Australia S-LOC 762 | will O 763 | also O 764 | play O 765 | three O 766 | one-day O 767 | internationals O 768 | and O 769 | 770 | four O 771 | one-day O 772 | warm-up O 773 | matches O 774 | at O 775 | the O 776 | start O 777 | of O 778 | the O 779 | tour O 780 | . O 781 | 782 | The O 783 | tourists O 784 | will O 785 | play O 786 | nine O 787 | first-class O 788 | matches O 789 | against O 790 | 791 | English S-MISC 792 | county O 793 | sides O 794 | and O 795 | another O 796 | against O 797 | British B-ORG 798 | Universities E-ORG 799 | , O 800 | 801 | as O 802 | well O 803 | as O 804 | one-day O 805 | matches O 806 | against O 807 | the O 808 | Minor B-ORG 809 | Counties E-ORG 810 | and O 811 | 812 | Scotland S-LOC 813 | . O 814 | 815 | Tour O 816 | itinerary O 817 | : O 818 | 819 | May O 820 | 821 | May O 822 | 13 O 823 | Arrive O 824 | in O 825 | London S-LOC 826 | 827 | May O 828 | 14 O 829 | Practice O 830 | at O 831 | Lord B-LOC 832 | 's E-LOC 833 | 834 | May O 835 | 15 O 836 | v O 837 | Duke B-ORG 838 | of I-ORG 839 | Norfolk I-ORG 840 | 's I-ORG 841 | XI E-ORG 842 | ( O 843 | at O 844 | Arundel S-LOC 845 | ) O 846 | 847 | May O 848 | 17 O 849 | v O 850 | Northampton S-ORG 851 | 852 | May O 853 | 18 O 854 | v O 855 | Worcestershire S-ORG 856 | 857 | May O 858 | 20 O 859 | v O 860 | Durham S-ORG 861 | 862 | May O 863 | 22 O 864 | First O 865 | one-day O 866 | international O 867 | ( O 868 | at O 869 | Headingley S-LOC 870 | , O 871 | 872 | Leeds S-ORG 873 | ) O 874 | 875 | May O 876 | 24 O 877 | Second O 878 | one-day O 879 | international O 880 | ( O 881 | at O 882 | The B-LOC 883 | Oval E-LOC 884 | , O 885 | 886 | London S-LOC 887 | ) O 888 | 889 | May O 890 | 25 O 891 | Third O 892 | one-day O 893 | international O 894 | ( O 895 | at O 896 | Lord B-LOC 897 | 's E-LOC 898 | , O 899 | London S-LOC 900 | ) O 901 | 902 | May O 903 | 27-29 O 904 | v O 905 | Gloucestershire S-ORG 906 | or O 907 | Sussex S-ORG 908 | or O 909 | Surrey S-ORG 910 | ( O 911 | three O 912 | 913 | days O 914 | ) O 915 | 916 | May O 917 | 31 O 918 | - O 919 | June O 920 | 2 O 921 | v O 922 | Derbyshire S-ORG 923 | ( O 924 | three O 925 | days O 926 | ) O 927 | 928 | June O 929 | 930 | June O 931 | 5-9 O 932 | First O 933 | test O 934 | match O 935 | ( O 936 | at O 937 | Edgbaston S-LOC 938 | , O 939 | Birmingham S-LOC 940 | ) O 941 | 942 | June O 943 | 11-13 O 944 | v O 945 | a O 946 | first O 947 | class O 948 | county O 949 | ( O 950 | to O 951 | be O 952 | confirmed O 953 | ) O 954 | 955 | June O 956 | 14-16 O 957 | v O 958 | Leicestershire S-ORG 959 | ( O 960 | three O 961 | days O 962 | ) O 963 | 964 | June O 965 | 19-23 O 966 | Second O 967 | test O 968 | ( O 969 | at O 970 | Lord B-LOC 971 | 's E-LOC 972 | ) O 973 | 974 | June O 975 | 25-27 O 976 | v O 977 | British B-ORG 978 | Universities E-ORG 979 | ( O 980 | at O 981 | Oxford S-LOC 982 | , O 983 | three O 984 | days O 985 | ) O 986 | 987 | June O 988 | 28-30 O 989 | v O 990 | Hampshire S-ORG 991 | ( O 992 | 
three O 993 | days O 994 | ) O 995 | 996 | July O 997 | 998 | July O 999 | 3-7 O 1000 | Third O 1001 | test O 1002 | ( O 1003 | at O 1004 | Old B-LOC 1005 | Trafford E-LOC 1006 | , O 1007 | Manchester S-LOC 1008 | ) O 1009 | 1010 | July O 1011 | 9 O 1012 | v O 1013 | Minor B-ORG 1014 | Counties I-ORG 1015 | XI E-ORG 1016 | 1017 | July O 1018 | 12 O 1019 | v O 1020 | Scotland S-LOC 1021 | 1022 | July O 1023 | 16-18 O 1024 | v O 1025 | Glamorgan S-ORG 1026 | ( O 1027 | three O 1028 | days O 1029 | ) O 1030 | 1031 | July O 1032 | 19-21 O 1033 | v O 1034 | Middlesex S-ORG 1035 | ( O 1036 | three O 1037 | days O 1038 | ) O 1039 | 1040 | July O 1041 | 24-28 O 1042 | Fourth O 1043 | test O 1044 | ( O 1045 | at O 1046 | Headingley S-LOC 1047 | ) O 1048 | 1049 | August O 1050 | 1051 | August O 1052 | 1-4 O 1053 | v O 1054 | Somerset S-ORG 1055 | ( O 1056 | four O 1057 | days O 1058 | ) O 1059 | 1060 | August O 1061 | 7-11 O 1062 | Fifth O 1063 | test O 1064 | ( O 1065 | at O 1066 | Trent B-LOC 1067 | Bridge E-LOC 1068 | , O 1069 | Nottingham S-LOC 1070 | ) O 1071 | 1072 | August O 1073 | 16-18 O 1074 | v O 1075 | Kent S-ORG 1076 | ( O 1077 | three O 1078 | days O 1079 | ) O 1080 | 1081 | August O 1082 | 21-25 O 1083 | Sixth O 1084 | test O 1085 | ( O 1086 | at O 1087 | The B-LOC 1088 | Oval E-LOC 1089 | , O 1090 | London S-LOC 1091 | ) O 1092 | . O 1093 | 1094 | -DOCSTART- O 1095 | 1096 | SOCCER O 1097 | - O 1098 | SHEARER S-PER 1099 | NAMED O 1100 | AS O 1101 | ENGLAND S-LOC 1102 | CAPTAIN O 1103 | . O 1104 | 1105 | LONDON S-LOC 1106 | 1996-08-30 O 1107 | 1108 | The O 1109 | world O 1110 | 's O 1111 | costliest O 1112 | footballer O 1113 | Alan B-PER 1114 | Shearer E-PER 1115 | was O 1116 | named O 1117 | as O 1118 | the O 1119 | new O 1120 | England S-LOC 1121 | captain O 1122 | on O 1123 | Friday O 1124 | . O 1125 | 1126 | The O 1127 | 26-year-old O 1128 | , O 1129 | who O 1130 | joined O 1131 | Newcastle S-ORG 1132 | for O 1133 | 15 O 1134 | million O 1135 | pounds O 1136 | sterling O 1137 | ( O 1138 | $ O 1139 | 23.4 O 1140 | million O 1141 | ) O 1142 | , O 1143 | takes O 1144 | over O 1145 | from O 1146 | Tony B-PER 1147 | Adams E-PER 1148 | , O 1149 | who O 1150 | led O 1151 | the O 1152 | side O 1153 | during O 1154 | the O 1155 | European S-MISC 1156 | championship O 1157 | in O 1158 | June O 1159 | , O 1160 | and O 1161 | former O 1162 | captain O 1163 | David B-PER 1164 | Platt E-PER 1165 | . O 1166 | 1167 | Adams S-PER 1168 | and O 1169 | Platt S-PER 1170 | are O 1171 | both O 1172 | injured O 1173 | and O 1174 | will O 1175 | miss O 1176 | England S-LOC 1177 | 's O 1178 | opening O 1179 | World B-MISC 1180 | Cup E-MISC 1181 | qualifier O 1182 | against O 1183 | Moldova S-LOC 1184 | on O 1185 | Sunday O 1186 | . O 1187 | 1188 | Shearer S-PER 1189 | takes O 1190 | the O 1191 | captaincy O 1192 | on O 1193 | a O 1194 | trial O 1195 | basis O 1196 | , O 1197 | but O 1198 | new O 1199 | coach O 1200 | Glenn B-PER 1201 | Hoddle E-PER 1202 | said O 1203 | he O 1204 | saw O 1205 | no O 1206 | reason O 1207 | why O 1208 | the O 1209 | former O 1210 | Blackburn S-ORG 1211 | and O 1212 | Southampton S-ORG 1213 | skipper O 1214 | should O 1215 | not O 1216 | make O 1217 | the O 1218 | post O 1219 | his O 1220 | own O 1221 | . 
O 1222 | 1223 | " O 1224 | I O 1225 | 'm O 1226 | sure O 1227 | there O 1228 | wo O 1229 | n't O 1230 | be O 1231 | a O 1232 | problem O 1233 | , O 1234 | I O 1235 | 'm O 1236 | sure O 1237 | Alan S-PER 1238 | is O 1239 | the O 1240 | man O 1241 | for O 1242 | the O 1243 | job O 1244 | , O 1245 | " O 1246 | Hoddle S-PER 1247 | said O 1248 | . O 1249 | 1250 | " O 1251 | There O 1252 | were O 1253 | three O 1254 | or O 1255 | four O 1256 | people O 1257 | who O 1258 | could O 1259 | have O 1260 | done O 1261 | it O 1262 | but O 1263 | when O 1264 | I O 1265 | spoke O 1266 | to O 1267 | Alan S-PER 1268 | he O 1269 | was O 1270 | up O 1271 | for O 1272 | it O 1273 | and O 1274 | really O 1275 | wanted O 1276 | it O 1277 | . O 1278 | 1279 | " O 1280 | In O 1281 | four O 1282 | days O 1283 | it O 1284 | 's O 1285 | very O 1286 | difficult O 1287 | to O 1288 | come O 1289 | to O 1290 | a O 1291 | 100 O 1292 | percent O 1293 | conclusion O 1294 | about O 1295 | something O 1296 | like O 1297 | this O 1298 | ... O 1299 | 1300 | but O 1301 | he O 1302 | knows O 1303 | how O 1304 | to O 1305 | conduct O 1306 | himself O 1307 | , O 1308 | his O 1309 | team O 1310 | mates O 1311 | respect O 1312 | him O 1313 | and O 1314 | he O 1315 | knows O 1316 | about O 1317 | the O 1318 | team O 1319 | situation O 1320 | even O 1321 | though O 1322 | he O 1323 | plays O 1324 | up O 1325 | front O 1326 | . O 1327 | " O 1328 | 1329 | Shearer S-PER 1330 | 's O 1331 | Euro B-MISC 1332 | 96 E-MISC 1333 | striking O 1334 | partner O 1335 | Teddy B-PER 1336 | Sheringham E-PER 1337 | withdrew O 1338 | from O 1339 | the O 1340 | squad O 1341 | with O 1342 | an O 1343 | injury O 1344 | on O 1345 | Friday O 1346 | . O 1347 | 1348 | He O 1349 | will O 1350 | probably O 1351 | be O 1352 | replaced O 1353 | by O 1354 | Shearer S-PER 1355 | 's O 1356 | Newcastle S-ORG 1357 | team O 1358 | mate O 1359 | Les B-PER 1360 | Ferdinand E-PER 1361 | . O 1362 | 1363 | -DOCSTART- O 1364 | 1365 | BASKETBALL O 1366 | - O 1367 | INTERNATIONAL O 1368 | TOURNAMENT O 1369 | RESULT O 1370 | . O 1371 | 1372 | BELGRADE S-LOC 1373 | 1996-08-30 O 1374 | 1375 | Result O 1376 | in O 1377 | an O 1378 | international O 1379 | 1380 | basketball O 1381 | tournament O 1382 | on O 1383 | Friday O 1384 | : O 1385 | 1386 | Red B-ORG 1387 | Star E-ORG 1388 | ( O 1389 | Yugoslavia S-LOC 1390 | ) O 1391 | beat O 1392 | Dinamo S-ORG 1393 | ( O 1394 | Russia S-LOC 1395 | ) O 1396 | 92-90 O 1397 | ( O 1398 | halftime O 1399 | 1400 | 47-47 O 1401 | ) O 1402 | 1403 | -DOCSTART- O 1404 | 1405 | SOCCER O 1406 | - O 1407 | ROMANIA S-LOC 1408 | BEAT O 1409 | LITHUANIA S-LOC 1410 | IN O 1411 | UNDER-21 O 1412 | MATCH O 1413 | . O 1414 | 1415 | BUCHAREST S-LOC 1416 | 1996-08-30 O 1417 | 1418 | Romania S-LOC 1419 | beat O 1420 | Lithuania S-LOC 1421 | 2-1 O 1422 | ( O 1423 | halftime O 1424 | 1-1 O 1425 | ) O 1426 | in O 1427 | their O 1428 | European S-MISC 1429 | under-21 O 1430 | soccer O 1431 | match O 1432 | on O 1433 | Friday O 1434 | . 
O 1435 | 1436 | Scorers O 1437 | : O 1438 | 1439 | Romania S-LOC 1440 | - O 1441 | Cosmin B-PER 1442 | Contra E-PER 1443 | ( O 1444 | 31st O 1445 | ) O 1446 | , O 1447 | Mihai B-PER 1448 | Tararache E-PER 1449 | ( O 1450 | 75th O 1451 | ) O 1452 | 1453 | Lithuania S-LOC 1454 | - O 1455 | Danius B-PER 1456 | Gleveckas E-PER 1457 | ( O 1458 | 13rd O 1459 | ) O 1460 | 1461 | Attendance O 1462 | : O 1463 | 200 O 1464 | 1465 | -DOCSTART- O 1466 | 1467 | SOCCER O 1468 | - O 1469 | ROTOR S-ORG 1470 | FANS O 1471 | LOCKED O 1472 | OUT O 1473 | AFTER O 1474 | VOLGOGRAD S-LOC 1475 | VIOLENCE O 1476 | . O 1477 | 1478 | MOSCOW S-LOC 1479 | 1996-08-30 O 1480 | 1481 | Rotor B-ORG 1482 | Volgograd E-ORG 1483 | must O 1484 | play O 1485 | their O 1486 | next O 1487 | home O 1488 | game O 1489 | behind O 1490 | closed O 1491 | doors O 1492 | after O 1493 | fans O 1494 | hurled O 1495 | bottles O 1496 | and O 1497 | stones O 1498 | at O 1499 | Dynamo B-ORG 1500 | Moscow E-ORG 1501 | players O 1502 | during O 1503 | a O 1504 | 1-0 O 1505 | home O 1506 | defeat O 1507 | on O 1508 | Saturday O 1509 | that O 1510 | ended O 1511 | Rotor S-ORG 1512 | 's O 1513 | brief O 1514 | spell O 1515 | as O 1516 | league O 1517 | leaders O 1518 | . O 1519 | 1520 | The O 1521 | head O 1522 | of O 1523 | the O 1524 | Russian S-MISC 1525 | league O 1526 | 's O 1527 | disciplinary O 1528 | committee O 1529 | , O 1530 | Anatoly B-PER 1531 | Gorokhovsky E-PER 1532 | , O 1533 | said O 1534 | on O 1535 | Friday O 1536 | that O 1537 | Rotor S-ORG 1538 | would O 1539 | play O 1540 | Lada B-ORG 1541 | Togliatti E-ORG 1542 | to O 1543 | empty O 1544 | stands O 1545 | on O 1546 | September O 1547 | 3 O 1548 | . O 1549 | 1550 | The O 1551 | club O 1552 | , O 1553 | who O 1554 | put O 1555 | Manchester B-ORG 1556 | United E-ORG 1557 | out O 1558 | of O 1559 | last O 1560 | year O 1561 | 's O 1562 | UEFA B-MISC 1563 | Cup E-MISC 1564 | , O 1565 | were O 1566 | fined O 1567 | $ O 1568 | 1,000 O 1569 | . O -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'max' 2 | -------------------------------------------------------------------------------- /utils/alphabet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Max 3 | # @Date: 2018-01-19 11:33:37 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2018-04-26 13:56:03 6 | 7 | 8 | """ 9 | Alphabet maps objects to integer ids. It provides two way mapping from the index to the objects. 10 | """ 11 | from __future__ import print_function 12 | import json 13 | import os 14 | import sys 15 | 16 | 17 | class Alphabet: 18 | def __init__(self, name, label=False, keep_growing=True): 19 | self.name = name 20 | self.UNKNOWN = "" 21 | self.label = label 22 | self.instance2index = {} 23 | self.instances = [] 24 | self.keep_growing = keep_growing 25 | 26 | # Index 0 is occupied by default, all else following. 27 | self.default_index = 0 28 | self.next_index = 1 29 | if not self.label: 30 | self.add(self.UNKNOWN) 31 | 32 | def clear(self, keep_growing=True): 33 | self.instance2index = {} 34 | self.instances = [] 35 | self.keep_growing = keep_growing 36 | 37 | # Index 0 is occupied by default, all else following. 
38 | self.default_index = 0 39 | self.next_index = 1 40 | 41 | def add(self, instance): 42 | if instance not in self.instance2index: 43 | self.instances.append(instance) 44 | self.instance2index[instance] = self.next_index 45 | self.next_index += 1 46 | 47 | def get_index(self, instance): 48 | try: 49 | return self.instance2index[instance] 50 | except KeyError: 51 | if self.keep_growing: 52 | index = self.next_index 53 | self.add(instance) 54 | return index 55 | else: 56 | return self.instance2index[self.UNKNOWN] 57 | 58 | def get_instance(self, index): 59 | if index == 0: 60 | if self.label: 61 | return self.instances[0] 62 | # First index is occupied by the wildcard element. 63 | return None 64 | try: 65 | return self.instances[index - 1] 66 | except IndexError: 67 | print('WARNING:Alphabet get_instance ,unknown instance, return the first label.') 68 | return self.instances[0] 69 | 70 | def size(self): 71 | # if self.label: 72 | # return len(self.instances) 73 | # else: 74 | return len(self.instances) + 1 75 | 76 | def iteritems(self): 77 | if sys.version_info[0] < 3: # If using python3, dict item access uses different syntax 78 | return self.instance2index.iteritems() 79 | else: 80 | return self.instance2index.items() 81 | 82 | def enumerate_items(self, start=1): 83 | if start < 1 or start >= self.size(): 84 | raise IndexError("Enumerate is allowed between [1 : size of the alphabet)") 85 | return zip(range(start, len(self.instances) + 1), self.instances[start - 1:]) 86 | 87 | def close(self): 88 | self.keep_growing = False 89 | 90 | def open(self): 91 | self.keep_growing = True 92 | 93 | def get_content(self): 94 | return {'instance2index': self.instance2index, 'instances': self.instances} 95 | 96 | def from_json(self, data): 97 | self.instances = data["instances"] 98 | self.instance2index = data["instance2index"] 99 | 100 | def save(self, output_directory, name=None): 101 | """ 102 | Save both alhpabet records to the given directory. 103 | :param output_directory: Directory to save model and weights. 104 | :param name: The alphabet saving name, optional. 105 | :return: 106 | """ 107 | saving_name = name if name else self.__name 108 | try: 109 | json.dump(self.get_content(), open(os.path.join(output_directory, saving_name + ".json"), 'w')) 110 | except Exception as e: 111 | print("Exception: Alphabet is not saved: " % repr(e)) 112 | 113 | def load(self, input_directory, name=None): 114 | """ 115 | Load model architecture and weights from the give directory. This allow we use old models even the structure 116 | changes. 
117 | :param input_directory: Directory to save model and weights 118 | :return: 119 | """ 120 | loading_name = name if name else self.__name 121 | self.from_json(json.load(open(os.path.join(input_directory, loading_name + ".json")))) 122 | -------------------------------------------------------------------------------- /utils/data.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-06-14 17:34:32 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-01-25 20:25:59 6 | from __future__ import print_function 7 | from __future__ import absolute_import 8 | import sys 9 | from .alphabet import Alphabet 10 | from .functions import * 11 | 12 | try: 13 | import cPickle as pickle 14 | except ImportError: 15 | import pickle as pickle 16 | 17 | 18 | START = "" 19 | UNKNOWN = "" 20 | PADDING = "" 21 | 22 | class Data: 23 | def __init__(self): 24 | self.sentence_classification = False 25 | self.MAX_SENTENCE_LENGTH = 250 26 | self.MAX_WORD_LENGTH = -1 27 | self.number_normalized = True 28 | self.norm_word_emb = False 29 | self.norm_char_emb = False 30 | self.word_alphabet = Alphabet('word') 31 | self.char_alphabet = Alphabet('character') 32 | 33 | self.feature_name = [] 34 | self.feature_alphabets = [] 35 | self.feature_num = len(self.feature_alphabets) 36 | self.feat_config = None 37 | 38 | 39 | self.label_alphabet = Alphabet('label',True) 40 | self.tagScheme = "NoSeg" ## BMES/BIO 41 | self.split_token = ' ||| ' 42 | self.seg = True 43 | 44 | ### I/O 45 | self.train_dir = None 46 | self.dev_dir = None 47 | self.test_dir = None 48 | self.raw_dir = None 49 | 50 | self.decode_dir = None 51 | self.dset_dir = None ## data vocabulary related file 52 | self.model_dir = None ## model save file 53 | self.load_model_dir = None ## model load file 54 | 55 | self.word_emb_dir = None 56 | self.char_emb_dir = None 57 | self.feature_emb_dirs = [] 58 | 59 | self.train_texts = [] 60 | self.dev_texts = [] 61 | self.test_texts = [] 62 | self.raw_texts = [] 63 | 64 | self.train_Ids = [] 65 | self.dev_Ids = [] 66 | self.test_Ids = [] 67 | self.raw_Ids = [] 68 | 69 | self.pretrain_word_embedding = None 70 | self.pretrain_char_embedding = None 71 | self.pretrain_feature_embeddings = [] 72 | 73 | self.label_size = 0 74 | self.word_alphabet_size = 0 75 | self.char_alphabet_size = 0 76 | self.label_alphabet_size = 0 77 | self.feature_alphabet_sizes = [] 78 | self.feature_emb_dims = [] 79 | self.norm_feature_embs = [] 80 | self.word_emb_dim = 50 81 | self.char_emb_dim = 30 82 | 83 | ###Networks 84 | self.word_feature_extractor = "LSTM" ## "LSTM"/"CNN"/"GRU"/ 85 | self.use_char = True 86 | self.char_feature_extractor = "CNN" ## "LSTM"/"CNN"/"GRU"/None 87 | self.use_crf = True 88 | self.nbest = None 89 | 90 | ## Training 91 | self.average_batch_loss = False 92 | self.optimizer = "SGD" ## "SGD"/"AdaGrad"/"AdaDelta"/"RMSProp"/"Adam" 93 | self.status = "train" 94 | ### Hyperparameters 95 | self.HP_cnn_layer = 4 96 | self.HP_iteration = 100 97 | self.HP_batch_size = 10 98 | self.HP_char_hidden_dim = 50 99 | self.HP_hidden_dim = 200 100 | self.HP_dropout = 0.5 101 | self.HP_lstm_layer = 1 102 | self.HP_bilstm = True 103 | 104 | self.HP_gpu = False 105 | self.HP_lr = 0.015 106 | self.HP_lr_decay = 0.05 107 | self.HP_clip = None 108 | self.HP_momentum = 0 109 | self.HP_l2 = 1e-8 110 | 111 | def show_data_summary(self): 112 | 113 | print("++"*50) 114 | print("DATA SUMMARY START:") 115 | print(" I/O:") 
116 | if self.sentence_classification: 117 | print(" Start Sentence Classification task...") 118 | else: 119 | print(" Start Sequence Laebling task...") 120 | print(" Tag scheme: %s"%(self.tagScheme)) 121 | print(" Split token: %s"%(self.split_token)) 122 | print(" MAX SENTENCE LENGTH: %s"%(self.MAX_SENTENCE_LENGTH)) 123 | print(" MAX WORD LENGTH: %s"%(self.MAX_WORD_LENGTH)) 124 | print(" Number normalized: %s"%(self.number_normalized)) 125 | print(" Word alphabet size: %s"%(self.word_alphabet_size)) 126 | print(" Char alphabet size: %s"%(self.char_alphabet_size)) 127 | print(" Label alphabet size: %s"%(self.label_alphabet_size)) 128 | print(" Word embedding dir: %s"%(self.word_emb_dir)) 129 | print(" Char embedding dir: %s"%(self.char_emb_dir)) 130 | print(" Word embedding size: %s"%(self.word_emb_dim)) 131 | print(" Char embedding size: %s"%(self.char_emb_dim)) 132 | print(" Norm word emb: %s"%(self.norm_word_emb)) 133 | print(" Norm char emb: %s"%(self.norm_char_emb)) 134 | print(" Train file directory: %s"%(self.train_dir)) 135 | print(" Dev file directory: %s"%(self.dev_dir)) 136 | print(" Test file directory: %s"%(self.test_dir)) 137 | print(" Raw file directory: %s"%(self.raw_dir)) 138 | print(" Dset file directory: %s"%(self.dset_dir)) 139 | print(" Model file directory: %s"%(self.model_dir)) 140 | print(" Loadmodel directory: %s"%(self.load_model_dir)) 141 | print(" Decode file directory: %s"%(self.decode_dir)) 142 | print(" Train instance number: %s"%(len(self.train_texts))) 143 | print(" Dev instance number: %s"%(len(self.dev_texts))) 144 | print(" Test instance number: %s"%(len(self.test_texts))) 145 | print(" Raw instance number: %s"%(len(self.raw_texts))) 146 | print(" FEATURE num: %s"%(self.feature_num)) 147 | for idx in range(self.feature_num): 148 | print(" Fe: %s alphabet size: %s"%(self.feature_alphabets[idx].name, self.feature_alphabet_sizes[idx])) 149 | print(" Fe: %s embedding dir: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dirs[idx])) 150 | print(" Fe: %s embedding size: %s"%(self.feature_alphabets[idx].name, self.feature_emb_dims[idx])) 151 | print(" Fe: %s norm emb: %s"%(self.feature_alphabets[idx].name, self.norm_feature_embs[idx])) 152 | print(" "+"++"*20) 153 | print(" Model Network:") 154 | print(" Model use_crf: %s"%(self.use_crf)) 155 | print(" Model word extractor: %s"%(self.word_feature_extractor)) 156 | print(" Model use_char: %s"%(self.use_char)) 157 | if self.use_char: 158 | print(" Model char extractor: %s"%(self.char_feature_extractor)) 159 | print(" Model char_hidden_dim: %s"%(self.HP_char_hidden_dim)) 160 | print(" "+"++"*20) 161 | print(" Training:") 162 | print(" Optimizer: %s"%(self.optimizer)) 163 | print(" Iteration: %s"%(self.HP_iteration)) 164 | print(" BatchSize: %s"%(self.HP_batch_size)) 165 | print(" Average batch loss: %s"%(self.average_batch_loss)) 166 | 167 | print(" "+"++"*20) 168 | print(" Hyperparameters:") 169 | 170 | print(" Hyper lr: %s"%(self.HP_lr)) 171 | print(" Hyper lr_decay: %s"%(self.HP_lr_decay)) 172 | print(" Hyper HP_clip: %s"%(self.HP_clip)) 173 | print(" Hyper momentum: %s"%(self.HP_momentum)) 174 | print(" Hyper l2: %s"%(self.HP_l2)) 175 | print(" Hyper hidden_dim: %s"%(self.HP_hidden_dim)) 176 | print(" Hyper dropout: %s"%(self.HP_dropout)) 177 | print(" Hyper lstm_layer: %s"%(self.HP_lstm_layer)) 178 | print(" Hyper bilstm: %s"%(self.HP_bilstm)) 179 | print(" Hyper GPU: %s"%(self.HP_gpu)) 180 | print("DATA SUMMARY END.") 181 | print("++"*50) 182 | sys.stdout.flush() 183 | 184 | 185 | def 
initial_feature_alphabets(self): 186 | if self.sentence_classification: 187 | ## if sentence classification data format, splited by '\t' 188 | items = open(self.train_dir,'r').readline().strip('\n').split('\t') 189 | else: 190 | ## if sequence labeling data format i.e. CoNLL 2003, split by ' ' 191 | items = open(self.train_dir,'r').readline().strip('\n').split() 192 | total_column = len(items) 193 | if total_column > 2: 194 | for idx in range(1, total_column-1): 195 | feature_prefix = items[idx].split(']',1)[0]+"]" 196 | self.feature_alphabets.append(Alphabet(feature_prefix)) 197 | self.feature_name.append(feature_prefix) 198 | print("Find feature: ", feature_prefix) 199 | self.feature_num = len(self.feature_alphabets) 200 | self.pretrain_feature_embeddings = [None]*self.feature_num 201 | self.feature_emb_dims = [20]*self.feature_num 202 | self.feature_emb_dirs = [None]*self.feature_num 203 | self.norm_feature_embs = [False]*self.feature_num 204 | self.feature_alphabet_sizes = [0]*self.feature_num 205 | if self.feat_config: 206 | for idx in range(self.feature_num): 207 | if self.feature_name[idx] in self.feat_config: 208 | self.feature_emb_dims[idx] = self.feat_config[self.feature_name[idx]]['emb_size'] 209 | self.feature_emb_dirs[idx] = self.feat_config[self.feature_name[idx]]['emb_dir'] 210 | self.norm_feature_embs[idx] = self.feat_config[self.feature_name[idx]]['emb_norm'] 211 | # exit(0) 212 | 213 | 214 | def build_alphabet(self, input_file): 215 | in_lines = open(input_file,'r').readlines() 216 | for line in in_lines: 217 | if len(line) > 2: 218 | ## if sentence classification data format, splited by \t 219 | if self.sentence_classification: 220 | pairs = line.strip().split(self.split_token) 221 | sent = pairs[0] 222 | if sys.version_info[0] < 3: 223 | sent = sent.decode('utf-8') 224 | words = sent.split() 225 | for word in words: 226 | if self.number_normalized: 227 | word = normalize_word(word) 228 | self.word_alphabet.add(word) 229 | for char in word: 230 | self.char_alphabet.add(char) 231 | label = pairs[-1] 232 | self.label_alphabet.add(label) 233 | ## build feature alphabet 234 | for idx in range(self.feature_num): 235 | feat_idx = pairs[idx+1].split(']',1)[-1] 236 | self.feature_alphabets[idx].add(feat_idx) 237 | 238 | ## if sequence labeling data format i.e. 
CoNLL 2003 239 | else: 240 | pairs = line.strip().split() 241 | word = pairs[0] 242 | if sys.version_info[0] < 3: 243 | word = word.decode('utf-8') 244 | if self.number_normalized: 245 | word = normalize_word(word) 246 | label = pairs[-1] 247 | self.label_alphabet.add(label) 248 | self.word_alphabet.add(word) 249 | ## build feature alphabet 250 | for idx in range(self.feature_num): 251 | feat_idx = pairs[idx+1].split(']',1)[-1] 252 | self.feature_alphabets[idx].add(feat_idx) 253 | for char in word: 254 | self.char_alphabet.add(char) 255 | self.word_alphabet_size = self.word_alphabet.size() 256 | self.char_alphabet_size = self.char_alphabet.size() 257 | self.label_alphabet_size = self.label_alphabet.size() 258 | for idx in range(self.feature_num): 259 | self.feature_alphabet_sizes[idx] = self.feature_alphabets[idx].size() 260 | startS = False 261 | startB = False 262 | for label,_ in self.label_alphabet.iteritems(): 263 | if "S-" in label.upper(): 264 | startS = True 265 | elif "B-" in label.upper(): 266 | startB = True 267 | if startB: 268 | if startS: 269 | self.tagScheme = "BMES" 270 | else: 271 | self.tagScheme = "BIO" 272 | if self.sentence_classification: 273 | self.tagScheme = "Not sequence labeling task" 274 | 275 | 276 | def fix_alphabet(self): 277 | self.word_alphabet.close() 278 | self.char_alphabet.close() 279 | self.label_alphabet.close() 280 | for idx in range(self.feature_num): 281 | self.feature_alphabets[idx].close() 282 | 283 | 284 | def build_pretrain_emb(self): 285 | if self.word_emb_dir: 286 | print("Load pretrained word embedding, norm: %s, dir: %s"%(self.norm_word_emb, self.word_emb_dir)) 287 | self.pretrain_word_embedding, self.word_emb_dim = build_pretrain_embedding(self.word_emb_dir, self.word_alphabet, self.word_emb_dim, self.norm_word_emb) 288 | if self.char_emb_dir: 289 | print("Load pretrained char embedding, norm: %s, dir: %s"%(self.norm_char_emb, self.char_emb_dir)) 290 | self.pretrain_char_embedding, self.char_emb_dim = build_pretrain_embedding(self.char_emb_dir, self.char_alphabet, self.char_emb_dim, self.norm_char_emb) 291 | for idx in range(self.feature_num): 292 | if self.feature_emb_dirs[idx]: 293 | print("Load pretrained feature %s embedding:, norm: %s, dir: %s"%(self.feature_name[idx], self.norm_feature_embs[idx], self.feature_emb_dirs[idx])) 294 | self.pretrain_feature_embeddings[idx], self.feature_emb_dims[idx] = build_pretrain_embedding(self.feature_emb_dirs[idx], self.feature_alphabets[idx], self.feature_emb_dims[idx], self.norm_feature_embs[idx]) 295 | 296 | 297 | def generate_instance(self, name): 298 | self.fix_alphabet() 299 | if name == "train": 300 | self.train_texts, self.train_Ids = read_instance(self.train_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.sentence_classification, self.split_token) 301 | elif name == "dev": 302 | self.dev_texts, self.dev_Ids = read_instance(self.dev_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.sentence_classification, self.split_token) 303 | elif name == "test": 304 | self.test_texts, self.test_Ids = read_instance(self.test_dir, self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.sentence_classification, self.split_token) 305 | elif name == "raw": 306 | self.raw_texts, self.raw_Ids = read_instance(self.raw_dir, 
self.word_alphabet, self.char_alphabet, self.feature_alphabets, self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH, self.sentence_classification, self.split_token) 307 | else: 308 | print("Error: you can only generate train/dev/test instance! Illegal input:%s"%(name)) 309 | 310 | 311 | def write_decoded_results(self, predict_results, name): 312 | 313 | sent_num = len(predict_results) 314 | content_list = [] 315 | if name == 'raw': 316 | content_list = self.raw_texts 317 | elif name == 'test': 318 | content_list = self.test_texts 319 | elif name == 'dev': 320 | content_list = self.dev_texts 321 | elif name == 'train': 322 | content_list = self.train_texts 323 | else: 324 | print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !") 325 | assert(sent_num == len(content_list)) 326 | fout = open(self.decode_dir,'w') 327 | for idx in range(sent_num): 328 | if self.sentence_classification: 329 | fout.write(" ".join(content_list[idx][0])+"\t"+predict_results[idx]+ '\n') 330 | else: 331 | sent_length = len(predict_results[idx]) 332 | for idy in range(sent_length): 333 | ## content_list[idx] is a list with [word, char, label] 334 | fout.write(content_list[idx][0][idy].encode('utf-8') + " " + predict_results[idx][idy] + '\n') 335 | fout.write('\n') 336 | fout.close() 337 | print("Predict %s result has been written into file. %s"%(name, self.decode_dir)) 338 | 339 | 340 | def load(self,data_file): 341 | f = open(data_file, 'rb') 342 | tmp_dict = pickle.load(f) 343 | f.close() 344 | self.__dict__.update(tmp_dict) 345 | 346 | def save(self,save_file): 347 | f = open(save_file, 'wb') 348 | pickle.dump(self.__dict__, f, 2) 349 | f.close() 350 | 351 | 352 | 353 | def write_nbest_decoded_results(self, predict_results, pred_scores, name): 354 | ## predict_results : [whole_sent_num, nbest, each_sent_length] 355 | ## pred_scores: [whole_sent_num, nbest] 356 | fout = open(self.decode_dir,'w') 357 | sent_num = len(predict_results) 358 | content_list = [] 359 | if name == 'raw': 360 | content_list = self.raw_texts 361 | elif name == 'test': 362 | content_list = self.test_texts 363 | elif name == 'dev': 364 | content_list = self.dev_texts 365 | elif name == 'train': 366 | content_list = self.train_texts 367 | else: 368 | print("Error: illegal name during writing predict result, name should be within train/dev/test/raw !") 369 | assert(sent_num == len(content_list)) 370 | assert(sent_num == len(pred_scores)) 371 | for idx in range(sent_num): 372 | sent_length = len(predict_results[idx][0]) 373 | nbest = len(predict_results[idx]) 374 | score_string = "# " 375 | for idz in range(nbest): 376 | score_string += format(pred_scores[idx][idz], '.4f')+" " 377 | fout.write(score_string.strip() + "\n") 378 | 379 | for idy in range(sent_length): 380 | try: # Will fail with python3 381 | label_string = content_list[idx][0][idy].encode('utf-8') + " " 382 | except: 383 | label_string = content_list[idx][0][idy] + " " 384 | for idz in range(nbest): 385 | label_string += predict_results[idx][idz][idy]+" " 386 | label_string = label_string.strip() + "\n" 387 | fout.write(label_string) 388 | fout.write('\n') 389 | fout.close() 390 | print("Predict %s %s-best result has been written into file. 
%s"%(name,nbest, self.decode_dir)) 391 | 392 | 393 | def read_config(self,config_file): 394 | config = config_file_to_dict(config_file) 395 | ## read data: 396 | the_item = 'train_dir' 397 | if the_item in config: 398 | self.train_dir = config[the_item] 399 | the_item = 'dev_dir' 400 | if the_item in config: 401 | self.dev_dir = config[the_item] 402 | the_item = 'test_dir' 403 | if the_item in config: 404 | self.test_dir = config[the_item] 405 | the_item = 'raw_dir' 406 | if the_item in config: 407 | self.raw_dir = config[the_item] 408 | the_item = 'decode_dir' 409 | if the_item in config: 410 | self.decode_dir = config[the_item] 411 | the_item = 'dset_dir' 412 | if the_item in config: 413 | self.dset_dir = config[the_item] 414 | the_item = 'model_dir' 415 | if the_item in config: 416 | self.model_dir = config[the_item] 417 | the_item = 'load_model_dir' 418 | if the_item in config: 419 | self.load_model_dir = config[the_item] 420 | 421 | the_item = 'word_emb_dir' 422 | if the_item in config: 423 | self.word_emb_dir = config[the_item] 424 | the_item = 'char_emb_dir' 425 | if the_item in config: 426 | self.char_emb_dir = config[the_item] 427 | 428 | 429 | the_item = 'MAX_SENTENCE_LENGTH' 430 | if the_item in config: 431 | self.MAX_SENTENCE_LENGTH = int(config[the_item]) 432 | the_item = 'MAX_WORD_LENGTH' 433 | if the_item in config: 434 | self.MAX_WORD_LENGTH = int(config[the_item]) 435 | 436 | the_item = 'norm_word_emb' 437 | if the_item in config: 438 | self.norm_word_emb = str2bool(config[the_item]) 439 | the_item = 'norm_char_emb' 440 | if the_item in config: 441 | self.norm_char_emb = str2bool(config[the_item]) 442 | the_item = 'number_normalized' 443 | if the_item in config: 444 | self.number_normalized = str2bool(config[the_item]) 445 | 446 | the_item = 'sentence_classification' 447 | if the_item in config: 448 | self.sentence_classification = str2bool(config[the_item]) 449 | the_item = 'seg' 450 | if the_item in config: 451 | self.seg = str2bool(config[the_item]) 452 | the_item = 'word_emb_dim' 453 | if the_item in config: 454 | self.word_emb_dim = int(config[the_item]) 455 | the_item = 'char_emb_dim' 456 | if the_item in config: 457 | self.char_emb_dim = int(config[the_item]) 458 | 459 | ## read network: 460 | the_item = 'use_crf' 461 | if the_item in config: 462 | self.use_crf = str2bool(config[the_item]) 463 | the_item = 'use_char' 464 | if the_item in config: 465 | self.use_char = str2bool(config[the_item]) 466 | the_item = 'word_seq_feature' 467 | if the_item in config: 468 | self.word_feature_extractor = config[the_item] 469 | the_item = 'char_seq_feature' 470 | if the_item in config: 471 | self.char_feature_extractor = config[the_item] 472 | the_item = 'nbest' 473 | if the_item in config: 474 | self.nbest = int(config[the_item]) 475 | 476 | the_item = 'feature' 477 | if the_item in config: 478 | self.feat_config = config[the_item] ## feat_config is a dict 479 | 480 | 481 | ## read training setting: 482 | the_item = 'optimizer' 483 | if the_item in config: 484 | self.optimizer = config[the_item] 485 | the_item = 'ave_batch_loss' 486 | if the_item in config: 487 | self.average_batch_loss = str2bool(config[the_item]) 488 | the_item = 'status' 489 | if the_item in config: 490 | self.status = config[the_item] 491 | 492 | ## read Hyperparameters: 493 | the_item = 'cnn_layer' 494 | if the_item in config: 495 | self.HP_cnn_layer = int(config[the_item]) 496 | the_item = 'iteration' 497 | if the_item in config: 498 | self.HP_iteration = int(config[the_item]) 499 | the_item = 
'batch_size' 500 | if the_item in config: 501 | self.HP_batch_size = int(config[the_item]) 502 | 503 | the_item = 'char_hidden_dim' 504 | if the_item in config: 505 | self.HP_char_hidden_dim = int(config[the_item]) 506 | the_item = 'hidden_dim' 507 | if the_item in config: 508 | self.HP_hidden_dim = int(config[the_item]) 509 | the_item = 'dropout' 510 | if the_item in config: 511 | self.HP_dropout = float(config[the_item]) 512 | the_item = 'lstm_layer' 513 | if the_item in config: 514 | self.HP_lstm_layer = int(config[the_item]) 515 | the_item = 'bilstm' 516 | if the_item in config: 517 | self.HP_bilstm = str2bool(config[the_item]) 518 | 519 | the_item = 'gpu' 520 | if the_item in config: 521 | self.HP_gpu = str2bool(config[the_item]) 522 | the_item = 'learning_rate' 523 | if the_item in config: 524 | self.HP_lr = float(config[the_item]) 525 | the_item = 'lr_decay' 526 | if the_item in config: 527 | self.HP_lr_decay = float(config[the_item]) 528 | the_item = 'clip' 529 | if the_item in config: 530 | self.HP_clip = float(config[the_item]) 531 | the_item = 'momentum' 532 | if the_item in config: 533 | self.HP_momentum = float(config[the_item]) 534 | the_item = 'l2' 535 | if the_item in config: 536 | self.HP_l2 = float(config[the_item]) 537 | ## no seg for sentence classification 538 | if self.sentence_classification: 539 | self.seg = False 540 | self.use_crf = False 541 | 542 | 543 | def config_file_to_dict(input_file): 544 | config = {} 545 | fins = open(input_file,'r').readlines() 546 | for line in fins: 547 | if len(line) > 0 and line[0] == "#": 548 | continue 549 | if "=" in line: 550 | pair = line.strip().split('#',1)[0].split('=',1) 551 | item = pair[0] 552 | if item=="feature": 553 | if item not in config: 554 | feat_dict = {} 555 | config[item]= feat_dict 556 | feat_dict = config[item] 557 | new_pair = pair[-1].split() 558 | feat_name = new_pair[0] 559 | one_dict = {} 560 | one_dict["emb_dir"] = None 561 | one_dict["emb_size"] = 10 562 | one_dict["emb_norm"] = False 563 | if len(new_pair) > 1: 564 | for idx in range(1,len(new_pair)): 565 | conf_pair = new_pair[idx].split('=') 566 | if conf_pair[0] == "emb_dir": 567 | one_dict["emb_dir"]=conf_pair[-1] 568 | elif conf_pair[0] == "emb_size": 569 | one_dict["emb_size"]=int(conf_pair[-1]) 570 | elif conf_pair[0] == "emb_norm": 571 | one_dict["emb_norm"]=str2bool(conf_pair[-1]) 572 | feat_dict[feat_name] = one_dict 573 | # print "feat",feat_dict 574 | else: 575 | if item in config: 576 | print("Warning: duplicated config item found: %s, updated."%(pair[0])) 577 | config[item] = pair[-1] 578 | 579 | 580 | return config 581 | 582 | 583 | def str2bool(string): 584 | if string == "True" or string == "true" or string == "TRUE": 585 | return True 586 | else: 587 | return False 588 | -------------------------------------------------------------------------------- /utils/functions.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-06-15 14:23:06 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-14 12:23:52 6 | from __future__ import print_function 7 | from __future__ import absolute_import 8 | import sys 9 | import numpy as np 10 | 11 | def normalize_word(word): 12 | new_word = "" 13 | for char in word: 14 | if char.isdigit(): 15 | new_word += '0' 16 | else: 17 | new_word += char 18 | return new_word 19 | 20 | 21 | def read_instance(input_file, word_alphabet, char_alphabet, feature_alphabets, 
label_alphabet, number_normalized, max_sent_length, sentence_classification=False, split_token='\t', char_padding_size=-1, char_padding_symbol = ''): 22 | feature_num = len(feature_alphabets) 23 | in_lines = open(input_file,'r', encoding="utf8").readlines() 24 | instence_texts = [] 25 | instence_Ids = [] 26 | words = [] 27 | features = [] 28 | chars = [] 29 | labels = [] 30 | word_Ids = [] 31 | feature_Ids = [] 32 | char_Ids = [] 33 | label_Ids = [] 34 | 35 | ## if sentence classification data format, splited by \t 36 | if sentence_classification: 37 | for line in in_lines: 38 | if len(line) > 2: 39 | pairs = line.strip().split(split_token) 40 | sent = pairs[0] 41 | if sys.version_info[0] < 3: 42 | sent = sent.decode('utf-8') 43 | original_words = sent.split() 44 | for word in original_words: 45 | words.append(word) 46 | if number_normalized: 47 | word = normalize_word(word) 48 | word_Ids.append(word_alphabet.get_index(word)) 49 | ## get char 50 | char_list = [] 51 | char_Id = [] 52 | for char in word: 53 | char_list.append(char) 54 | if char_padding_size > 0: 55 | char_number = len(char_list) 56 | if char_number < char_padding_size: 57 | char_list = char_list + [char_padding_symbol]*(char_padding_size-char_number) 58 | assert(len(char_list) == char_padding_size) 59 | for char in char_list: 60 | char_Id.append(char_alphabet.get_index(char)) 61 | chars.append(char_list) 62 | char_Ids.append(char_Id) 63 | 64 | label = pairs[-1] 65 | label_Id = label_alphabet.get_index(label) 66 | ## get features 67 | feat_list = [] 68 | feat_Id = [] 69 | for idx in range(feature_num): 70 | feat_idx = pairs[idx+1].split(']',1)[-1] 71 | feat_list.append(feat_idx) 72 | feat_Id.append(feature_alphabets[idx].get_index(feat_idx)) 73 | ## combine together and return, notice the feature/label as different format with sequence labeling task 74 | if (len(words) > 0) and ((max_sent_length < 0) or (len(words) < max_sent_length)): 75 | instence_texts.append([words, feat_list, chars, label]) 76 | instence_Ids.append([word_Ids, feat_Id, char_Ids,label_Id]) 77 | words = [] 78 | features = [] 79 | chars = [] 80 | char_Ids = [] 81 | word_Ids = [] 82 | feature_Ids = [] 83 | label_Ids = [] 84 | if (len(words) > 0) and ((max_sent_length < 0) or (len(words) < max_sent_length)) : 85 | instence_texts.append([words, feat_list, chars, label]) 86 | instence_Ids.append([word_Ids, feat_Id, char_Ids,label_Id]) 87 | words = [] 88 | features = [] 89 | chars = [] 90 | char_Ids = [] 91 | word_Ids = [] 92 | feature_Ids = [] 93 | label_Ids = [] 94 | 95 | else: 96 | ### for sequence labeling data format i.e. 
CoNLL 2003 97 | for line in in_lines: 98 | if len(line) > 2: 99 | pairs = line.strip().split() 100 | word = pairs[0] 101 | if sys.version_info[0] < 3: 102 | word = word.decode('utf-8') 103 | words.append(word) 104 | if number_normalized: 105 | word = normalize_word(word) 106 | label = pairs[-1] 107 | labels.append(label) 108 | word_Ids.append(word_alphabet.get_index(word)) 109 | label_Ids.append(label_alphabet.get_index(label)) 110 | ## get features 111 | feat_list = [] 112 | feat_Id = [] 113 | for idx in range(feature_num): 114 | feat_idx = pairs[idx+1].split(']',1)[-1] 115 | feat_list.append(feat_idx) 116 | feat_Id.append(feature_alphabets[idx].get_index(feat_idx)) 117 | features.append(feat_list) 118 | feature_Ids.append(feat_Id) 119 | ## get char 120 | char_list = [] 121 | char_Id = [] 122 | for char in word: 123 | char_list.append(char) 124 | if char_padding_size > 0: 125 | char_number = len(char_list) 126 | if char_number < char_padding_size: 127 | char_list = char_list + [char_padding_symbol]*(char_padding_size-char_number) 128 | assert(len(char_list) == char_padding_size) 129 | else: 130 | ### not padding 131 | pass 132 | for char in char_list: 133 | char_Id.append(char_alphabet.get_index(char)) 134 | chars.append(char_list) 135 | char_Ids.append(char_Id) 136 | else: 137 | if (len(words) > 0) and ((max_sent_length < 0) or (len(words) < max_sent_length)) : 138 | instence_texts.append([words, features, chars, labels]) 139 | instence_Ids.append([word_Ids, feature_Ids, char_Ids,label_Ids]) 140 | words = [] 141 | features = [] 142 | chars = [] 143 | labels = [] 144 | word_Ids = [] 145 | feature_Ids = [] 146 | char_Ids = [] 147 | label_Ids = [] 148 | if (len(words) > 0) and ((max_sent_length < 0) or (len(words) < max_sent_length)) : 149 | instence_texts.append([words, features, chars, labels]) 150 | instence_Ids.append([word_Ids, feature_Ids, char_Ids,label_Ids]) 151 | words = [] 152 | features = [] 153 | chars = [] 154 | labels = [] 155 | word_Ids = [] 156 | feature_Ids = [] 157 | char_Ids = [] 158 | label_Ids = [] 159 | return instence_texts, instence_Ids 160 | 161 | 162 | def build_pretrain_embedding(embedding_path, word_alphabet, embedd_dim=100, norm=True): 163 | embedd_dict = dict() 164 | if embedding_path != None: 165 | embedd_dict, embedd_dim = load_pretrain_emb(embedding_path) 166 | alphabet_size = word_alphabet.size() 167 | scale = np.sqrt(3.0 / embedd_dim) 168 | pretrain_emb = np.empty([word_alphabet.size(), embedd_dim]) 169 | perfect_match = 0 170 | case_match = 0 171 | not_match = 0 172 | for word, index in word_alphabet.iteritems(): 173 | if word in embedd_dict: 174 | if norm: 175 | pretrain_emb[index,:] = norm2one(embedd_dict[word]) 176 | else: 177 | pretrain_emb[index,:] = embedd_dict[word] 178 | perfect_match += 1 179 | elif word.lower() in embedd_dict: 180 | if norm: 181 | pretrain_emb[index,:] = norm2one(embedd_dict[word.lower()]) 182 | else: 183 | pretrain_emb[index,:] = embedd_dict[word.lower()] 184 | case_match += 1 185 | else: 186 | pretrain_emb[index,:] = np.random.uniform(-scale, scale, [1, embedd_dim]) 187 | not_match += 1 188 | pretrained_size = len(embedd_dict) 189 | print("Embedding:\n pretrain word:%s, prefect match:%s, case_match:%s, oov:%s, oov%%:%s"%(pretrained_size, perfect_match, case_match, not_match, (not_match+0.)/alphabet_size)) 190 | return pretrain_emb, embedd_dim 191 | 192 | def norm2one(vec): 193 | root_sum_square = np.sqrt(np.sum(np.square(vec))) 194 | return vec/root_sum_square 195 | 196 | def load_pretrain_emb(embedding_path): 197 | embedd_dim 
= -1 198 | embedd_dict = dict() 199 | with open(embedding_path, 'r', encoding="utf8") as file: 200 | for line in file: 201 | line = line.strip() 202 | if len(line) == 0: 203 | continue 204 | tokens = line.split() 205 | if embedd_dim < 0: 206 | embedd_dim = len(tokens) - 1 207 | elif embedd_dim + 1 != len(tokens): 208 | ## ignore illegal embedding line 209 | continue 210 | # assert (embedd_dim + 1 == len(tokens)) 211 | embedd = np.empty([1, embedd_dim]) 212 | embedd[:] = tokens[1:] 213 | if sys.version_info[0] < 3: 214 | first_col = tokens[0].decode('utf-8') 215 | else: 216 | first_col = tokens[0] 217 | embedd_dict[first_col] = embedd 218 | return embedd_dict, embedd_dim 219 | 220 | if __name__ == '__main__': 221 | a = np.arange(9.0) 222 | print(a) 223 | print(norm2one(a)) 224 | -------------------------------------------------------------------------------- /utils/metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-02-16 09:53:19 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-02-17 22:46:59 6 | 7 | # from operator import add 8 | # 9 | from __future__ import print_function 10 | import sys 11 | 12 | 13 | 14 | ## input as sentence level labels 15 | def get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"): 16 | sent_num = len(golden_lists) 17 | golden_full = [] 18 | predict_full = [] 19 | right_full = [] 20 | right_tag = 0 21 | all_tag = 0 22 | for idx in range(0,sent_num): 23 | # word_list = sentence_lists[idx] 24 | golden_list = golden_lists[idx] 25 | predict_list = predict_lists[idx] 26 | for idy in range(len(golden_list)): 27 | if golden_list[idy] == predict_list[idy]: 28 | right_tag += 1 29 | all_tag += len(golden_list) 30 | if label_type == "BMES" or label_type == "BIOES": 31 | gold_matrix = get_ner_BMES(golden_list) 32 | pred_matrix = get_ner_BMES(predict_list) 33 | else: 34 | gold_matrix = get_ner_BIO(golden_list) 35 | pred_matrix = get_ner_BIO(predict_list) 36 | # print "gold", gold_matrix 37 | # print "pred", pred_matrix 38 | right_ner = list(set(gold_matrix).intersection(set(pred_matrix))) 39 | golden_full += gold_matrix 40 | predict_full += pred_matrix 41 | right_full += right_ner 42 | right_num = len(right_full) 43 | golden_num = len(golden_full) 44 | predict_num = len(predict_full) 45 | if predict_num == 0: 46 | precision = -1 47 | else: 48 | precision = (right_num+0.0)/predict_num 49 | if golden_num == 0: 50 | recall = -1 51 | else: 52 | recall = (right_num+0.0)/golden_num 53 | if (precision == -1) or (recall == -1) or (precision+recall) <= 0.: 54 | f_measure = -1 55 | else: 56 | f_measure = 2*precision*recall/(precision+recall) 57 | accuracy = (right_tag+0.0)/all_tag 58 | # print "Accuracy: ", right_tag,"/",all_tag,"=",accuracy 59 | if label_type.upper().startswith("B-"): 60 | print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num) 61 | else: 62 | print("Right token = ", right_tag, " All token = ", all_tag, " acc = ", accuracy) 63 | return accuracy, precision, recall, f_measure 64 | 65 | 66 | def reverse_style(input_string): 67 | target_position = input_string.index('[') 68 | input_len = len(input_string) 69 | output_string = input_string[target_position:input_len] + input_string[0:target_position] 70 | return output_string 71 | 72 | 73 | def get_ner_BMES(label_list): 74 | # list_len = len(word_list) 75 | # assert(list_len == len(label_list)), "word list size unmatch with label list" 
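    # Each recovered entity is encoded as the string "[start,end]TYPE" (reverse_style puts the bracketed token indices first),
    # e.g. "[3,5]PER" for a PER span covering tokens 3-5 and "[7]LOC" for a single-token entity.
    # get_ner_fmeasure above counts correct entities by intersecting the gold and predicted string lists as sets,
    # so two encodings match only when both the span boundaries and the entity type agree exactly.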
76 | list_len = len(label_list) 77 | begin_label = 'B-' 78 | end_label = 'E-' 79 | single_label = 'S-' 80 | whole_tag = '' 81 | index_tag = '' 82 | tag_list = [] 83 | stand_matrix = [] 84 | for i in range(0, list_len): 85 | # wordlabel = word_list[i] 86 | current_label = label_list[i].upper() 87 | if begin_label in current_label: 88 | if index_tag != '': 89 | tag_list.append(whole_tag + ',' + str(i-1)) 90 | whole_tag = current_label.replace(begin_label,"",1) +'[' +str(i) 91 | index_tag = current_label.replace(begin_label,"",1) 92 | 93 | elif single_label in current_label: 94 | if index_tag != '': 95 | tag_list.append(whole_tag + ',' + str(i-1)) 96 | whole_tag = current_label.replace(single_label,"",1) +'[' +str(i) 97 | tag_list.append(whole_tag) 98 | whole_tag = "" 99 | index_tag = "" 100 | elif end_label in current_label: 101 | if index_tag != '': 102 | tag_list.append(whole_tag +',' + str(i)) 103 | whole_tag = '' 104 | index_tag = '' 105 | else: 106 | continue 107 | if (whole_tag != '')&(index_tag != ''): 108 | tag_list.append(whole_tag) 109 | tag_list_len = len(tag_list) 110 | 111 | for i in range(0, tag_list_len): 112 | if len(tag_list[i]) > 0: 113 | tag_list[i] = tag_list[i]+ ']' 114 | insert_list = reverse_style(tag_list[i]) 115 | stand_matrix.append(insert_list) 116 | # print stand_matrix 117 | return stand_matrix 118 | 119 | 120 | def get_ner_BIO(label_list): 121 | # list_len = len(word_list) 122 | # assert(list_len == len(label_list)), "word list size unmatch with label list" 123 | list_len = len(label_list) 124 | begin_label = 'B-' 125 | inside_label = 'I-' 126 | whole_tag = '' 127 | index_tag = '' 128 | tag_list = [] 129 | stand_matrix = [] 130 | for i in range(0, list_len): 131 | # wordlabel = word_list[i] 132 | current_label = label_list[i].upper() 133 | if begin_label in current_label: 134 | if index_tag == '': 135 | whole_tag = current_label.replace(begin_label,"",1) +'[' +str(i) 136 | index_tag = current_label.replace(begin_label,"",1) 137 | else: 138 | tag_list.append(whole_tag + ',' + str(i-1)) 139 | whole_tag = current_label.replace(begin_label,"",1) + '[' + str(i) 140 | index_tag = current_label.replace(begin_label,"",1) 141 | 142 | elif inside_label in current_label: 143 | if current_label.replace(inside_label,"",1) == index_tag: 144 | whole_tag = whole_tag 145 | else: 146 | if (whole_tag != '')&(index_tag != ''): 147 | tag_list.append(whole_tag +',' + str(i-1)) 148 | whole_tag = '' 149 | index_tag = '' 150 | else: 151 | if (whole_tag != '')&(index_tag != ''): 152 | tag_list.append(whole_tag +',' + str(i-1)) 153 | whole_tag = '' 154 | index_tag = '' 155 | 156 | if (whole_tag != '')&(index_tag != ''): 157 | tag_list.append(whole_tag) 158 | tag_list_len = len(tag_list) 159 | 160 | for i in range(0, tag_list_len): 161 | if len(tag_list[i]) > 0: 162 | tag_list[i] = tag_list[i]+ ']' 163 | insert_list = reverse_style(tag_list[i]) 164 | stand_matrix.append(insert_list) 165 | return stand_matrix 166 | 167 | 168 | 169 | def readSentence(input_file): 170 | in_lines = open(input_file,'r').readlines() 171 | sentences = [] 172 | labels = [] 173 | sentence = [] 174 | label = [] 175 | for line in in_lines: 176 | if len(line) < 2: 177 | sentences.append(sentence) 178 | labels.append(label) 179 | sentence = [] 180 | label = [] 181 | else: 182 | pair = line.strip('\n').split(' ') 183 | sentence.append(pair[0]) 184 | label.append(pair[-1]) 185 | return sentences,labels 186 | 187 | 188 | def readTwoLabelSentence(input_file, pred_col=-1): 189 | in_lines = open(input_file,'r').readlines() 
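    # Expected file layout: one token per line with space-separated columns, the word in column 0,
    # the gold label in column 1 and the predicted label in column pred_col (default -1, i.e. the last column);
    # blank lines mark sentence boundaries and lines containing "##score##" are skipped.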
190 | sentences = [] 191 | predict_labels = [] 192 | golden_labels = [] 193 | sentence = [] 194 | predict_label = [] 195 | golden_label = [] 196 | for line in in_lines: 197 | if "##score##" in line: 198 | continue 199 | if len(line) < 2: 200 | sentences.append(sentence) 201 | golden_labels.append(golden_label) 202 | predict_labels.append(predict_label) 203 | sentence = [] 204 | golden_label = [] 205 | predict_label = [] 206 | else: 207 | pair = line.strip('\n').split(' ') 208 | sentence.append(pair[0]) 209 | golden_label.append(pair[1]) 210 | predict_label.append(pair[pred_col]) 211 | 212 | return sentences,golden_labels,predict_labels 213 | 214 | 215 | def fmeasure_from_file(golden_file, predict_file, label_type="BMES"): 216 | print("Get f measure from file:", golden_file, predict_file) 217 | print("Label format:",label_type) 218 | golden_sent,golden_labels = readSentence(golden_file) 219 | predict_sent,predict_labels = readSentence(predict_file) 220 | acc,P,R,F = get_ner_fmeasure(golden_labels, predict_labels, label_type) 221 | print("P:%s, R:%s, F:%s"%(P,R,F)) 222 | 223 | 224 | 225 | def fmeasure_from_singlefile(twolabel_file, label_type="BMES", pred_col=-1): 226 | sent,golden_labels,predict_labels = readTwoLabelSentence(twolabel_file, pred_col) 227 | acc,P,R,F = get_ner_fmeasure(golden_labels, predict_labels, label_type) 228 | print("P:%s, R:%s, F:%s"%(P,R,F)) 229 | 230 | 231 | 232 | if __name__ == '__main__': 233 | # print "sys:",len(sys.argv) 234 | if len(sys.argv) == 3: 235 | fmeasure_from_singlefile(sys.argv[1],"BMES",int(sys.argv[2])) 236 | else: 237 | fmeasure_from_singlefile(sys.argv[1],"BMES") 238 | 239 | -------------------------------------------------------------------------------- /utils/tagSchemeConverter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie Yang 3 | # @Date: 2017-11-27 16:53:36 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2019-01-09 21:39:10 6 | 7 | 8 | """ 9 | convert NER/Chunking tag schemes, i.e.
BIO->BIOES, BIOES->BIO, IOB->BIO, IOB->BIOES 10 | """ 11 | from __future__ import print_function 12 | 13 | import sys 14 | 15 | 16 | def BIO2BIOES(input_file, output_file): 17 | print("Convert BIO -> BIOES for file:", input_file) 18 | with open(input_file,'r') as in_file: 19 | fins = in_file.readlines() 20 | fout = open(output_file,'w') 21 | words = [] 22 | labels = [] 23 | for line in fins: 24 | if len(line) < 3: 25 | sent_len = len(words) 26 | for idx in range(sent_len): 27 | if "-" not in labels[idx]: 28 | fout.write(words[idx]+" "+labels[idx]+"\n") 29 | else: 30 | label_type = labels[idx].split('-')[-1] 31 | if "B-" in labels[idx]: 32 | if (idx == sent_len - 1) or ("I-" not in labels[idx+1]): 33 | fout.write(words[idx]+" S-"+label_type+"\n") 34 | else: 35 | fout.write(words[idx]+" B-"+label_type+"\n") 36 | elif "I-" in labels[idx]: 37 | if (idx == sent_len - 1) or ("I-" not in labels[idx+1]): 38 | fout.write(words[idx]+" E-"+label_type+"\n") 39 | else: 40 | fout.write(words[idx]+" I-"+label_type+"\n") 41 | fout.write('\n') 42 | words = [] 43 | labels = [] 44 | else: 45 | pair = line.strip('\n').split() 46 | words.append(pair[0]) 47 | labels.append(pair[-1].upper()) 48 | fout.close() 49 | print("BIOES file generated:", output_file) 50 | 51 | 52 | 53 | def BIOES2BIO(input_file, output_file): 54 | print("Convert BIOES -> BIO for file:", input_file) 55 | with open(input_file,'r') as in_file: 56 | fins = in_file.readlines() 57 | fout = open(output_file,'w') 58 | words = [] 59 | labels = [] 60 | for line in fins: 61 | if len(line) < 3: 62 | sent_len = len(words) 63 | for idx in range(sent_len): 64 | if "-" not in labels[idx]: 65 | fout.write(words[idx]+" "+labels[idx]+"\n") 66 | else: 67 | label_type = labels[idx].split('-')[-1] 68 | if "E-" in labels[idx]: 69 | fout.write(words[idx]+" I-"+label_type+"\n") 70 | elif "S-" in labels[idx]: 71 | fout.write(words[idx]+" B-"+label_type+"\n") 72 | else: 73 | fout.write(words[idx]+" "+labels[idx]+"\n") 74 | fout.write('\n') 75 | words = [] 76 | labels = [] 77 | else: 78 | pair = line.strip('\n').split() 79 | words.append(pair[0]) 80 | labels.append(pair[-1].upper()) 81 | fout.close() 82 | print("BIO file generated:", output_file) 83 | 84 | 85 | def IOB2BIO(input_file, output_file): 86 | print("Convert IOB -> BIO for file:", input_file) 87 | with open(input_file,'r') as in_file: 88 | fins = in_file.readlines() 89 | fout = open(output_file,'w') 90 | words = [] 91 | labels = [] 92 | for line in fins: 93 | if len(line) < 3: 94 | sent_len = len(words) 95 | for idx in range(sent_len): 96 | if "I-" in labels[idx]: 97 | label_type = labels[idx].split('-')[-1] 98 | if (idx == 0) or (labels[idx-1] == "O") or (label_type != labels[idx-1].split('-')[-1]): 99 | fout.write(words[idx]+" B-"+label_type+"\n") 100 | else: 101 | fout.write(words[idx]+" "+labels[idx]+"\n") 102 | else: 103 | fout.write(words[idx]+" "+labels[idx]+"\n") 104 | fout.write('\n') 105 | words = [] 106 | labels = [] 107 | else: 108 | pair = line.strip('\n').split() 109 | words.append(pair[0]) 110 | labels.append(pair[-1].upper()) 111 | fout.close() 112 | print("BIO file generated:", output_file) 113 | 114 | 115 | def choose_label(input_file, output_file): 116 | with open(input_file,'r') as in_file: 117 | fins = in_file.readlines() 118 | with open(output_file,'w') as fout: 119 | for line in fins: 120 | if len(line) < 3: 121 | fout.write(line) 122 | else: 123 | pairs = line.strip('\n').split(' ') 124 | fout.write(pairs[0]+" "+ pairs[-1]+"\n") 125 | 126 | 127 | if __name__ == '__main__': 128 | 
'''Convert NER tag schemes among IOB/BIO/BIOES. 129 | For example, to convert the IOB tag scheme to BIO, run: 130 | python tagSchemeConverter.py IOB2BIO input_iob_file output_bio_file 131 | Input data format is the standard CoNLL 2003 data format. 132 | ''' 133 | if sys.argv[1].upper() == "IOB2BIO": 134 | IOB2BIO(sys.argv[2],sys.argv[3]) 135 | elif sys.argv[1].upper() == "BIO2BIOES": 136 | BIO2BIOES(sys.argv[2],sys.argv[3]) 137 | elif sys.argv[1].upper() == "BIOES2BIO": 138 | BIOES2BIO(sys.argv[2],sys.argv[3]) 139 | elif sys.argv[1].upper() == "IOB2BIOES": 140 | IOB2BIO(sys.argv[2],"temp") 141 | BIO2BIOES("temp",sys.argv[3]) 142 | else: 143 | print("Argument error: sys.argv[1] should be one of \"IOB2BIO/BIO2BIOES/BIOES2BIO/IOB2BIOES\"") 144 | --------------------------------------------------------------------------------
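A minimal usage sketch of the two utility modules above; the file paths are placeholders for this illustration, not files shipped with the repository:

    # hypothetical driver script: convert tag schemes, then score a prediction file against gold labels
    from utils.tagSchemeConverter import IOB2BIO, BIO2BIOES
    from utils.metric import readSentence, get_ner_fmeasure

    # normalize the tag scheme in two steps: IOB -> BIO -> BIOES
    IOB2BIO("train.iob", "train.bio")        # relabel I- tags that open an entity as B-
    BIO2BIOES("train.bio", "train.bioes")    # add E-/S- tags for entity ends and singletons

    # score predictions: both files use CoNLL columns with the label in the last column
    gold_sents, gold_labels = readSentence("test.gold.bmes")
    pred_sents, pred_labels = readSentence("test.pred.bmes")
    acc, p, r, f = get_ner_fmeasure(gold_labels, pred_labels, label_type="BMES")
    print("acc: %.4f, p: %.4f, r: %.4f, f: %.4f" % (acc, p, r, f))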