├── .gitignore ├── LICENSE ├── README.md ├── data ├── get_data.sh └── test.txt ├── example.py ├── mylib ├── __init__.py ├── truecaser.py ├── truecaser_predictor.py └── truecaser_reader.py ├── requirements.txt ├── truecaser.json ├── truecaser_demo.ipynb └── word_eval.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | *~ 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 
102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Truecaser 2 | 3 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/mayhewsw/pytorch-truecaser/master) 4 | 5 | This is a simple neural truecaser written with allennlp, and based loosely on ([Susanto et al, 2016](https://aclweb.org/anthology/D16-1225)). They have an 6 | implementation [here](https://gitlab.com/raymondhs/char-rnn-truecase), but being written in Lua, it's a little hard to use. 7 | 8 | We provide [pre-trained models](https://github.com/mayhewsw/pytorch-truecaser/releases/tag/v1.0) that can be used for truecasing English and German right out of the box. The English model is trained on the [standard Wikipedia data split](http://www.cs.pomona.edu/~dkauchak/simplification/data.v1/data.v1.split.tar.gz) from (Coster and Kauchak, 2011), and achieves an F1 score of **93.01** on test. This is comparable to the best F1 of (Susanto et al 2016) of **93.19**. 
9 | 10 | ### Requirements 11 | 12 | * python (3.6) 13 | * [allennlp](https://github.com/allenai/allennlp/) (0.8.2) 14 | 15 | ### Model 16 | This model treats each sentence as a sequence of characters (spaces are included in the sequence). Each character takes a binary label 17 | of "U" if uppercase and "L" if lowercase. For example, the word `tRuEcasIng` would take the labels `LULULLLULL` 18 | 19 | We encode the sequence using a bidirectional LSTM with 2 hidden layers, 50 dimensional character embeddings (input), 150 dimensional hidden size, and dropout of 0.25. 20 | 21 | ### Scoring 22 | A cautionary note is in order. The pytorch model optimizes for _character level_ F1 score, but it is more common to measure 23 | on the _word level_. So, after training a model, get a comparable score using `word_eval.py` (which I copied from [here](https://gitlab.com/raymondhs/char-rnn-truecase/blob/master/word_eval.py)) 24 | 25 | For example, to score on the Wiki test data: 26 | 27 | ```bash 28 | $ allennlp predict wiki-truecaser-model.tar.gz data/data.v1.split/normal.testing.txt --use-dataset-reader --output-file out_preds.txt --include-package mylib --predictor truecaser-predictor 29 | $ python word_eval.py data/data.v1.split/normal.testing.txt out_preds.txt 30 | ``` 31 | 32 | ### Usage 33 | 34 | If you just want to predict, you can run: 35 | ```bash 36 | $ allennlp predict wiki-truecaser-model.tar.gz data/test.txt --output-file test-out.txt --include-package mylib --use-dataset-reader --predictor truecaser-predictor 37 | ``` 38 | 39 | Where `data/test.txt` is a file with one sentence per line. 40 | 41 | See `example.py` for an example of how to use it programmatically. 42 | 43 | 44 | #### Training 45 | The dataset reader requires text that has one sentence per line. The model expects tokenized text. If your text is already tokenized 46 | (the Wiki data is), then you can use `just_spaces` as the `word_splitter` in the config. 
#!/bin/bash
# Download and unpack the Wikipedia data split referenced by truecaser.json.
# Run from inside the data/ directory (see the README: `cd data && ./get_data.sh`),
# because the archive is downloaded to and extracted in the current directory.

# Stop at the first failing command so tar never runs on a missing or partial download.
set -e

wget http://www.cs.pomona.edu/~dkauchak/simplification/data.v1/data.v1.split.tar.gz
tar xzvf data.v1.split.tar.gz
@Model.register("truecaser")
class TrueCaser(SimpleTagger):
    """
    A thin subclass of SimpleTagger that additionally tracks an F1 measure.

    The positive class of the F1 measure is the label "U" (uppercase). Note
    that the score is computed over characters, not over tokens.
    """

    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, text_field_embedder, encoder, initializer, regularizer)
        # The F1 measure treats the "U" label as the positive class.
        uppercase_index = self.vocab.get_token_index("U", namespace="labels")
        self.metrics["f1"] = F1Measure(positive_label=uppercase_index)

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """
        Collect the current value of every registered metric.

        Metrics whose name contains "f1" return a (precision, recall, f1)
        triple, which is flattened into the keys "p", "r" and "f1". All
        other metrics are reported under their own name.
        """
        collected = {}
        for name, metric in self.metrics.items():
            value = metric.get_metric(reset)
            if "f1" not in name:
                collected[name] = value
                continue
            precision, recall, f1_score = value
            collected["p"] = precision
            collected["r"] = recall
            collected["f1"] = f1_score
        return collected
@Predictor.register('truecaser-predictor')
class TruecaserPredictor(Predictor):
    """
    This is basically a copy of the SentenceTagger from allennlp. It is
    modified to dump output in a more sensible manner: every prediction
    carries a ``"pred"`` entry holding the truecased sentence, and
    ``dump_line`` writes just that string.
    """
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self.model = model

    def predict(self, sent: str) -> JsonDict:
        """
        Truecase a single sentence given as a plain string.

        :param sent: the sentence to truecase (it is lowercased before prediction).
        :return: the model output, including ``"words"``, ``"tags"`` and ``"pred"``.
        """
        js = {"sentence": sent}
        return self.predict_instance(self._json_to_instance(js))

    @staticmethod
    def _add_truecased_text(output: JsonDict, instance: Instance) -> JsonDict:
        """
        Add ``"words"`` (the input characters as strings) and ``"pred"``
        (the truecased sentence) to one model output, in place.
        """
        chars = [str(token) for token in instance["tokens"].tokens]
        output["words"] = chars

        # All chars are lower case by default; only those tagged "U" are
        # uppercased. zip stops at the shorter sequence, so any tags beyond
        # the last character are ignored.
        output["pred"] = "".join(
            char.upper() if tag == "U" else char
            for char, tag in zip(chars, output["tags"])
        )
        return output

    @overrides
    def predict_instance(self, sent: Instance) -> JsonDict:
        output = super().predict_instance(sent)
        return self._add_truecased_text(output, sent)

    @overrides
    def predict_batch_instance(self, sents: List[Instance]) -> List[JsonDict]:
        outputs = super().predict_batch_instance(sents)
        # Post-process exactly like predict_instance so that dump_line (which
        # reads "pred") also works when predictions are made in batches.
        for output, sent in zip(outputs, sents):
            self._add_truecased_text(output, sent)
        return outputs

    def load_line(self, line: str) -> JsonDict:
        """
        This will usually be overridden with use_dataset_reader = True on the command line.

        :param line: one raw input line.
        :return: the JSON dict expected by ``_json_to_instance``.
        """
        return {"sentence": line}

    def dump_line(self, outputs: JsonDict) -> str:
        """
        Format one prediction for the output file: the truecased sentence
        followed by a newline.
        """
        return outputs["pred"] + "\n"

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like ``{"sentence": "..."}``.
        Lowercases the sentence and turns every character (spaces included)
        into a token, then builds an instance without tags.
        """
        sentence = json_dict["sentence"]
        chars = [Token(c) for c in sentence.lower()]
        return self._dataset_reader.text_to_instance(chars)
@DatasetReader.register("truecaser_reader")
class TrueCaserDatasetReader(DatasetReader):
    """
    Reads a text file with one sentence per line and yields one instance per
    sentence. Every character of the lowercased sentence (spaces included)
    becomes a token, labelled "U" if the original character was uppercase
    and "L" otherwise.
    """

    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 word_splitter = None) -> None:
        """
        :param token_indexers: indexers for the character tokens; defaults to
            a single-id indexer under the "tokens" key.
        :param word_splitter: accepted so existing configs keep loading, but
            not used by this reader.
        """
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
        """
        Build an instance from character tokens and, when given, their case labels.

        :param tokens: one Token per character of the lowercased sentence.
        :param tags: optional "U"/"L" label per token; omitted at prediction time.
        :return: an Instance with a "tokens" field and, if tags were given, a "tags" field.
        """
        token_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": token_field}

        if tags:
            label_field = SequenceLabelField(labels=tags, sequence_field=token_field)
            fields["tags"] = label_field

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """
        Yield one labelled instance per non-empty line of ``file_path``.

        Blank lines are skipped, as are lines whose lowercased form has a
        different length than the original (the labels could not be aligned).
        """
        import logging
        logger = logging.getLogger(__name__)

        # Read as UTF-8 explicitly (word_eval.py reads the same files as utf8)
        # instead of relying on the platform's default encoding.
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                # I want a sentence represented by a string.
                tokenized_sent = line.strip()
                if not tokenized_sent:
                    # Nothing to truecase; an empty sentence would also produce
                    # an instance without a "tags" field.
                    continue
                chars = [Token(c) for c in tokenized_sent.lower()]
                case_labels = ["U" if char.isupper() else "L" for char in tokenized_sent]
                # Lowercasing can change the number of characters for some
                # Unicode letters, which breaks the one-label-per-char alignment.
                if len(chars) != len(case_labels):
                    logger.warning("Mismatching sentence lengths! %s", tokenized_sent)
                    continue
                yield self.text_to_instance(chars, case_labels)


if __name__ == "__main__":
    dr = TrueCaserDatasetReader()
    for i in dr._read("tmp"):
        print(i)
"num_epochs": 75, 49 | "grad_norm": 5.0, 50 | "patience": 25, 51 | "cuda_device": -1 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /truecaser_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%%capture\n", 10 | "!wget https://github.com/mayhewsw/pytorch-truecaser/releases/download/v1.0/wiki-truecaser-model-en.tar.gz\n", 11 | "!wget https://github.com/mayhewsw/pytorch-truecaser/releases/download/v1.0/wmt-truecaser-model-de.tar.gz" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from allennlp.predictors.predictor import Predictor\n", 21 | "from mylib import *\n", 22 | "from allennlp.models.archival import load_archive\n", 23 | "\n", 24 | "archive = load_archive(\"wiki-truecaser-model-en.tar.gz\")\n", 25 | "predictor = Predictor.from_archive(archive, \"truecaser-predictor\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "out = predictor.predict(\"jared smith lives in paris .\")\n", 35 | "outline = predictor.dump_line(out)\n", 36 | "print(outline)" 37 | ] 38 | } 39 | ], 40 | "metadata": { 41 | "kernelspec": { 42 | "display_name": "Python 3", 43 | "language": "python", 44 | "name": "python3" 45 | }, 46 | "language_info": { 47 | "codemirror_mode": { 48 | "name": "ipython", 49 | "version": 3 50 | }, 51 | "file_extension": ".py", 52 | "mimetype": "text/x-python", 53 | "name": "python", 54 | "nbconvert_exporter": "python", 55 | "pygments_lexer": "ipython3", 56 | "version": "3.6.7" 57 | } 58 | }, 59 | "nbformat": 4, 60 | "nbformat_minor": 2 61 | } 62 | -------------------------------------------------------------------------------- /word_eval.py: 
# This code is adapted from https://gitlab.com/raymondhs/char-rnn-truecase/blob/master/word_eval.py
# Thanks to the authors.
"""Word-level truecasing evaluation.

Usage: python word_eval.py GOLD_FILE PRED_FILE

Both files hold one whitespace-tokenized sentence per line. The script prints
accuracy, precision, recall and F1 (as percentages) on the word level.
"""

import codecs
import sys


def is_cased(word):
    """Return True if *word* contains at least one character that lowercasing changes."""
    return word != word.lower()


def read_lines(path):
    """Read all lines of the UTF-8 file at *path*, closing the file afterwards."""
    with codecs.open(path, 'r', encoding='utf8') as handle:
        return handle.readlines()


def score_lines(gold_lines, pred_lines):
    """Score predicted sentences against gold sentences on the word level.

    Lines are paired in order; surplus lines on either side are ignored.
    Within a pair, words are compared by position. A gold word with no
    predicted counterpart counts as an incorrect prediction, and surplus
    predicted words are ignored.

    Returns a tuple ``(accuracy, precision, recall, f1)``. Accuracy is a
    percentage in [0, 100]; precision, recall and F1 are fractions in [0, 1]
    computed over words that are not all-lowercase. Every value is 0 when it
    would otherwise be undefined (no words, nothing proposed, nothing gold).
    """
    num_correct = 0
    num_changed_correct = 0
    num_gold = 0
    num_proposed = 0
    total = 0
    short_lines = 0
    for gold_line, pred_line in zip(gold_lines, pred_lines):
        words = gold_line.strip().split()
        pred_words = pred_line.strip().split()
        missing = len(words) - len(pred_words)
        if missing > 0:
            # Pad so that every gold word is still scored (as wrong).
            short_lines += 1
            pred_words = pred_words + [None] * missing
        for word, pred_word in zip(words, pred_words):
            if pred_word is not None and is_cased(pred_word):
                num_proposed += 1
            if is_cased(word):
                num_gold += 1
            if word == pred_word:
                num_correct += 1
                if is_cased(word):
                    num_changed_correct += 1
        total += len(words)

    if short_lines:
        sys.stderr.write('Warning: %d predicted line(s) have fewer words than gold; '
                         'missing words are scored as incorrect.\n' % short_lines)

    acc = num_correct * 100.0 / total if total else 0.0
    precision = float(num_changed_correct) / num_proposed if num_proposed else 0.0
    recall = float(num_changed_correct) / num_gold if num_gold else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    # The original script reports all three as zero as soon as one is undefined.
    if not (num_proposed and num_gold):
        precision = recall = f1 = 0.0
    return acc, precision, recall, f1


def main(argv=None):
    """Command-line entry point: score PRED_FILE against GOLD_FILE and print the results."""
    args = sys.argv[1:] if argv is None else argv
    if len(args) != 2:
        sys.stderr.write('Usage: python word_eval.py GOLD_FILE PRED_FILE\n')
        return 2

    gold_sent = read_lines(args[0])
    pred_sent = read_lines(args[1])
    if len(gold_sent) != len(pred_sent):
        sys.stderr.write('Warning: %d gold line(s) but %d predicted line(s); '
                         'only the first %d are scored.\n'
                         % (len(gold_sent), len(pred_sent),
                            min(len(gold_sent), len(pred_sent))))

    acc, P, R, F = score_lines(gold_sent, pred_sent)
    print('Accuracy: %.2f' % acc)
    print('Precision: %.2f' % (P*100))
    print('Recall: %.2f' % (R*100))
    print('F1: %.2f' % (F*100))
    print('%.2f & %.2f & %.2f & %.2f' % (acc, P*100, R*100, F*100))
    return 0


if __name__ == "__main__":
    sys.exit(main())