├── .gitignore ├── LICENSE ├── README.md ├── __images__ └── seq-to-seq-DNC.jpg ├── environment.yml ├── notebooks ├── .gitkeep ├── Data Preparation.ipynb ├── Preprocessing.ipynb └── Text Normalization Demo.ipynb ├── results ├── .gitkeep ├── base-paper_classwise_accuracy.csv ├── english │ ├── Semiotic_Class-wise_Accuracy.png │ ├── classwise_accuracy.csv │ ├── mistakes.csv │ └── normalized.csv └── russian │ ├── .gitkeep │ ├── Semiotic_Class-wise_Accuracy.png │ ├── classwise_accuracy.csv │ ├── mistakes.csv │ └── normalized.csv ├── setup.sh └── src ├── .gitkeep ├── DNCnormalize.py ├── Encoder.py ├── XGBclassify.py ├── classification_report.py ├── lib ├── access.py ├── addressing.py ├── dnc.py ├── seq2seq.py └── util.py ├── preprocessing.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | MANIFEST 2 | build 3 | dist 4 | _build 5 | docs/man/*.gz 6 | docs/source/api/generated 7 | docs/source/config.rst 8 | docs/gh-pages 9 | notebook/i18n/*/LC_MESSAGES/*.mo 10 | notebook/i18n/*/LC_MESSAGES/nbjs.json 11 | notebook/static/components 12 | notebook/static/style/*.min.css* 13 | notebook/static/*/js/built/ 14 | notebook/static/*/built/ 15 | notebook/static/built/ 16 | notebook/static/*/js/main.min.js* 17 | notebook/static/lab/*bundle.js 18 | node_modules 19 | *.py[co] 20 | __pycache__ 21 | *.egg-info 22 | *~ 23 | *.bak 24 | .ipynb_checkpoints 25 | .tox 26 | .DS_Store 27 | \#*# 28 | .#* 29 | .coverage 30 | 31 | *.swp 32 | *.map 33 | .idea/ 34 | Read the Docs 35 | config.rst 36 | 37 | /.project 38 | /.pydevproject 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text Normalization using Memory Augmented Neural Networks 2 | 3 | The Text Normalization Demo notebook and the accompanying paper "Text Normalization using Memory Augmented Neural Networks" demonstrates an accuracy of 99.5% on the Text Normalization Challenge by Richard Sproat and Navdeep Jaitly. 
An earlier version of the approach used here secured 6th position in the [Kaggle Russian Text Normalization Challenge](https://www.kaggle.com/c/text-normalization-challenge-russian-language) hosted by Google's Text Normalization Research Group. 4 | 5 | Go straight to the Text Normalization Demo Notebook 6 | 7 | ## Architecture 8 | Two models are used for text normalization. An XGBoost model first classifies each token as to-be-normalized or remain-self. The to-be-normalized tokens are then fed character-by-character to our proposed Sequence to Sequence DNC model. 9 | 10 | More details about the architecture and implementation can be found in the original paper. 11 | 12 |
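The two-stage flow can be pictured with a short Python sketch. This is illustrative only and is not the code under `src/`: `token_features`, the toy labels, and `dummy_dnc` (standing in for the trained DNC decoder) are hypothetical.

```python
# Illustrative sketch of the two-stage pipeline (NOT the code in src/).
# An XGBoost classifier routes tokens; only "to-be-normalized" tokens are
# handed, character by character, to a seq2seq normalizer. The DNC decoder
# is replaced by a dummy stand-in here.
from typing import Callable, List

import numpy as np
import xgboost as xgb


def token_features(token: str) -> List[float]:
    # Toy character-level features; the real classifier uses richer context.
    return [len(token),
            float(any(c.isdigit() for c in token)),
            float(any(not c.isalnum() for c in token))]


def build_classifier(tokens: List[str], labels: List[int]) -> xgb.XGBClassifier:
    # Binary labels: 1 = to-be-normalized, 0 = remain-self.
    clf = xgb.XGBClassifier(n_estimators=50, max_depth=3)
    clf.fit(np.array([token_features(t) for t in tokens]), np.array(labels))
    return clf


def normalize(tokens: List[str],
              clf: xgb.XGBClassifier,
              seq2seq: Callable[[List[str]], str]) -> List[str]:
    # Keep remain-self tokens untouched; feed the rest to the seq2seq model.
    normalized = []
    for tok in tokens:
        to_normalize = clf.predict(np.array([token_features(tok)]))[0] == 1
        normalized.append(seq2seq(list(tok)) if to_normalize else tok)
    return normalized


def dummy_dnc(chars: List[str]) -> str:
    # Stand-in for the trained sequence-to-sequence DNC decoder.
    return " ".join(chars)


if __name__ == "__main__":
    clf = build_classifier(["the", "cat", "2017", "3kg", "hello", "$5"],
                           [0, 0, 1, 1, 0, 1])
    print(normalize(["the", "2017", "price"], clf, dummy_dnc))
```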
13 | 14 | **Sequence to sequence DNC** 15 | 16 | ![Sequence to sequence DNC](__images__/seq-to-seq-DNC.jpg) 17 | 18 | 19 |
20 | 21 | ## Results : 22 | 23 | ### 1. Normalizing English Data 24 | 25 |
26 | 27 | **Semiotic Classwise Accuracy** 28 | 29 | | semiotic-class | accuracy | count | correct | 30 | |----------------|--------------------|-------|---------| 31 | | ALL | 0.994267233453397 | 92451 | 91921 | 32 | | ADDRESS | 1.0 | 4 | 4 | 33 | | CARDINAL | 0.9942140790742526 | 1037 | 1031 | 34 | | DATE | 0.9971751412429378 | 2832 | 2824 | 35 | | DECIMAL | 0.9891304347826086 | 92 | 91 | 36 | | DIGIT | 0.7954545454545454 | 44 | 35 | 37 | | ELECTRONIC | 0.7346938775510204 | 49 | 36 | 38 | | FRACTION | 0.6875 | 16 | 11 | 39 | | LETTERS | 0.971611071682044 | 1409 | 1369 | 40 | | MEASURE | 0.971830985915493 | 142 | 138 | 41 | | MONEY | 0.972972972972973 | 37 | 36 | 42 | | ORDINAL | 0.9805825242718447 | 103 | 101 | 43 | | PLAIN | 0.9939611747724394 | 67894 | 67484 | 44 | | PUNCT | 0.9988729854615125 | 17746 | 17726 | 45 | | TELEPHONE | 0.918918918918919 | 37 | 34 | 46 | | TIME | 0.75 | 8 | 6 | 47 | | VERBATIM | 0.994005994005994 | 1001 | 995 | 48 | 49 |
50 | 51 | ### 2. Normalizing Russian Data 52 | 53 | **Semiotic Classwise Accuracy** 54 | 55 |
56 | 57 | | semiotic-class | accuracy | count | correct | 58 | |----------------|--------------------|-------|---------| 59 | | ALL | 0.9928752306965964 | 93196 | 92532 | 60 | | CARDINAL | 0.9417922948073701 | 2388 | 2249 | 61 | | DATE | 0.9732441471571907 | 1495 | 1455 | 62 | | DECIMAL | 0.9 | 60 | 54 | 63 | | DIGIT | 1.0 | 16 | 16 | 64 | | ELECTRONIC | 0.6041666666666666 | 48 | 29 | 65 | | FRACTION | 0.6086956521739131 | 23 | 14 | 66 | | LETTERS | 0.9907608695652174 | 1840 | 1823 | 67 | | MEASURE | 0.8978102189781022 | 411 | 369 | 68 | | MONEY | 0.8947368421052632 | 19 | 17 | 69 | | ORDINAL | 0.9461358313817331 | 427 | 404 | 70 | | PLAIN | 0.994688407139769 | 64764 | 64420 | 71 | | PUNCT | 0.9998519542045006 | 20264 | 20261 | 72 | | TELEPHONE | 0.8202247191011236 | 89 | 73 | 73 | | TIME | 0.75 | 8 | 6 | 74 | | VERBATIM | 0.9985119047619048 | 1344 | 1342 | 75 | 76 |
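The class-wise accuracy tables above (and the CSVs under `results/`) follow a simple count/correct breakdown per semiotic class. A minimal pandas sketch of that computation, assuming a predictions table with a `semiotic` class column, a model-output `after` column and a reference `truth` column (the actual column names in `results/*/normalized.csv` may differ):

```python
import pandas as pd


def classwise_accuracy(df: pd.DataFrame) -> pd.DataFrame:
    # One row per semiotic class plus an overall "ALL" row.
    correct = (df["after"] == df["truth"])
    per_class = pd.DataFrame({
        "count": df.groupby("semiotic").size(),
        "correct": correct.groupby(df["semiotic"]).sum(),
    })
    per_class["accuracy"] = per_class["correct"] / per_class["count"]
    overall = pd.DataFrame({"accuracy": [correct.mean()],
                            "count": [len(df)],
                            "correct": [int(correct.sum())]},
                           index=["ALL"])
    return pd.concat([overall, per_class])[["accuracy", "count", "correct"]]


# Example usage (paths and columns assumed):
# report = classwise_accuracy(pd.read_csv("results/english/normalized.csv"))
# report.to_csv("results/english/classwise_accuracy.csv", index_label="semiotic-class")
```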
77 | 78 | ## How to run? 79 | 80 | **Requirements:** 81 | - [Jupyter Notebook](http://jupyter.org/) 82 | - [Anaconda Package Manager](https://anaconda.org/) 83 | - rest will be installed by anaconda (see below) 84 | 85 | **Follow these steps for a demonstration:** 86 | 87 | 1. Clone the repo 88 | 2. Download and extract the required data. 89 | ``` 90 | $ sh setup.sh 91 | ``` 92 | 2. Create & activate an environment using the provided file 93 | ``` 94 | $ conda env create -f environment.yml 95 | $ source activate deep-tf 96 | ``` 97 | 3. Start a Jupyter Notebook server 98 | 4. Open 'notebooks/Text Normalization Demo.ipynb' 99 | 5. Set the language to English or Russian below the 'Global Config' cell 100 | ```python 101 | lang = 'english' 102 | # lang = 'russian' 103 | ``` 104 | 6. Run the notebook 105 | 106 | **Full Requirements:** 107 | 108 | - numpy 1.13.3 109 | - pandas 0.21.0 110 | - matplotlib 2.1.0 111 | - watermark 1.5.0 112 | - seaborn 0.8.1 113 | - sklearn 0.19.1 114 | - xgboost 0.6 115 | - tensorflow 1.3.0 116 | 117 | ## Authors 118 | 1. Subhojeet Pramanik (http://github.com/subho406) 119 | 2. Aman Hussain (https://github.com/AmanDaVinci) 120 | 121 | **Acknowledgements** 122 | 123 | Differentiable Neural Computer, Tensorflow Implementation: https://github.com/deepmind/dnc 124 | -------------------------------------------------------------------------------- /__images__/seq-to-seq-DNC.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/__images__/seq-to-seq-DNC.jpg -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: deep-tf 2 | channels: 3 | - conda-forge 4 | - anaconda 5 | - defaults 6 | dependencies: 7 | - _nb_ext_conf=0.4.0=py36_1 8 | - anaconda-client=1.6.5=py36h19c0dcd_0 9 | - asn1crypto=0.22.0=py36h265ca7c_1 10 | - backports=1.0=py36hfa02d7e_1 11 | - backports.weakref=1.0rc1=py36_0 12 | - bcolz=1.1.2=py36h00f5784_0 13 | - bleach=1.5.0=py36_0 14 | - bokeh=0.12.10=py36hbb0e44a_0 15 | - bzip2=1.0.6=h6d464ef_2 16 | - ca-certificates=2017.08.26=h1d4fec5_0 17 | - certifi=2017.11.5=py36hf29ccca_0 18 | - cffi=1.10.0=py36had8d393_1 19 | - chardet=3.0.4=py36h0f667ec_1 20 | - click=6.7=py36h5253387_0 21 | - cloudpickle=0.5.2=py36h84cdd9c_0 22 | - clyent=1.2.2=py36h7e57e65_1 23 | - cryptography=2.0.3=py36ha225213_1 24 | - cudatoolkit=8.0=3 25 | - cudnn=6.0.21=cuda8.0_0 26 | - cycler=0.10.0=py36h93f1223_0 27 | - dask=0.16.0=py36h73d177f_0 28 | - dask-core=0.16.0=py36ha827fd6_0 29 | - dbus=1.10.22=h3b5a359_0 30 | - decorator=4.1.2=py36hd076ac8_0 31 | - distributed=1.20.0=py36h1c9f478_0 32 | - entrypoints=0.2.3=py36h1aec115_2 33 | - expat=2.2.5=he0dffb1_0 34 | - fontconfig=2.12.4=h88586e7_1 35 | - freetype=2.8=hab7d2ae_1 36 | - glib=2.53.6=h5d9569c_2 37 | - gmp=6.1.2=h6c8ec71_1 38 | - gst-plugins-base=1.12.2=he3457e5_0 39 | - gstreamer=1.12.2=h4f93127_0 40 | - hdf5=1.10.1=h9caa474_1 41 | - heapdict=1.0.0=py36h79797d7_0 42 | - html5lib=0.9999999=py36_0 43 | - icu=58.2=h9c2bf20_1 44 | - idna=2.6=py36h82fb2a8_1 45 | - intel-openmp=2018.0.0=hc7b2577_8 46 | - ipykernel=4.6.1=py36hbf841aa_0 47 | - ipython=6.2.1=py36h88c514a_1 48 | - ipython_genutils=0.2.0=py36hb52b0d5_0 49 | - ipywidgets=7.0.5=py36h31d6531_0 50 | - jedi=0.10.2=py36h552def0_0 51 | - jinja2=2.9.6=py36h489bce4_1 52 | - jpeg=9b=h024ee3a_2 53 | - 
jsonschema=2.6.0=py36h006f8b5_0 54 | - jupyter_client=5.1.0=py36h614e9ea_0 55 | - jupyter_core=4.4.0=py36h7c827e3_0 56 | - libedit=3.1=heed3624_0 57 | - libffi=3.2.1=hd88cf55_4 58 | - libgcc=7.2.0=h69d50b8_2 59 | - libgcc-ng=7.2.0=h7cc24e2_2 60 | - libgfortran-ng=7.2.0=h9f7466a_2 61 | - libgpuarray=0.6.9=0 62 | - libpng=1.6.32=hbd3595f_4 63 | - libprotobuf=3.4.1=h5b8497f_0 64 | - libsodium=1.0.15=hf101ebd_0 65 | - libstdcxx-ng=7.2.0=h7a57d05_2 66 | - libxcb=1.12=hcd93eb1_4 67 | - libxml2=2.9.4=h2e8b1d7_6 68 | - locket=0.2.0=py36h787c0ad_1 69 | - lzo=2.10=h49e0be7_2 70 | - mako=1.0.7=py36h0727276_0 71 | - markdown=2.6.9=py36_0 72 | - markupsafe=1.0=py36hd9260cd_1 73 | - matplotlib=2.1.0=py36hba5de38_0 74 | - mistune=0.8.1=py36h3d5977c_0 75 | - mkl=2018.0.1=h19d6760_4 76 | - mkl-service=1.1.2=py36h17a0993_4 77 | - msgpack-python=0.4.8=py36hec4c5d1_0 78 | - nb_anacondacloud=1.4.0=py36_0 79 | - nb_conda=2.2.1=py36h8118bb2_0 80 | - nb_conda_kernels=2.1.0=py36_0 81 | - nbconvert=5.3.1=py36hb41ffb7_0 82 | - nbformat=4.4.0=py36h31c9010_0 83 | - nbpresent=3.0.2=py36h5f95a39_1 84 | - ncurses=6.0=h9df7e31_2 85 | - nose=1.3.7=py36hcdf7029_2 86 | - notebook=5.2.1=py36h690a4eb_0 87 | - numexpr=2.6.2=py36hc561933_2 88 | - numpy=1.13.3=py36ha12f23b_0 89 | - openssl=1.0.2m=h26d622b_1 90 | - pandas=0.21.0=py36h78bd809_1 91 | - pandoc=1.19.2.1=hea2e7c5_1 92 | - pandocfilters=1.4.2=py36ha6701b7_1 93 | - partd=0.3.8=py36h36fd896_0 94 | - patsy=0.4.1=py36ha3be15e_0 95 | - pcre=8.41=hc27e229_1 96 | - pexpect=4.2.1=py36h3b9d41b_0 97 | - pickleshare=0.7.4=py36h63277f8_0 98 | - pip=9.0.1=py36h6c6f9ce_4 99 | - prompt_toolkit=1.0.15=py36h17d85b1_0 100 | - protobuf=3.4.1=py36h306e679_0 101 | - psutil=5.4.0=py36h84c53db_0 102 | - ptyprocess=0.5.2=py36h69acd42_0 103 | - pycparser=2.18=py36hf9f622e_1 104 | - pygments=2.2.0=py36h0d3125c_0 105 | - pygpu=0.6.9=py36_0 106 | - pyopenssl=17.2.0=py36h5cc804b_0 107 | - pyparsing=2.2.0=py36hee85983_1 108 | - pyqt=5.6.0=py36h0386399_5 109 | - pysocks=1.6.7=py36hd97a5b1_1 110 | - pytables=3.4.2=py36h3b5282a_2 111 | - python=3.6.3=h6c0c0dc_5 112 | - python-dateutil=2.6.1=py36h88d3b88_1 113 | - pytz=2017.3=py36h63b9c63_0 114 | - pyyaml=3.12=py36hafb9ca4_1 115 | - pyzmq=16.0.3=py36he2533c7_0 116 | - qt=5.6.2=h974d657_12 117 | - readline=7.0=ha6073c6_4 118 | - requests=2.18.4=py36he2e5f8d_1 119 | - scikit-learn=0.19.1=py36h7aa7ec6_0 120 | - scipy=1.0.0=py36hbf646e7_0 121 | - seaborn=0.8.1=py36hfad7ec4_0 122 | - setuptools=36.5.0=py36he42e2e1_0 123 | - simplegeneric=0.8.1=py36h2cb9092_0 124 | - sip=4.18.1=py36h51ed4ed_2 125 | - six=1.11.0=py36h372c433_1 126 | - sortedcontainers=1.5.7=py36hdf89491_0 127 | - sqlite=3.20.1=hb898158_2 128 | - statsmodels=0.8.0=py36h8533d0b_0 129 | - tblib=1.3.2=py36h34cf8b6_0 130 | - tensorflow-gpu=1.3.0=0 131 | - tensorflow-gpu-base=1.3.0=py36cuda8.0cudnn6.0_1 132 | - tensorflow-tensorboard=0.1.5=py36_0 133 | - terminado=0.6=py36ha25a19f_0 134 | - testpath=0.3.1=py36h8cadb63_0 135 | - theano=0.9.0=py36_0 136 | - tk=8.6.7=hc745277_3 137 | - toolz=0.8.2=py36h81f2dff_0 138 | - tornado=4.5.2=py36h1283b2a_0 139 | - traitlets=4.3.2=py36h674d592_0 140 | - urllib3=1.22=py36hbe7ace6_0 141 | - wcwidth=0.1.7=py36hdf4376a_0 142 | - webencodings=0.5.1=py36h800622e_1 143 | - werkzeug=0.12.2=py36hc703753_0 144 | - wheel=0.30.0=py36hfd4bba0_1 145 | - widgetsnbextension=3.0.8=py36h25a1d49_0 146 | - xz=5.2.3=h55aa19d_2 147 | - yaml=0.1.7=had09818_2 148 | - zeromq=4.2.2=hbedb6e5_2 149 | - zict=0.1.3=py36h3a3bf81_0 150 | - zlib=1.2.11=ha838bed_2 151 | - h5py=2.7.1=py36_2 152 
| - keras=1.2.2=py36_0 153 | - watermark=1.5.0=py36_0 154 | - xgboost=0.6a2=py36_2 155 | - pip: 156 | - absl-py==0.1.6 157 | - dm-sonnet==1.14 158 | - ipython-genutils==0.2.0 159 | - jupyter-client==5.1.0 160 | - jupyter-core==4.4.0 161 | - nb-anacondacloud==1.4.0 162 | - nb-conda==2.2.1 163 | - nb-conda-kernels==2.1.0 164 | - prompt-toolkit==1.0.15 165 | - tables==3.4.2 166 | - tensorflow==1.3.0 167 | prefix: /home/amanthevinci/anaconda3/envs/deep-tf 168 | 169 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/notebooks/.gitkeep -------------------------------------------------------------------------------- /notebooks/Preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preprocessing\n", 8 | "---" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "These notebooks and the accompanying paper (Text Normalization using Memory Augmented Neural Networks) demonstrates an accuracy of 99.4% (English) and 99.3% (Russian) on the Text Normalization Challenge by Richard Sproat and Navdeep Jaitly. To achieve comparable and objective results, we need to preprocess the data provided by Richard Sproat and Navdeep Jaitly at [https://github.com/rwsproat/text-normalization-data]. From the README of the dataset:\n", 16 | "```\n", 17 | "In practice for the results reported in the paper only the first 100,002 lines\n", 18 | "of output-00099-of-00100 were used (for English), and the first 100,007 lines of\n", 19 | "output-00099-of-00100 for Russian.\n", 20 | "```\n", 21 | "Hence, the 'output-00099-of-00100' file is extracted for further use. \n", 22 | "This notebook prepares the raw data for the next stage of normalization." 
23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Import Libraries" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "import numpy as np\n", 41 | "import pandas as pd\n", 42 | "import seaborn as sns\n", 43 | "import matplotlib.pyplot as plt\n", 44 | "\n", 45 | "%matplotlib inline" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Global Config\n", 53 | "**Language : English or Russian?**" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "lang = 'english'\n", 65 | "# lang = 'russian'" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "if lang == 'english':\n", 77 | " # input data\n", 78 | " data_directory = '../data/english/'\n", 79 | " data = 'output-00099-of-00100'\n", 80 | " # output\n", 81 | " out = 'output-00099-of-00100_processed.csv'\n", 82 | " # test size \n", 83 | " test_rows = 100002\n", 84 | " \n", 85 | "elif lang == 'russian':\n", 86 | " # input data\n", 87 | " data_directory = '../data/russian/'\n", 88 | " data = 'output-00099-of-00100'\n", 89 | " # output\n", 90 | " out = 'output-00099-of-00100_processed.csv'\n", 91 | " # test size\n", 92 | " test_rows = 100007" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Load Data" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "By default, Pandas treats double quote as enclosing an entry so it includes all tabs and newlines in that entry until it reaches the next quote. To escape it we need to have the quoting argument set to QUOTE_NONE or 3 as given in the documentation - [https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html]\n" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stdout", 116 | "output_type": "stream", 117 | "text": [ 118 | "\n", 119 | "RangeIndex: 100002 entries, 0 to 100001\n", 120 | "Data columns (total 3 columns):\n", 121 | "semiotic 100002 non-null object\n", 122 | "before 100002 non-null object\n", 123 | "after 92451 non-null object\n", 124 | "dtypes: object(3)\n", 125 | "memory usage: 2.3+ MB\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "raw_data = pd.read_csv(data_directory+data, nrows=test_rows,\n", 131 | " header=None, sep='\\t', quoting = 3,\n", 132 | " names=['semiotic', 'before', 'after'])\n", 133 | "raw_data.info()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/html": [ 144 | "
\n", 145 | "\n", 158 | "\n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | "
semioticbeforeafter
0PLAINIt<self>
1PLAINcan<self>
2PLAINbe<self>
3PLAINsummarized<self>
4PLAINas<self>
5PLAINan<self>
6PUNCT\"sil
7PLAINerror<self>
8PLAINdriven<self>
9PLAINtransformation<self>
\n", 230 | "
" 231 | ], 232 | "text/plain": [ 233 | " semiotic before after\n", 234 | "0 PLAIN It \n", 235 | "1 PLAIN can \n", 236 | "2 PLAIN be \n", 237 | "3 PLAIN summarized \n", 238 | "4 PLAIN as \n", 239 | "5 PLAIN an \n", 240 | "6 PUNCT \" sil\n", 241 | "7 PLAIN error \n", 242 | "8 PLAIN driven \n", 243 | "9 PLAIN transformation " 244 | ] 245 | }, 246 | "execution_count": 5, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "raw_data.head(10)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## Data Analysis" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "**What are the different type of semiotic classes available?**" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 6, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "data": { 276 | "text/plain": [ 277 | "PLAIN 67894\n", 278 | "PUNCT 17746\n", 279 | " 7551\n", 280 | "DATE 2832\n", 281 | "LETTERS 1409\n", 282 | "CARDINAL 1037\n", 283 | "VERBATIM 1001\n", 284 | "MEASURE 142\n", 285 | "ORDINAL 103\n", 286 | "DECIMAL 92\n", 287 | "ELECTRONIC 49\n", 288 | "DIGIT 44\n", 289 | "MONEY 37\n", 290 | "TELEPHONE 37\n", 291 | "FRACTION 16\n", 292 | "TIME 8\n", 293 | "ADDRESS 4\n", 294 | "Name: semiotic, dtype: int64" 295 | ] 296 | }, 297 | "execution_count": 6, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "raw_data['semiotic'].value_counts()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "The semiotic classes mentioned in the paper are:\n", 311 | "\n", 312 | "1. PLAIN\n", 313 | "2. PUNCT\n", 314 | "3. DATE\n", 315 | "4. TRANS\n", 316 | "5. LETTERS\n", 317 | "6. CARDINAL\n", 318 | "7. VERBATIM\n", 319 | "8. MEASURE\n", 320 | "9. ORDINAL\n", 321 | "10. DECIMAL\n", 322 | "11. ELECTRONIC\n", 323 | "12. DIGIT\n", 324 | "13. MONEY\n", 325 | "14. FRACTION\n", 326 | "15. 
TIME\n" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "## Data Preprocessing" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "**Generating sentence and word token ids**" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "metadata": {}, 346 | "source": [ 347 | "Our text normalization approach requires sentence and token ids to encode and generate batches" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 7, 353 | "metadata": { 354 | "collapsed": true 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "# to avoid modifying something we are iterating over\n", 359 | "data = pd.DataFrame(columns=['sentence_id',\n", 360 | " 'token_id',\n", 361 | " 'semiotic',\n", 362 | " 'before',\n", 363 | " 'after'])\n", 364 | "# initialize columns and iterator\n", 365 | "sentence_id = 0\n", 366 | "token_id = -1" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 8, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "for row in raw_data.itertuples():\n", 378 | " # look for end of sentences\n", 379 | " if (row.semiotic == '' and row.before == ''):\n", 380 | " sentence_id += 1\n", 381 | " token_id = -1\n", 382 | " continue\n", 383 | " else:\n", 384 | " token_id += 1\n", 385 | " \n", 386 | " new_row = {'sentence_id': sentence_id,\n", 387 | " 'token_id': token_id,\n", 388 | " 'semiotic': row.semiotic,\n", 389 | " 'before': row.before,\n", 390 | " 'after': row.after}\n", 391 | " data = data.append(new_row, ignore_index=True) " 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 9, 397 | "metadata": {}, 398 | "outputs": [ 399 | { 400 | "data": { 401 | "text/html": [ 402 | "
\n", 403 | "\n", 416 | "\n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | "
sentence_idtoken_idsemioticbeforeafter
000PLAINIt<self>
101PLAINcan<self>
202PLAINbe<self>
303PLAINsummarized<self>
404PLAINas<self>
505PLAINan<self>
606PUNCT\"sil
707PLAINerror<self>
808PLAINdriven<self>
909PLAINtransformation<self>
\n", 510 | "
" 511 | ], 512 | "text/plain": [ 513 | " sentence_id token_id semiotic before after\n", 514 | "0 0 0 PLAIN It \n", 515 | "1 0 1 PLAIN can \n", 516 | "2 0 2 PLAIN be \n", 517 | "3 0 3 PLAIN summarized \n", 518 | "4 0 4 PLAIN as \n", 519 | "5 0 5 PLAIN an \n", 520 | "6 0 6 PUNCT \" sil\n", 521 | "7 0 7 PLAIN error \n", 522 | "8 0 8 PLAIN driven \n", 523 | "9 0 9 PLAIN transformation " 524 | ] 525 | }, 526 | "execution_count": 9, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "data.head(10)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "**Transforming 'after' tokens** \n", 540 | "From the above mentioned paper:\n", 541 | "```\n", 542 | "Semiotic class instances are verbalized as sequences\n", 543 | "of fully spelled words, most ordinary words are left alone (rep-\n", 544 | "resented here as ), and punctuation symbols are mostly\n", 545 | "transduced to sil (for “silence”).\n", 546 | "```\n", 547 | "Hence we transform as follows:\n", 548 | "1. sil is replaced with < self >\n", 549 | "2. < self > is replaced with the before column\n" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 10, 555 | "metadata": { 556 | "collapsed": true 557 | }, 558 | "outputs": [], 559 | "source": [ 560 | "sil_mask = (data['after'] == 'sil')\n", 561 | "data.loc[sil_mask, 'after'] = '' " 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 11, 567 | "metadata": { 568 | "collapsed": true 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "self_mask = (data['after'] == '')\n", 573 | "data.loc[self_mask, ('after')] = data.loc[self_mask, 'before']" 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": {}, 579 | "source": [ 580 | "Sanity Check..." 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 12, 586 | "metadata": {}, 587 | "outputs": [ 588 | { 589 | "data": { 590 | "text/html": [ 591 | "
\n", 592 | "\n", 605 | "\n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
sentence_idtoken_idsemioticbeforeafter
2760422551PUNCT::
2347218863PUNCT::
33683277515PUNCT,,
6972357274PUNCT,,
74352609311PUNCT..
\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " sentence_id token_id semiotic before after\n", 663 | "27604 2255 1 PUNCT : :\n", 664 | "23472 1886 3 PUNCT : :\n", 665 | "33683 2775 15 PUNCT , ,\n", 666 | "69723 5727 4 PUNCT , ,\n", 667 | "74352 6093 11 PUNCT . ." 668 | ] 669 | }, 670 | "execution_count": 12, 671 | "metadata": {}, 672 | "output_type": "execute_result" 673 | } 674 | ], 675 | "source": [ 676 | "data[sil_mask].sample(5)" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 13, 682 | "metadata": {}, 683 | "outputs": [ 684 | { 685 | "data": { 686 | "text/html": [ 687 | "
\n", 688 | "\n", 701 | "\n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | "
sentence_idtoken_idsemioticbeforeafter
27460224211PUNCT..
95517595PLAINthethe
77947638111PLAINfarfar
44123487PLAINinin
4204634277PLAINTakayamaTakayama
\n", 755 | "
" 756 | ], 757 | "text/plain": [ 758 | " sentence_id token_id semiotic before after\n", 759 | "27460 2242 11 PUNCT . .\n", 760 | "9551 759 5 PLAIN the the\n", 761 | "77947 6381 11 PLAIN far far\n", 762 | "4412 348 7 PLAIN in in\n", 763 | "42046 3427 7 PLAIN Takayama Takayama" 764 | ] 765 | }, 766 | "execution_count": 13, 767 | "metadata": {}, 768 | "output_type": "execute_result" 769 | } 770 | ], 771 | "source": [ 772 | "data[self_mask].sample(5)" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "## Exporting Data" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 14, 785 | "metadata": {}, 786 | "outputs": [ 787 | { 788 | "data": { 789 | "text/html": [ 790 | "
\n", 791 | "\n", 804 | "\n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | "
sentence_idtoken_idsemioticbeforeafter
3020PLAINSheShe
3121PLAINthenthen
3222PLAINcompelledcompelled
3323PLAINherher
3424PLAINtenantstenants
3525PLAINtoto
3626PLAINlevellevel
3727PLAINthethe
3828PLAINRoyalistRoyalist
3929PLAINsiegesiege
\n", 898 | "
" 899 | ], 900 | "text/plain": [ 901 | " sentence_id token_id semiotic before after\n", 902 | "30 2 0 PLAIN She She\n", 903 | "31 2 1 PLAIN then then\n", 904 | "32 2 2 PLAIN compelled compelled\n", 905 | "33 2 3 PLAIN her her\n", 906 | "34 2 4 PLAIN tenants tenants\n", 907 | "35 2 5 PLAIN to to\n", 908 | "36 2 6 PLAIN level level\n", 909 | "37 2 7 PLAIN the the\n", 910 | "38 2 8 PLAIN Royalist Royalist\n", 911 | "39 2 9 PLAIN siege siege" 912 | ] 913 | }, 914 | "execution_count": 14, 915 | "metadata": {}, 916 | "output_type": "execute_result" 917 | } 918 | ], 919 | "source": [ 920 | "data[30:40]" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": 15, 926 | "metadata": { 927 | "collapsed": true 928 | }, 929 | "outputs": [], 930 | "source": [ 931 | "data.to_csv(data_directory+out, index=False)" 932 | ] 933 | }, 934 | { 935 | "cell_type": "markdown", 936 | "metadata": {}, 937 | "source": [ 938 | "___" 939 | ] 940 | } 941 | ], 942 | "metadata": { 943 | "kernelspec": { 944 | "display_name": "Python 3", 945 | "language": "python", 946 | "name": "python3" 947 | }, 948 | "language_info": { 949 | "codemirror_mode": { 950 | "name": "ipython", 951 | "version": 3 952 | }, 953 | "file_extension": ".py", 954 | "mimetype": "text/x-python", 955 | "name": "python", 956 | "nbconvert_exporter": "python", 957 | "pygments_lexer": "ipython3", 958 | "version": "3.6.5" 959 | } 960 | }, 961 | "nbformat": 4, 962 | "nbformat_minor": 2 963 | } 964 | -------------------------------------------------------------------------------- /results/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/results/.gitkeep -------------------------------------------------------------------------------- /results/base-paper_classwise_accuracy.csv: -------------------------------------------------------------------------------- 1 | Semiotic Class, En Count, En Accuracy, Ru Count, Ru Accuracy 2 | ALL,92416,0.997,93184,0.993 3 | PLAIN,68029,0.998,60747,0.999 4 | PUNCT,17726,1.000,20263,1.000 5 | DATE,2808,0.999,1495,0.976 6 | TRANS,nan,nan,4103,0.921 7 | LETTERS,1404,0.971,1839,0.991 8 | CARDINAL,1067,0.989,2387,0.940 9 | VERBATIM,894,0.980,1298,1.000 10 | MEASURE,142,0.986,409,0.883 11 | ORDINAL,103,0.971,427,0.956 12 | DECIMAL,89,1.000,60,0.867 13 | ELECTRONIC,21,1.000,2,1.000 14 | DIGIT,37,0.865,16,1.000 15 | MONEY,36,0.972,19,0.842 16 | FRACTION,13,0.923,23,0.826 17 | TIME,8,0.750,8,0.750 18 | -------------------------------------------------------------------------------- /results/english/Semiotic_Class-wise_Accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/results/english/Semiotic_Class-wise_Accuracy.png -------------------------------------------------------------------------------- /results/english/classwise_accuracy.csv: -------------------------------------------------------------------------------- 1 | semiotic-class,accuracy,count,correct 2 | ALL,0.994267233453397,92451,91921 3 | ADDRESS,1.0,4,4 4 | CARDINAL,0.9942140790742526,1037,1031 5 | DATE,0.9971751412429378,2832,2824 6 | DECIMAL,0.9891304347826086,92,91 7 | DIGIT,0.7954545454545454,44,35 8 | ELECTRONIC,0.7346938775510204,49,36 9 | FRACTION,0.6875,16,11 10 | LETTERS,0.971611071682044,1409,1369 11 | MEASURE,0.971830985915493,142,138 12 | 
MONEY,0.972972972972973,37,36 13 | ORDINAL,0.9805825242718447,103,101 14 | PLAIN,0.9939611747724394,67894,67484 15 | PUNCT,0.9988729854615125,17746,17726 16 | TELEPHONE,0.918918918918919,37,34 17 | TIME,0.75,8,6 18 | VERBATIM,0.994005994005994,1001,995 19 | -------------------------------------------------------------------------------- /results/english/mistakes.csv: -------------------------------------------------------------------------------- 1 | sentence_id,token_id,semiotic,before,class,after,truth 2 | 59,0,PLAIN,UPA,ToBeNormalized,u p a,UPA 3 | 66,6,PLAIN,SEO,ToBeNormalized,s e o,SEO 4 | 68,0,PLAIN,INSIGHT,ToBeNormalized,i n s i g h t,INSIGHT 5 | 80,2,PLAIN,WA,ToBeNormalized,w a,WA 6 | 80,4,PLAIN,SPIE,ToBeNormalized,s p i e,SPIE 7 | 90,0,PLAIN,LiPSA,ToBeNormalized,l i p s a,LiPSA 8 | 94,20,PLAIN,ROD,ToBeNormalized,r o d,ROD 9 | 100,2,PLAIN,LEGAL,ToBeNormalized,l e g a l,LEGAL 10 | 100,3,PLAIN,ASSISTANCE,ToBeNormalized,a s s i s t a n c e,ASSISTANCE 11 | 100,4,PLAIN,CENTRE,ToBeNormalized,c e n t r e,center 12 | 100,6,PLAIN,LAC,ToBeNormalized,l a c,LAC 13 | 156,3,MEASURE,0.001251 g/cm3,ToBeNormalized,zero point o o one two five one sil g per hour,zero point o o one two five one grams per c c 14 | 158,5,LETTERS,V,ToBeNormalized,five,V 15 | 184,24,PLAIN,doi,ToBeNormalized,d o i,doi 16 | 184,28,CARDINAL,14356007,ToBeNormalized,one million four hundred thirty five thousand six hundred seven,fourteen million three hundred fifty six thousand seven 17 | 191,3,PLAIN,RENAMO,ToBeNormalized,r e n a m o,RENAMO 18 | 205,12,CARDINAL,1572225424,ToBeNormalized,one billion five hundred seventy two million two hundred twenty two thousand four hundred twenty four,one billion five hundred seventy two million two hundred twenty five thousand four hundred twenty four 19 | 239,0,LETTERS,Acee,RemainSelf,Acee,a c e e 20 | 243,5,PLAIN,GUS,ToBeNormalized,g u s,GUS 21 | 253,4,PLAIN,DOS,ToBeNormalized,d o s,DOS 22 | 297,8,LETTERS,Vit,RemainSelf,Vit,v i t 23 | 303,15,PLAIN,authorisation,ToBeNormalized,t,authorization 24 | 309,0,FRACTION,1/0,ToBeNormalized,one meter,one zeroth 25 | 316,3,PLAIN,ski,ToBeNormalized,s k i,ski 26 | 349,8,PLAIN,Fei,ToBeNormalized,f e i,Fei 27 | 366,10,PLAIN,JA,ToBeNormalized,j a,JA 28 | 378,10,PLAIN,ser,ToBeNormalized,s e r,ser 29 | 455,3,FRACTION,"10/618,543",ToBeNormalized,ten sixteenth sixty one thousand five hundred forty three,ten six hundred eighteen thousand five hundred forty thirds 30 | 471,15,PLAIN,Up,ToBeNormalized,u p,Up 31 | 473,7,ELECTRONIC,#Selfie,ToBeNormalized,hash tag fourteen,hash tag selfie 32 | 490,7,VERBATIM,-,ToBeNormalized,to,- 33 | 507,0,PLAIN,BibliographyTranslations,ToBeNormalized,b i b l i o g r a p h y t r a n s,BibliographyTranslations 34 | 529,7,PLAIN,EE,ToBeNormalized,e e,EE 35 | 550,0,LETTERS,Ligi'ne,RemainSelf,Ligi'ne,l i g i n e 36 | 555,2,PLAIN,PROFILES,ToBeNormalized,p r o f i l e s,PROFILES 37 | 577,5,PLAIN,GUS,ToBeNormalized,g u s,GUS 38 | 593,15,PLAIN,ups,ToBeNormalized,u p s,ups 39 | 595,10,PLAIN,ANABlog,ToBeNormalized,a n a b l o g,ANABlog 40 | 626,7,PLAIN,ms,ToBeNormalized,m s,ms 41 | 627,8,PUNCT,-,ToBeNormalized,to,- 42 | 627,12,PUNCT,-,ToBeNormalized,to,- 43 | 633,13,PLAIN,ADUM,ToBeNormalized,a d u m,ADUM 44 | 636,15,PLAIN,pluralised,ToBeNormalized,popularized,pluralized 45 | 637,7,PLAIN,TRADOC,ToBeNormalized,t r a d o c,TRADOC 46 | 637,18,PLAIN,CARR,ToBeNormalized,c a r r,CARR 47 | 637,19,PLAIN,OPT,ToBeNormalized,o p t,OPT 48 | 659,0,PLAIN,MATCH,ToBeNormalized,m a t c h,MATCH 49 | 687,0,DIGIT,2007,ToBeNormalized,two thousand seven,two o o 
seven 50 | 694,11,PLAIN,MAR,ToBeNormalized,m a r,MAR 51 | 698,1,PLAIN,PIX,ToBeNormalized,p i x,PIX 52 | 701,1,PLAIN,WorldwideEastEndersExtrasGrownupsPlanet,ToBeNormalized,w_letter,WorldwideEastEndersExtrasGrownupsPlanet 53 | 704,0,PLAIN,Realising,RemainSelf,Realising,realizing 54 | 704,13,PLAIN,Crystalliser,RemainSelf,Crystalliser,crystallizer 55 | 711,0,PLAIN,LIT,ToBeNormalized,l i t,LIT 56 | 721,11,PLAIN,VI,ToBeNormalized,v i,VI 57 | 754,2,PLAIN,OCAMPO,ToBeNormalized,o c a m p o,OCAMPO 58 | 754,4,PLAIN,FROM,ToBeNormalized,f r o m,FROM 59 | 754,10,PLAIN,BUILDS,ToBeNormalized,b u i l d s,BUILDS 60 | 754,11,TELEPHONE,3-0 LEAD,ToBeNormalized,three sil o sil l e a d,three sil o sil lead 61 | 783,3,PLAIN,OF,ToBeNormalized,o f,OF 62 | 784,5,PLAIN,Ngoc,ToBeNormalized,n g o c,Ngoc 63 | 799,5,PLAIN,Maj,ToBeNormalized,m a j,Maj 64 | 802,10,PLAIN,RAF,ToBeNormalized,r a f,RAF 65 | 831,14,PLAIN,Sol,ToBeNormalized,s o l,Sol 66 | 846,6,PLAIN,CLIO,ToBeNormalized,c l i o,CLIO 67 | 864,1,PLAIN,BEEF,ToBeNormalized,b e e f,BEEF 68 | 918,7,PLAIN,ser,ToBeNormalized,s e r,ser 69 | 923,13,PLAIN,Lik,ToBeNormalized,l i k,Lik 70 | 936,5,PLAIN,CARD,ToBeNormalized,c a r d,CARD 71 | 942,1,PLAIN,Am's,ToBeNormalized,a m s's,Am's 72 | 942,4,DIGIT,314,ToBeNormalized,three hundred fourteen,three one four 73 | 964,18,PLAIN,mrs,ToBeNormalized,mister,mrs 74 | 998,6,PLAIN,Aud,ToBeNormalized,a u d,Aud 75 | 998,10,PLAIN,odd,ToBeNormalized,o d d,odd 76 | 1018,0,PLAIN,Ava,ToBeNormalized,a v a,Ava 77 | 1057,13,PLAIN,INRIA,ToBeNormalized,i n r i a,INRIA 78 | 1060,3,PLAIN,ASIA,ToBeNormalized,a s i a,ASIA 79 | 1071,15,PLAIN,Azo,ToBeNormalized,a z o,Azo 80 | 1079,21,PLAIN,BANYU,ToBeNormalized,b a n y u,BANYU 81 | 1091,4,LETTERS,Crkva,RemainSelf,Crkva,c r k v a 82 | 1092,5,PLAIN,HEW,ToBeNormalized,h e w,HEW 83 | 1101,0,PLAIN,CARE,ToBeNormalized,c a r e,CARE 84 | 1105,5,PLAIN,I,ToBeNormalized,one,I 85 | 1117,15,ELECTRONIC,//web.archive.org/web/20110105051516/http://www.fairfield.edu/x18852.html,ToBeNormalized,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter dot a_letter r_letter c_letter h_letter i_letter v_letter e_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter o_letter,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter dot a_letter r_letter c_letter h_letter i_letter v_letter e_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter n_letter 
e_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter _letter f_letter i_letter v_letter e_letter _letter o_letter _letter f_letter i_letter v_letter e_letter _letter o_letter n_letter e_letter _letter f_letter i_letter v_letter e_letter _letter o_letter n_letter e_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot f_letter a_letter i_letter r_letter f_letter i_letter e_letter l_letter d_letter dot e_letter _letter d_letter _letter u_letter _letter s_letter l_letter a_letter s_letter h_letter _letter x_letter _letter o_letter n_letter e_letter _letter e_letter i_letter g_letter h_letter t_letter _letter e_letter i_letter g_letter h_letter t_letter _letter f_letter i_letter v_letter e_letter _letter t_letter w_letter o_letter dot h_letter _letter t_letter _letter m_letter _letter l_letter 86 | 1121,10,PLAIN,Abba,ToBeNormalized,a b b a,Abba 87 | 1124,7,PLAIN,Yaesu,ToBeNormalized,y a e s u,Yaesu 88 | 1165,7,PLAIN,ser,ToBeNormalized,s e r,ser 89 | 1189,7,PLAIN,Oh,ToBeNormalized,o h,Oh 90 | 1193,9,PLAIN,centro,ToBeNormalized,center,centro 91 | 1203,2,PLAIN,CHARLES,ToBeNormalized,c h a r l e s,CHARLES 92 | 1203,4,PLAIN,MOODY,ToBeNormalized,m o o d y,MOODY 93 | 1211,20,ELECTRONIC,nethttp://www.pamirian.ru/Wakhi_language_transition.pdfAli,ToBeNormalized,n_letter e_letter t_letter h_letter t_letter t_letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot p_letter a_letter m_letter i_letter r_letter i_letter a_letter n_letter dot r_letter _letter u_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter a_letter k_letter h_letter i_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter a_letter r_letter e_letter n_letter t_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter a_letter r_letter e_letter n_letter t_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter a_letter r_letter e_letter n_letter t_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter a_letter r_letter e_letter n_letter t_letter _letter u_letter n_letter d_letter,n_letter _letter e_letter _letter t_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot p_letter a_letter m_letter i_letter r_letter i_letter a_letter n_letter dot r_letter _letter u_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter a_letter k_letter h_letter i_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter l_letter a_letter n_letter g_letter u_letter a_letter g_letter e_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter 
c_letter o_letter r_letter e_letter _letter t_letter r_letter a_letter n_letter s_letter i_letter t_letter i_letter o_letter n_letter dot p_letter d_letter f_letter a_letter l_letter i_letter 94 | 1243,13,PLAIN,IZMIR,ToBeNormalized,i z m i r,IZMIR 95 | 1252,6,PUNCT,-,ToBeNormalized,to,- 96 | 1289,2,ELECTRONIC,http://www.knchr.org/dmdocuments/KNCHR%20doc.pdfFollowing,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot k_letter _letter n_letter _letter c_letter _letter h_letter _letter r_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter d_letter _letter m_letter _letter d_letter o_letter c_letter u_letter m_letter e_letter n_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter k_letter _letter n_letter _letter c_letter _letter h_letter _letter r_letter _letter s_letter i_letter x_letter d_letter o_letter c_letter dot p_letter _letter d_letter _letter f_letter _letter f_letter o_letter l_letter l_letter o_letter n_letter i_letter n_letter g_letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot k_letter _letter n_letter _letter c_letter _letter h_letter _letter r_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter d_letter m_letter d_letter o_letter c_letter u_letter m_letter e_letter n_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter k_letter _letter n_letter _letter c_letter _letter h_letter _letter r_letter _letter p_letter e_letter r_letter c_letter e_letter n_letter t_letter _letter t_letter w_letter e_letter n_letter t_letter y_letter _letter d_letter o_letter c_letter dot p_letter d_letter f_letter f_letter o_letter l_letter l_letter o_letter w_letter i_letter n_letter g_letter 97 | 1302,8,PLAIN,Abor,ToBeNormalized,a b o r,Abor 98 | 1343,6,PLAIN,NeXTSTEP,ToBeNormalized,n e x t s t e p,NeXTSTEP 99 | 1357,10,PLAIN,CARICOM,ToBeNormalized,c a r i c o m,CARICOM 100 | 1369,0,PLAIN,CY,ToBeNormalized,c y,CY 101 | 1385,4,PLAIN,Bodour,ToBeNormalized,b,Bodour 102 | 1388,11,PLAIN,KHAD,ToBeNormalized,k h a d,KHAD 103 | 1410,9,PLAIN,DEFRA,ToBeNormalized,d e f r a,DEFRA 104 | 1411,15,LETTERS,subg,RemainSelf,subg,s u b g 105 | 1414,2,DIGIT,450,ToBeNormalized,four hundred fifty,four five o 106 | 1427,6,PLAIN,SM,ToBeNormalized,s m,SM 107 | 1439,11,PLAIN,odor,ToBeNormalized,o d o r,odor 108 | 1456,18,PLAIN,est,ToBeNormalized,e s t,est 109 | 1475,0,PLAIN,Ava,ToBeNormalized,a v a,Ava 110 | 1520,6,DATE,2017/,ToBeNormalized,two thousand seventeen,twenty seventeen 111 | 1520,7,DATE,2016,ToBeNormalized,two thousand sixteen,twenty sixteen 112 | 1584,3,PLAIN,SPOILERS,ToBeNormalized,s p o i l e r s,SPOILERS 113 | 1612,16,PLAIN,ski,ToBeNormalized,s k i,ski 114 | 1626,1,PLAIN,mrs,ToBeNormalized,mister,mrs 115 | 1631,6,PLAIN,vols,ToBeNormalized,v o l s,vols 116 | 1681,4,PLAIN,Chu,ToBeNormalized,c h u,Chu 117 | 1699,7,ELECTRONIC,http://www.yafc-ftp.com/The,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter 
l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot y_letter a_letter f_letter c_letter _letter d_letter a_letter s_letter h_letter _letter f_letter t_letter p_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter h_letter e_letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot y_letter _letter a_letter _letter f_letter _letter c_letter _letter d_letter a_letter s_letter h_letter _letter f_letter _letter t_letter _letter p_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter h_letter e_letter 118 | 1707,0,PLAIN,FAO,ToBeNormalized,f a o,FAO 119 | 1726,11,MONEY,88.5 million HRK,ToBeNormalized,eighty eight point five million yen,eighty eight point five million croatian kunas 120 | 1785,1,LETTERS,Oolaa,RemainSelf,Oolaa,o o l a a 121 | 1813,9,PLAIN,Koi,ToBeNormalized,k o i,Koi 122 | 1831,5,LETTERS,xSnxTe,RemainSelf,xSnxTe,x s n x t e 123 | 1892,5,PLAIN,CRI,ToBeNormalized,c r i,CRI 124 | 1900,0,PLAIN,CRI,ToBeNormalized,c r i,CRI 125 | 1901,0,PLAIN,APRA,ToBeNormalized,a p r a,APRA 126 | 1903,6,PLAIN,SnO2,ToBeNormalized,s n o two,tin four oxide 127 | 1925,0,PLAIN,CART,ToBeNormalized,c a r t,CART 128 | 1925,12,PLAIN,CART,ToBeNormalized,c a r t,CART 129 | 1940,8,PLAIN,CRIs,ToBeNormalized,c r i's,CRIs 130 | 1967,0,LETTERS,Ilir,RemainSelf,Ilir,i l i r 131 | 1983,17,LETTERS,anth,RemainSelf,anth,a n t h 132 | 2027,2,PLAIN,obra,ToBeNormalized,o b r a,obra 133 | 2033,2,PLAIN,Mac's,ToBeNormalized,m a c's,Mac's 134 | 2109,3,PLAIN,millimetre,RemainSelf,millimetre,millimeter 135 | 2127,8,PLAIN,Ava,ToBeNormalized,a v a,Ava 136 | 2132,5,PLAIN,mrs,ToBeNormalized,m r s,mrs 137 | 2153,17,PLAIN,Stem,ToBeNormalized,s t e m,Stem 138 | 2153,19,PLAIN,ells,ToBeNormalized,e l l s,ells 139 | 2154,10,PLAIN,SAT,ToBeNormalized,s a t,SAT 140 | 2155,5,PLAIN,RY,ToBeNormalized,r y,RY 141 | 2163,12,PLAIN,Sept,ToBeNormalized,s e p t,Sept 142 | 2186,16,PLAIN,I'd,ToBeNormalized,i d,I'd 143 | 2191,2,PLAIN,ABS,ToBeNormalized,a b s,ABS 144 | 2203,1,PLAIN,THE,ToBeNormalized,t h e,THE 145 | 2203,3,PLAIN,BEHIND,ToBeNormalized,b e h i n d,BEHIND 146 | 2203,4,PLAIN,AVAAZ,ToBeNormalized,a v a a z,AVAAZ 147 | 2270,7,DIGIT,21770,ToBeNormalized,twenty one thousand seven hundred seventy,two one seven seven o 148 | 2274,1,PLAIN,WO,ToBeNormalized,w o,WO 149 | 2274,3,FRACTION,2006/118205,ToBeNormalized,two thousand six one thousand eight hundred fifths,two thousand six one hundred eighteen thousand two hundred fifths 150 | 2275,1,PUNCT,-,ToBeNormalized,to,- 151 | 2298,22,PLAIN,Oh,ToBeNormalized,o h,Oh 152 | 2320,12,PLAIN,ms,ToBeNormalized,m s,ms 153 | 2325,5,PLAIN,GLUT,ToBeNormalized,g l u t,GLUT 154 | 2344,5,PLAIN,APRA,ToBeNormalized,a p r a,APRA 155 | 2373,12,PLAIN,Bok,ToBeNormalized,b o k,Bok 156 | 2398,4,PUNCT,:,ToBeNormalized,to,: 157 | 2402,10,PLAIN,POW,ToBeNormalized,p o w,POW 158 | 2402,12,PLAIN,MIA,ToBeNormalized,m i a,MIA 159 | 2442,17,PLAIN,NOW,ToBeNormalized,n o w,NOW 160 | 2454,4,VERBATIM,-,ToBeNormalized,to,- 161 | 2466,1,PLAIN,YOU,ToBeNormalized,y o u,YOU 162 | 2505,1,PUNCT,-,ToBeNormalized,to,- 163 | 2505,3,VERBATIM,-,ToBeNormalized,to,- 164 | 2523,5,PLAIN,Of,ToBeNormalized,o f,Of 165 | 
2528,3,PLAIN,RAFT,ToBeNormalized,r a f t,RAFT 166 | 2536,9,PLAIN,polarisation,ToBeNormalized,globalization,polarization 167 | 2542,1,PLAIN,ARTHUR,ToBeNormalized,a r t h u r,ARTHUR 168 | 2551,3,PLAIN,ES,ToBeNormalized,e s,ES 169 | 2566,9,PLAIN,catalysed,ToBeNormalized,catalogs,catalysed 170 | 2582,8,PLAIN,EVA,ToBeNormalized,e v a,EVA 171 | 2593,0,LETTERS,Akl,RemainSelf,Akl,a k l 172 | 2631,4,PLAIN,Spa,ToBeNormalized,s p a,Spa 173 | 2635,5,LETTERS,I,ToBeNormalized,one,I 174 | 2649,3,PLAIN,Ozat,ToBeNormalized,o z a t,Ozat 175 | 2650,15,PLAIN,AMS,ToBeNormalized,a m s,AMS 176 | 2697,5,PLAIN,AND,ToBeNormalized,a n d,AND 177 | 2701,1,ELECTRONIC,http://www.business-humanrights.org/Links/Repository/308254/,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot b_letter u_letter s_letter i_letter n_letter e_letter s_letter s_letter _letter d_letter a_letter s_letter h_letter _letter h_letter u_letter m_letter a_letter n_letter r_letter i_letter g_letter h_letter t_letter s_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter l_letter i_letter n_letter k_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter r_letter e_letter p_letter o_letter s_letter i_letter t_letter o_letter r_letter y_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter h_letter r_letter e_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot b_letter u_letter s_letter i_letter n_letter e_letter s_letter s_letter _letter d_letter a_letter s_letter h_letter _letter h_letter u_letter m_letter a_letter n_letter r_letter i_letter g_letter h_letter t_letter s_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter l_letter i_letter n_letter k_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter r_letter e_letter p_letter o_letter s_letter i_letter t_letter o_letter r_letter y_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter h_letter r_letter e_letter e_letter _letter o_letter _letter e_letter i_letter g_letter h_letter t_letter _letter t_letter w_letter o_letter _letter f_letter i_letter v_letter e_letter _letter f_letter o_letter u_letter r_letter _letter s_letter l_letter a_letter s_letter h_letter 178 | 2747,0,PLAIN,Izadi,ToBeNormalized,i z a d i,Izadi 179 | 2764,1,PLAIN,aBontodBuenasuerteIntusanJose,ToBeNormalized,a_letter b o n t o d b u e n a s u e r t e i n t u s a n j o s e,aBontodBuenasuerteIntusanJose 180 | 2781,9,PUNCT,-,ToBeNormalized,to,- 181 | 2784,3,PLAIN,Luh,ToBeNormalized,l u h,Luh 182 | 2816,6,PLAIN,von,ToBeNormalized,v o n,von 183 | 2816,16,TELEPHONE,40 (1969) 111-124,ToBeNormalized,four o sil one nine six nine sil one four sil one two sil one four,four o sil one nine six nine sil one one one sil one two four 184 | 2880,2,PLAIN,Suzi,ToBeNormalized,s u z i,Suzi 185 | 2891,9,PLAIN,I,ToBeNormalized,the first,I 186 | 2906,11,PLAIN,ACS,ToBeNormalized,a c s,ACS 187 | 2949,6,PLAIN,synagogues,ToBeNormalized,synagog,synagogues 
188 | 2979,7,PLAIN,SA,ToBeNormalized,s a,SA 189 | 2979,16,PLAIN,SA,ToBeNormalized,s a,SA 190 | 2981,9,PLAIN,NURBS,ToBeNormalized,n u r b s,NURBS 191 | 3024,13,DIGIT,11,ToBeNormalized,eleven,one one 192 | 3026,11,PLAIN,FAN,ToBeNormalized,f a n,FAN 193 | 3043,15,PLAIN,civilise,RemainSelf,civilise,civilize 194 | 3055,8,PLAIN,oak,ToBeNormalized,o a k,oak 195 | 3060,5,PLAIN,ASTRO,ToBeNormalized,a s t r o,ASTRO 196 | 3085,3,PLAIN,Pi,ToBeNormalized,p i,Pi 197 | 3120,7,PLAIN,LEED,ToBeNormalized,l e e d,LEED 198 | 3125,9,PLAIN,Eslov,ToBeNormalized,e s l o v,Eslov 199 | 3132,10,PUNCT,-,ToBeNormalized,to,- 200 | 3157,24,PLAIN,Sept,ToBeNormalized,s e p t,Sept 201 | 3162,1,ELECTRONIC,informationhttp://dynamic.stlouis-mo.gov/census/neighborhood.cfmhttp://dynamic.stlouis-mo.gov/census/neigh_comp.cfm,ToBeNormalized,i_letter n_letter f_letter o_letter r_letter m_letter a_letter t_letter i_letter o_letter n_letter h_letter t_letter t_letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter n_letter g_letter h_letter i_letter c_letter r_letter h_letter o_letter o_letter d_letter dot c_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter i_letter g_letter h_letter i_letter c_letter r_letter h_letter o_letter o_letter d_letter dot c_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter i_letter g_letter h_letter i_letter c_letter o_letter m_letter p_letter dot c_letter _letter f_letter _letter m_letter,i_letter _letter n_letter _letter f_letter _letter o_letter _letter r_letter _letter m_letter _letter a_letter _letter t_letter _letter i_letter _letter o_letter _letter n_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter d_letter y_letter n_letter a_letter m_letter i_letter c_letter dot s_letter t_letter l_letter o_letter u_letter i_letter s_letter _letter d_letter a_letter s_letter h_letter _letter m_letter o_letter dot g_letter o_letter v_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter e_letter i_letter g_letter h_letter b_letter o_letter r_letter h_letter o_letter o_letter d_letter dot c_letter _letter f_letter _letter m_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter d_letter y_letter n_letter a_letter m_letter i_letter c_letter dot s_letter t_letter l_letter o_letter u_letter i_letter s_letter _letter d_letter a_letter s_letter h_letter _letter m_letter o_letter dot g_letter o_letter v_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter e_letter n_letter s_letter u_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter 
_letter n_letter e_letter i_letter g_letter h_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter c_letter o_letter m_letter p_letter dot c_letter _letter f_letter _letter m_letter 202 | 3166,3,PLAIN,Eva,ToBeNormalized,e v a,Eva 203 | 3167,6,PLAIN,evil,ToBeNormalized,e v i l,evil 204 | 3246,4,PLAIN,valour,ToBeNormalized,vigor,valour 205 | 3268,5,PLAIN,riA,ToBeNormalized,r i a,riA 206 | 3268,8,PLAIN,Cha,ToBeNormalized,c h a,Cha 207 | 3276,1,PLAIN,FUKUHARA,ToBeNormalized,f u k u h a r a,FUKUHARA 208 | 3284,1,LETTERS,Var,RemainSelf,Var,v a r 209 | 3285,0,PLAIN,FOCUS,ToBeNormalized,f o c u s,FOCUS 210 | 3291,15,PLAIN,ESS,ToBeNormalized,e s s,ESS 211 | 3302,1,PLAIN,Ur,ToBeNormalized,u r,Ur 212 | 3311,8,CARDINAL,X,RemainSelf,X,ten 213 | 3337,8,PLAIN,OUTSTANDING,ToBeNormalized,o u t s t a n d i n g,OUTSTANDING 214 | 3337,10,PLAIN,IN,ToBeNormalized,i n,IN 215 | 3337,13,PLAIN,VARIETY,ToBeNormalized,v a r i e t y,VARIETY 216 | 3337,15,PLAIN,MUSIC,ToBeNormalized,m u s i c,MUSIC 217 | 3340,9,PLAIN,AN,ToBeNormalized,a n,AN 218 | 3340,18,PLAIN,RA,ToBeNormalized,r a,RA 219 | 3343,5,PLAIN,MAR,ToBeNormalized,m a r,MAR 220 | 3347,5,MEASURE,1/2 cc,ToBeNormalized,one _letter d_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter,half a c c 221 | 3347,8,PLAIN,ULTRAFINE,ToBeNormalized,u l t r a f i n e,ULTRAFINE 222 | 3347,14,PLAIN,UltraFine,ToBeNormalized,favorite,UltraFine 223 | 3347,16,PLAIN,SHORT,ToBeNormalized,s h o r t,SHORT 224 | 3347,22,PLAIN,MICROFINE,ToBeNormalized,m i c r o f i n e,MICROFINE 225 | 3350,4,PLAIN,FIN,ToBeNormalized,f i n,FIN 226 | 3352,2,PLAIN,Aja,ToBeNormalized,a j a,Aja 227 | 3380,15,PLAIN,SEZ,ToBeNormalized,s e z,SEZ 228 | 3407,7,PLAIN,so,ToBeNormalized,s o,so 229 | 3415,2,PLAIN,No,ToBeNormalized,number,No 230 | 3415,5,PLAIN,Ads,ToBeNormalized,a d s,Ads 231 | 3505,1,PLAIN,TeX,ToBeNormalized,t e x,TeX 232 | 3519,21,PLAIN,Am,ToBeNormalized,a m,Am 233 | 3549,6,PLAIN,UEFA,ToBeNormalized,u e f a,UEFA 234 | 3555,9,VERBATIM,-,ToBeNormalized,to,- 235 | 3562,5,ELECTRONIC,//www.mediacorp.sg/corporate-en/corporatehttp://www.ofcom.org.uk/static/archive/itc/itc_publications/codes_guidance/programme_code/section_4.asp.html,ToBeNormalized,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter r_letter t_letter o_letter r_letter a_letter _letter d_letter a_letter s_letter h_letter _letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter r_letter p_letter o_letter r_letter a_letter t_letter e_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter a_letter r_letter c_letter h_letter i_letter v_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter i_letter t_letter c_letter h_letter i_letter t_letter c_letter u_letter p_letter u_letter r_letter t_letter i_letter c_letter a_letter t_letter i_letter o_letter n_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter i_letter t_letter,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot m_letter e_letter 
d_letter i_letter a_letter c_letter o_letter r_letter p_letter dot s_letter _letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter r_letter p_letter o_letter r_letter a_letter t_letter e_letter _letter d_letter a_letter s_letter h_letter _letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter r_letter p_letter o_letter r_letter a_letter t_letter e_letter h_letter t_letter t_letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot o_letter f_letter c_letter o_letter m_letter dot o_letter r_letter g_letter dot u_letter k_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter t_letter a_letter t_letter i_letter c_letter _letter s_letter l_letter a_letter s_letter h_letter _letter a_letter r_letter c_letter h_letter i_letter v_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter i_letter t_letter c_letter _letter s_letter l_letter a_letter s_letter h_letter _letter i_letter t_letter c_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter p_letter u_letter b_letter l_letter i_letter c_letter a_letter t_letter i_letter o_letter n_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter c_letter o_letter d_letter e_letter s_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter g_letter u_letter i_letter d_letter a_letter n_letter c_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter g_letter r_letter a_letter m_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter c_letter o_letter d_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter e_letter c_letter t_letter i_letter o_letter n_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter f_letter o_letter u_letter r_letter dot a_letter _letter s_letter _letter p_letter dot h_letter _letter t_letter _letter m_letter _letter l_letter 236 | 3575,20,PLAIN,AIDS,ToBeNormalized,a i d s,AIDS 237 | 3601,2,PLAIN,DARPA,ToBeNormalized,d a r p a,DARPA 238 | 3601,10,PLAIN,programmed,ToBeNormalized,program,programmed 239 | 3603,1,PLAIN,CAST,ToBeNormalized,c a s t,CAST 240 | 3659,10,PLAIN,Suwo,ToBeNormalized,s u w o,Suwo 241 | 3661,6,PLAIN,EPs,ToBeNormalized,e p's,EPs 242 | 3669,8,LETTERS,Uy,RemainSelf,Uy,u y 243 | 3676,6,PUNCT,-,ToBeNormalized,to,- 244 | 3678,6,PLAIN,Idol,ToBeNormalized,i d o l,Idol 245 | 3736,2,PLAIN,MGR,ToBeNormalized,m g r,MGR 246 | 3738,5,PLAIN,Chi,ToBeNormalized,c h i,Chi 247 | 3749,13,PLAIN,VI,ToBeNormalized,the sixth,VI 248 | 3789,8,PUNCT,-,ToBeNormalized,to,- 249 | 3790,0,PLAIN,THE,ToBeNormalized,t h e,THE 250 | 3808,0,PLAIN,PLoS,ToBeNormalized,p l o s,PLoS 251 | 3907,0,PLAIN,Programmed,ToBeNormalized,program,Programmed 252 | 3920,7,PLAIN,BOO,ToBeNormalized,b o o,BOO 253 | 3951,2,DIGIT,126,ToBeNormalized,one hundred twenty six,one two six 254 | 3952,6,PLAIN,Xtra,ToBeNormalized,x t r a,Xtra 255 | 3961,8,PLAIN,rev,ToBeNormalized,r e v,rev 256 | 3965,3,PLAIN,MARATHON,ToBeNormalized,m a r a t h o n,MARATHON 257 | 3965,4,PLAIN,CUP,ToBeNormalized,c u p,CUP 258 | 3976,8,PLAIN,Sept,ToBeNormalized,s e p t,Sept 
259 | 3984,21,PLAIN,stop,ToBeNormalized,s t o p,stop 260 | 3992,9,LETTERS,Aam,RemainSelf,Aam,a a m 261 | 4027,20,PUNCT,-,ToBeNormalized,to,- 262 | 4034,1,PLAIN,COMPANY,ToBeNormalized,c o m p a n y,COMPANY 263 | 4039,18,MEASURE,295 ch,ToBeNormalized,two hundred ninety five hours,two hundred ninety five chains 264 | 4048,0,PLAIN,CAT,ToBeNormalized,c a t,CAT 265 | 4079,8,PLAIN,XI,ToBeNormalized,eleven,XI 266 | 4085,8,VERBATIM,Θ,ToBeNormalized,eta,theta 267 | 4102,1,PLAIN,SM,ToBeNormalized,s m,SM 268 | 4111,5,PLAIN,tri,ToBeNormalized,t r i,tri 269 | 4126,5,PLAIN,SEC,ToBeNormalized,s e c,SEC 270 | 4136,3,ELECTRONIC,http://www.jstor.org/stable/2799027;,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot j_letter _letter s_letter t_letter o_letter r_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter t_letter a_letter b_letter l_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter v_letter e_letter n_letter ,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot j_letter _letter s_letter t_letter o_letter r_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter t_letter a_letter b_letter l_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter n_letter i_letter n_letter e_letter _letter n_letter i_letter n_letter e_letter _letter o_letter _letter t_letter w_letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter e_letter m_letter i_letter c_letter o_letter l_letter o_letter n_letter 271 | 4139,21,PLAIN,INTERVIEW,ToBeNormalized,i n t e r v i e w,INTERVIEW 272 | 4162,5,PLAIN,GUS,ToBeNormalized,g u s,GUS 273 | 4171,18,PUNCT,-,ToBeNormalized,to,- 274 | 4172,2,PUNCT,-,ToBeNormalized,to,- 275 | 4178,5,PLAIN,VEVO,ToBeNormalized,v e v o,VEVO 276 | 4189,6,TELEPHONE,1999-2000 BCA,ToBeNormalized,one nine nine nine sil two hundred sil b c a,one nine nine nine sil two thousand sil b c a 277 | 4195,1,PLAIN,Dilweg,RemainSelf,Dilweg,dil weg 278 | 4215,8,PLAIN,Li,ToBeNormalized,l i,Li 279 | 4235,1,PLAIN,Huba,ToBeNormalized,h u b a,Huba 280 | 4244,2,PLAIN,DIN,ToBeNormalized,d i n,DIN 281 | 4244,12,MEASURE,2 mA,ToBeNormalized,two a m,two milli amperes 282 | 4252,2,PLAIN,IRA,ToBeNormalized,i r a,IRA 283 | 4272,0,PLAIN,DIC,ToBeNormalized,d i c,DIC 284 | 4273,12,PLAIN,VIDEO,ToBeNormalized,v i d o i,VIDEO 285 | 4276,11,PLAIN,Ajaji,ToBeNormalized,a j a j i,Ajaji 286 | 
4279,5,PLAIN,Rep,ToBeNormalized,r e p,Rep 287 | 4307,2,PLAIN,HOWZE,ToBeNormalized,h o w z e,HOWZE 288 | 4343,7,PLAIN,CAZy,ToBeNormalized,c a z y,CAZy 289 | 4365,7,PLAIN,Esma,ToBeNormalized,e s m a,Esma 290 | 4400,23,PLAIN,CEA,ToBeNormalized,c e a,CEA 291 | 4457,3,PLAIN,Isla,ToBeNormalized,i s l a,Isla 292 | 4458,3,PLAIN,SAT,ToBeNormalized,s a t,SAT 293 | 4480,9,PLAIN,pro,ToBeNormalized,p r o,pro 294 | 4480,14,LETTERS,Nea,RemainSelf,Nea,n e a 295 | 4500,5,PLAIN,BY,ToBeNormalized,b y,BY 296 | 4500,11,PLAIN,I,ToBeNormalized,the first,I 297 | 4520,4,PLAIN,Esma,ToBeNormalized,e s m a,Esma 298 | 4521,0,PLAIN,CURRICULUM,ToBeNormalized,c u r r i c u l u m,CURRICULUM 299 | 4536,3,PLAIN,Esma,ToBeNormalized,e s m a,Esma 300 | 4550,5,PLAIN,SA,ToBeNormalized,s a,SA 301 | 4565,5,PLAIN,UNESCO,ToBeNormalized,u n e s c o,UNESCO 302 | 4572,1,PUNCT,-,ToBeNormalized,to,- 303 | 4600,1,PLAIN,Viz,ToBeNormalized,v i z,Viz 304 | 4601,2,ELECTRONIC,www.cdc.gov/HealthyYouth/shpps/2006/factsheets/pdf/FS_Overview_SHPPS2006.pdf,ToBeNormalized,w_letter _letter w_letter _letter w_letter dot c_letter d_letter c_letter dot g_letter o_letter v_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter e_letter a_letter l_letter t_letter h_letter y_letter y_letter o_letter u_letter t_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter o_letter _letter o_letter _letter p_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter a_letter c_letter t_letter s_letter h_letter e_letter e_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter _letter d_letter _letter f_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter _letter s_letter _letter t_letter h_letter r_letter e_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter o_letter u_letter r_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter o_letter r_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter,w_letter _letter w_letter _letter w_letter dot c_letter _letter d_letter _letter c_letter dot g_letter o_letter v_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter e_letter a_letter l_letter t_letter h_letter y_letter y_letter o_letter u_letter t_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter _letter h_letter _letter p_letter _letter p_letter _letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter a_letter c_letter t_letter s_letter h_letter e_letter e_letter t_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter _letter d_letter _letter f_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter _letter s_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter o_letter v_letter e_letter r_letter v_letter i_letter e_letter w_letter _letter u_letter n_letter d_letter e_letter r_letter s_letter c_letter o_letter r_letter e_letter _letter s_letter _letter h_letter _letter p_letter _letter p_letter _letter s_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter _letter s_letter i_letter x_letter dot p_letter _letter d_letter _letter f_letter 305 | 
4614,9,PLAIN,FEBRUARY,ToBeNormalized,f e b r u a r y,FEBRUARY 306 | 4618,2,PLAIN,stylised,RemainSelf,stylised,stylized 307 | 4643,16,PLAIN,V,ToBeNormalized,the fifth,V 308 | 4651,16,PLAIN,IA,ToBeNormalized,i a,IA 309 | 4661,8,PLAIN,RITA,ToBeNormalized,r i t a,RITA 310 | 4677,18,LETTERS,Lun,RemainSelf,Lun,l u n 311 | 4680,7,PLAIN,V,ToBeNormalized,the fifth,V 312 | 4697,3,PLAIN,SEED,ToBeNormalized,s e e d,SEED 313 | 4704,1,PLAIN,NA's,ToBeNormalized,n a's,NA's 314 | 4733,3,PLAIN,Us,ToBeNormalized,u s,Us 315 | 4736,18,PLAIN,programmed,ToBeNormalized,program,programmed 316 | 4748,15,PLAIN,PCs,ToBeNormalized,p c's,PCs 317 | 4766,2,PLAIN,GUJARAT,ToBeNormalized,g u j a r a t,GUJARAT 318 | 4766,3,PLAIN,LEGISLATIVE,ToBeNormalized,l e g i s l a t i v e,LEGISLATIVE 319 | 4784,3,PLAIN,col,ToBeNormalized,colonel,col 320 | 4816,4,PLAIN,KI,ToBeNormalized,k i,KI 321 | 4834,9,PLAIN,LIX,ToBeNormalized,l i x,LIX 322 | 4835,11,PLAIN,ICA,ToBeNormalized,i c a,ICA 323 | 4850,10,PLAIN,ArmandoFabiolaJackieAlexAfter,ToBeNormalized,a r m a n d o f a b i o l a j a c k i e a l e x a x t e r,ArmandoFabiolaJackieAlexAfter 324 | 4859,0,PLAIN,PIN,ToBeNormalized,p i n,PIN 325 | 4885,5,PUNCT,-,ToBeNormalized,to,- 326 | 4895,12,DIGIT,1271,ToBeNormalized,twelve seventy one,one two seven one 327 | 4896,1,PLAIN,EL,ToBeNormalized,e l,EL 328 | 4907,4,PLAIN,Ass,ToBeNormalized,a s s,Ass 329 | 4911,14,PLAIN,WHOI,ToBeNormalized,w h o i,WHOI 330 | 4918,9,PLAIN,idolised,RemainSelf,idolised,idolized 331 | 4929,1,PLAIN,V,ToBeNormalized,five,V 332 | 4931,17,PLAIN,nuoc,ToBeNormalized,n u o c,nuoc 333 | 4931,18,PLAIN,ngoai,ToBeNormalized,n g o a i,ngoai 334 | 4932,19,PLAIN,Etna,ToBeNormalized,e t n a,Etna 335 | 4958,4,PLAIN,MIR,ToBeNormalized,m i r,MIR 336 | 4958,15,PLAIN,miRNA,ToBeNormalized,m i r n a,miRNA 337 | 4960,1,PLAIN,secularisation,ToBeNormalized,saint,secularization 338 | 4997,0,PLAIN,Qui,ToBeNormalized,q u i,Qui 339 | 4997,1,PLAIN,est,ToBeNormalized,e s t,est 340 | 4997,2,LETTERS,Abdu'l,RemainSelf,Abdu'l,a b d u l 341 | 5014,2,PLAIN,IRA,ToBeNormalized,i r a,IRA 342 | 5025,6,PLAIN,Vaux,ToBeNormalized,v a u x,Vaux 343 | 5034,0,PLAIN,Neurocomputational,ToBeNormalized,behavioral,Neurocomputational 344 | 5039,7,PLAIN,resocialisation,ToBeNormalized,w,resocialization 345 | 5064,9,PLAIN,ix,ToBeNormalized,i x,ix 346 | 5071,3,PLAIN,Caichigue,ToBeNormalized,catalog,Caichigue 347 | 5115,12,PLAIN,MIT,ToBeNormalized,m i t,MIT 348 | 5158,4,PLAIN,BAS,ToBeNormalized,b a s,BAS 349 | 5160,2,PLAIN,ABS,ToBeNormalized,a b s,ABS 350 | 5166,5,PLAIN,GUS,ToBeNormalized,g u s,GUS 351 | 5195,5,PLAIN,INTRODUCTION,ToBeNormalized,i n t r o d u c t i o n,INTRODUCTION 352 | 5195,7,PLAIN,CADES,ToBeNormalized,c a d e s,CADES 353 | 5195,10,PLAIN,SERVICE,ToBeNormalized,s e r v i c e,SERVICE 354 | 5195,11,PLAIN,PROVIDERS,ToBeNormalized,p r o v i d e r s,PROVIDERS 355 | 5196,8,PLAIN,czar,ToBeNormalized,c z a r,czar 356 | 5236,12,PLAIN,A's,ToBeNormalized,a s's,A's 357 | 5238,2,PLAIN,mrs,ToBeNormalized,m r s,mrs 358 | 5253,6,PLAIN,Zuk,ToBeNormalized,z u k,Zuk 359 | 5257,4,PLAIN,ANHYDRASE,ToBeNormalized,a n h y d r a s e,ANHYDRASE 360 | 5277,10,PLAIN,miRNA,ToBeNormalized,m i r n a,miRNA 361 | 5290,10,PLAIN,Lug,ToBeNormalized,l u g,Lug 362 | 5301,0,LETTERS,Mpigi,RemainSelf,Mpigi,m p i g i 363 | 5301,10,LETTERS,Mpigi,RemainSelf,Mpigi,m p i g i 364 | 5315,0,PLAIN,Kroh,ToBeNormalized,k r o h,Kroh 365 | 5316,11,LETTERS,Mpigi,RemainSelf,Mpigi,m p i g i 366 | 5381,6,DATE,00s,ToBeNormalized,hundreds,o o 367 | 5383,13,PLAIN,vii,ToBeNormalized,v i i,vii 368 | 
5419,0,PLAIN,Nieuw,ToBeNormalized,n i e u w,Nieuw 369 | 5422,5,PLAIN,PANDAS,ToBeNormalized,p a n d a s,PANDAS 370 | 5511,8,ELECTRONIC,//www.nytimes.com/2014/06/19/fashion/no-body-talk-summer-camps.html,ToBeNormalized,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot n_letter _letter y_letter _letter t_letter _letter i_letter _letter m_letter _letter e_letter _letter s_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter n_letter e_letter _letter o_letter n_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter o_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter o_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter o_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter o_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter o_letter u_letter r_letter _letter o_letter n_letter e_letter _letter s_letter l_letter a_letter s_letter h_letter _letter,s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot n_letter y_letter t_letter i_letter m_letter e_letter s_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter e_letter n_letter t_letter y_letter _letter f_letter o_letter u_letter r_letter t_letter e_letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter o_letter _letter s_letter i_letter x_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter i_letter n_letter e_letter t_letter e_letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter f_letter a_letter s_letter h_letter i_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter n_letter o_letter _letter d_letter a_letter s_letter h_letter _letter b_letter o_letter d_letter y_letter _letter d_letter a_letter s_letter h_letter _letter t_letter a_letter l_letter k_letter _letter d_letter a_letter s_letter h_letter _letter s_letter u_letter m_letter m_letter e_letter r_letter _letter d_letter a_letter s_letter h_letter _letter c_letter a_letter m_letter p_letter s_letter dot h_letter _letter t_letter _letter m_letter _letter l_letter 371 | 5553,13,PLAIN,SM,ToBeNormalized,s m,SM 372 | 5607,8,LETTERS,zvaz,RemainSelf,zvaz,z v a z 373 | 5633,4,PLAIN,PRU,ToBeNormalized,p r u,PRU 374 | 5633,5,PLAIN,CHA,ToBeNormalized,c h a,CHA 375 | 5634,11,PLAIN,XI,ToBeNormalized,eleven,XI 376 | 5634,15,PLAIN,WILD,ToBeNormalized,w i l d,WILD 377 | 5641,9,LETTERS,Arci,RemainSelf,Arci,a r c i 378 | 5683,3,CARDINAL,310780751,ToBeNormalized,thirty one million seventy eight thousand seventy five,three hundred ten million seven hundred eighty thousand seven hundred fifty one 379 | 5684,4,PLAIN,ZOLaw,ToBeNormalized,z o l a w,ZOLaw 380 | 5688,0,PLAIN,Issa,ToBeNormalized,i s s a,Issa 381 | 5705,14,PLAIN,Rep,ToBeNormalized,r e p,Rep 382 | 5711,7,PLAIN,mrs,ToBeNormalized,mister,mrs 383 | 5744,0,PLAIN,NASA,ToBeNormalized,n a s a,NASA 384 | 5746,8,LETTERS,Ilam,RemainSelf,Ilam,i l a m 385 | 5747,6,PLAIN,PCs,ToBeNormalized,p c's,PCs 386 | 5756,12,PLAIN,COO,ToBeNormalized,c o o,COO 387 | 
5757,8,LETTERS,Fase,RemainSelf,Fase,f a s e 388 | 5769,6,PLAIN,Wu,ToBeNormalized,w u,Wu 389 | 5771,9,LETTERS,OAM,RemainSelf,OAM,o a m 390 | 5772,4,PLAIN,MAC,ToBeNormalized,m a c,MAC 391 | 5778,2,PLAIN,Xiu,ToBeNormalized,x i u,Xiu 392 | 5803,3,PLAIN,Isla,ToBeNormalized,i s l a,Isla 393 | 5828,2,FRACTION,-133/94,ToBeNormalized,minus thirty three hundred ninety fourths,minus one hundred thirty three ninety fourths 394 | 5841,2,PLAIN,Centro,ToBeNormalized,center,Centro 395 | 5841,7,PLAIN,sul,ToBeNormalized,s u l,sul 396 | 5848,15,PLAIN,Ach,ToBeNormalized,a c h,Ach 397 | 5851,5,DECIMAL,.763,ToBeNormalized,seven hundred sixty three,point seven six three 398 | 5865,3,FRACTION,⅞,ToBeNormalized,upsilon,seven eighths 399 | 5907,2,PLAIN,Sétif,ToBeNormalized,s e acute t i f,Sétif 400 | 5907,7,PLAIN,ES,ToBeNormalized,e s,ES 401 | 5907,8,PLAIN,Sétif,ToBeNormalized,s e acute t i f,Sétif 402 | 5928,11,PLAIN,DOS,ToBeNormalized,d o s,DOS 403 | 5945,5,DIGIT,68821,ToBeNormalized,sixty eight thousand eight hundred twenty one,six eight eight two one 404 | 5963,7,PLAIN,du,ToBeNormalized,d u,du 405 | 5963,9,PLAIN,er,ToBeNormalized,e r,er 406 | 5966,1,PLAIN,MIT,ToBeNormalized,m i t,MIT 407 | 5977,9,PLAIN,SAVE,ToBeNormalized,s a v e,SAVE 408 | 5988,6,PLAIN,OST,ToBeNormalized,o s t,OST 409 | 5991,3,DATE,10/10/00,ToBeNormalized,ten tenth,the tenth of october o o 410 | 5991,9,VERBATIM,-,ToBeNormalized,to,- 411 | 6009,8,PLAIN,PAC,ToBeNormalized,p a c,PAC 412 | 6021,12,PLAIN,andWissenschaftliche,ToBeNormalized,a_letter,andWissenschaftliche 413 | 6043,1,PLAIN,I'm,ToBeNormalized,one meter,I'm 414 | 6059,13,PLAIN,equalised,ToBeNormalized,e,equalized 415 | 6063,5,PLAIN,SPIN,ToBeNormalized,s p i n,SPIN 416 | 6079,10,ORDINAL,III,ToBeNormalized,three,the third 417 | 6084,1,PLAIN,ORCHESTRA,ToBeNormalized,o r c h e s t r a,ORCHESTRA 418 | 6084,3,PLAIN,SAMPLES,ToBeNormalized,s a m p l e s,SAMPLES 419 | 6099,0,LETTERS,mr,ToBeNormalized,mister,m r 420 | 6101,4,PLAIN,I,ToBeNormalized,the first,I 421 | 6112,7,PLAIN,KANU,ToBeNormalized,k a n u,KANU 422 | 6115,6,PLAIN,d'etre,ToBeNormalized,n e t r e,d'etre 423 | 6123,5,DATE,2010,ToBeNormalized,two o one o,twenty ten 424 | 6124,2,PLAIN,IL,ToBeNormalized,i l,IL 425 | 6142,7,PLAIN,ne's,ToBeNormalized,n e's,ne's 426 | 6151,1,PLAIN,neighbourhood's,ToBeNormalized,neighborhoods,neighbourhood's 427 | 6155,21,PLAIN,fundraise,ToBeNormalized,f_letter,fundraise 428 | 6168,4,PLAIN,AB,ToBeNormalized,a b,AB 429 | 6206,1,LETTERS,Tou,RemainSelf,Tou,t o u 430 | 6218,1,PLAIN,Aamir,ToBeNormalized,a a m i r,Aamir 431 | 6235,5,LETTERS,d'Yeu,RemainSelf,d'Yeu,d y e u 432 | 6235,15,PLAIN,V,ToBeNormalized,the fifth,V 433 | 6258,14,PLAIN,COM,ToBeNormalized,c o m,COM 434 | 6259,2,PLAIN,all,ToBeNormalized,a l l,all 435 | 6261,11,ELECTRONIC,https://web.archive.org/20130716070450/http://www.warriors.co.nz:80/playerprofiledisplay/Warriors/Suaia%20Matagi/7207,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter dot a_letter r_letter c_letter h_letter i_letter v_letter e_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter o_letter _letter 
o_letter _letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter t_letter t_letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter s_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter e_letter b_letter dot a_letter r_letter c_letter h_letter i_letter v_letter e_letter dot o_letter r_letter g_letter _letter s_letter l_letter a_letter s_letter h_letter _letter t_letter w_letter o_letter _letter o_letter _letter o_letter n_letter e_letter _letter t_letter h_letter r_letter e_letter e_letter _letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter o_letter n_letter e_letter _letter s_letter i_letter x_letter _letter o_letter _letter s_letter e_letter v_letter e_letter n_letter _letter o_letter _letter f_letter o_letter u_letter r_letter _letter f_letter i_letter v_letter e_letter _letter o_letter _letter s_letter l_letter a_letter s_letter h_letter _letter h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot w_letter a_letter r_letter r_letter i_letter o_letter r_letter s_letter dot c_letter o_letter dot n_letter _letter z_letter _letter c_letter o_letter l_letter o_letter n_letter _letter e_letter i_letter g_letter h_letter t_letter y_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter l_letter a_letter y_letter e_letter r_letter p_letter r_letter o_letter f_letter i_letter l_letter e_letter d_letter i_letter s_letter p_letter l_letter a_letter y_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter a_letter r_letter r_letter i_letter o_letter r_letter s_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter u_letter a_letter i_letter a_letter _letter p_letter e_letter r_letter c_letter e_letter n_letter t_letter _letter t_letter w_letter e_letter n_letter t_letter y_letter _letter m_letter a_letter t_letter a_letter g_letter i_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter e_letter v_letter e_letter n_letter _letter t_letter w_letter o_letter _letter o_letter _letter s_letter e_letter v_letter e_letter n_letter 436 | 6280,0,PLAIN,EGAN,ToBeNormalized,e g a n,EGAN 437 | 6280,2,PLAIN,JOHN,ToBeNormalized,j o h n,JOHN 438 | 6280,4,PLAIN,THOMAS,ToBeNormalized,t h o m a s,THOMAS 439 | 6284,3,PLAIN,Up,ToBeNormalized,u p,Up 440 | 6289,1,PLAIN,AS,ToBeNormalized,a s,AS 441 | 6314,21,PLAIN,Naama,ToBeNormalized,n a a m a,Naama 442 | 6329,4,PLAIN,Oh,ToBeNormalized,o h,Oh 443 | 6339,2,DIGIT,2013,ToBeNormalized,twenty thirteen,two o one three 444 | 6358,5,PLAIN,Ill,ToBeNormalized,i l l,Ill 445 | 6365,6,DATE,2010,ToBeNormalized,two o one o,twenty ten 446 | 6410,19,PLAIN,DID,ToBeNormalized,d i d,DID 447 | 6431,4,PLAIN,Prus',ToBeNormalized,p r u's,Prus' 448 | 6438,9,PLAIN,Lok,ToBeNormalized,l o k,Lok 449 | 6473,2,PLAIN,Dev,ToBeNormalized,d e v,Dev 450 | 6473,13,PLAIN,Dev,ToBeNormalized,d e v,Dev 451 | 6482,4,PLAIN,Prus',ToBeNormalized,p r u's,Prus' 452 | 
6496,4,PLAIN,UCCIO,ToBeNormalized,u c c i o,UCCIO 453 | 6496,10,PLAIN,DE,ToBeNormalized,d e,DE 454 | 6504,6,PLAIN,revolutionise,RemainSelf,revolutionise,revolutionize 455 | 6506,10,PLAIN,ACT,ToBeNormalized,a c t,ACT 456 | 6524,11,PLAIN,AIDS,ToBeNormalized,a i d s,AIDS 457 | 6577,21,PLAIN,MUN,ToBeNormalized,m u n,MUN 458 | 6581,6,LETTERS,nul,RemainSelf,nul,n u l 459 | 6612,5,PLAIN,DOI,ToBeNormalized,d o i,DOI 460 | 6618,7,PLAIN,Ku,ToBeNormalized,k u,Ku 461 | 6637,2,PLAIN,I,ToBeNormalized,the first,I 462 | 6648,11,PLAIN,GOV,ToBeNormalized,g o v,GOV 463 | 6711,9,PLAIN,ISO,ToBeNormalized,i s o,ISO 464 | 6717,1,PLAIN,I,ToBeNormalized,one,I 465 | 6743,1,PLAIN,est,ToBeNormalized,e s t,est 466 | 6777,3,CARDINAL,V,ToBeNormalized,the fifth,five 467 | 6800,0,PLAIN,REGIO,ToBeNormalized,r e g i o,REGIO 468 | 6800,20,PLAIN,ACID,ToBeNormalized,a c i d,ACID 469 | 6814,7,PLAIN,AG,ToBeNormalized,a g,AG 470 | 6818,0,PLAIN,DAR,ToBeNormalized,d a r,DAR 471 | 6844,1,PLAIN,Umro,ToBeNormalized,u m r o,Umro 472 | 6856,5,PLAIN,B's,ToBeNormalized,b's,B's 473 | 6857,9,PLAIN,Scat,ToBeNormalized,s c a t,Scat 474 | 6874,4,PLAIN,I,ToBeNormalized,the first,I 475 | 6878,5,PLAIN,Glas,ToBeNormalized,g l a's,Glas 476 | 6887,18,PUNCT,-,ToBeNormalized,to,- 477 | 6938,5,PLAIN,GUS,ToBeNormalized,g u s,GUS 478 | 6945,1,ORDINAL,II,ToBeNormalized,two,the second 479 | 6946,4,PLAIN,digitised,RemainSelf,digitised,digitized 480 | 6946,11,PLAIN,Digitisation,ToBeNormalized,digitized,Digitisation 481 | 6964,12,PLAIN,cue,ToBeNormalized,c u e,cue 482 | 6970,3,PLAIN,GONZALES,ToBeNormalized,g o n z a l e s,GONZALES 483 | 6972,7,PLAIN,synthestration,ToBeNormalized,s_letter,synthestration 484 | 6978,1,PLAIN,iPad,ToBeNormalized,i p a d,iPad 485 | 6999,4,PLAIN,Shab,ToBeNormalized,s h a b,Shab 486 | 7012,2,ELECTRONIC,http://www.tmaxsoft.com/product/productView.do,ToBeNormalized,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot t_letter _letter m_letter _letter a_letter _letter x_letter _letter s_letter _letter o_letter _letter f_letter _letter t_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter d_letter u_letter c_letter t_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter d_letter u_letter c_letter t_letter v_letter i_letter e_letter w_letter dot d_letter o_letter,h_letter _letter t_letter _letter t_letter _letter p_letter _letter c_letter o_letter l_letter o_letter n_letter _letter s_letter l_letter a_letter s_letter h_letter _letter s_letter l_letter a_letter s_letter h_letter _letter w_letter _letter w_letter _letter w_letter dot t_letter m_letter a_letter x_letter s_letter o_letter f_letter t_letter dot c_letter o_letter m_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter d_letter u_letter c_letter t_letter _letter s_letter l_letter a_letter s_letter h_letter _letter p_letter r_letter o_letter d_letter u_letter c_letter t_letter v_letter i_letter e_letter w_letter dot d_letter o_letter 487 | 7029,3,PLAIN,DOS,ToBeNormalized,d o s,DOS 488 | 7031,14,PLAIN,DOS,ToBeNormalized,d o s,DOS 489 | 7033,19,PLAIN,CASTOR,ToBeNormalized,c a s t o r,CASTOR 490 | 7039,5,PLAIN,ab,ToBeNormalized,a b,ab 491 | 7040,3,PLAIN,IU,ToBeNormalized,i u,IU 492 | 
7086,12,PLAIN,LIF,ToBeNormalized,l i f,LIF 493 | 7086,21,PLAIN,STAT,ToBeNormalized,s t a t,STAT 494 | 7088,11,PLAIN,circumcised,RemainSelf,circumcised,circumcized 495 | 7090,6,PLAIN,GRIN,ToBeNormalized,g r i n,GRIN 496 | 7094,0,PLAIN,HSI,ToBeNormalized,h s i,HSI 497 | 7099,0,PUNCT,¿,ToBeNormalized,f,¿ 498 | 7109,6,PLAIN,TAP,ToBeNormalized,t a p,TAP 499 | 7112,4,PUNCT,:,ToBeNormalized,to,: 500 | 7113,4,LETTERS,IV,ToBeNormalized,the fourth,i v 501 | 7123,12,LETTERS,Ssese,RemainSelf,Ssese,s s e s e 502 | 7132,13,DATE,1991,ToBeNormalized,one thousand nine hundred ninety one,nineteen ninety one 503 | 7180,9,LETTERS,Ald,RemainSelf,Ald,a l d 504 | 7196,7,PLAIN,XI,ToBeNormalized,eleven,XI 505 | 7196,10,TIME,18:00:00Z,ToBeNormalized,eighteen hours u seconds,eighteen hours zero minutes and zero seconds z 506 | 7230,16,DATE,1968,ToBeNormalized,one thousand nine hundred sixty eight,nineteen sixty eight 507 | 7234,4,PLAIN,ski,ToBeNormalized,s k i,ski 508 | 7235,13,PLAIN,Lok,ToBeNormalized,l o k,Lok 509 | 7250,3,TIME,0:02:01,ToBeNormalized,zero hours two minutes and one seconds,zero hours two minutes and one second 510 | 7256,5,PLAIN,CAB,ToBeNormalized,c a b,CAB 511 | 7286,4,PLAIN,ABA,ToBeNormalized,a b a,ABA 512 | 7291,2,PLAIN,OBE,ToBeNormalized,o b e,OBE 513 | 7295,15,PLAIN,scam,ToBeNormalized,s c a m,scam 514 | 7298,8,PLAIN,est,ToBeNormalized,e s t,est 515 | 7334,1,CARDINAL,1298015,ToBeNormalized,one two nine eight o one five,one million two hundred ninety eight thousand fifteen 516 | 7354,18,PLAIN,IX,ToBeNormalized,nine,IX 517 | 7364,11,PLAIN,Asaf,ToBeNormalized,a s a f,Asaf 518 | 7372,6,LETTERS,Graz'zt,RemainSelf,Graz'zt,g r a z z t 519 | 7385,1,LETTERS,Suat,RemainSelf,Suat,s u a t 520 | 7385,4,PLAIN,odul,ToBeNormalized,o d u l,odul 521 | 7388,20,PUNCT,-,ToBeNormalized,to,- 522 | 7388,24,PUNCT,-,ToBeNormalized,to,- 523 | 7392,7,PLAIN,I,ToBeNormalized,the first,I 524 | 7409,5,PLAIN,LAN,ToBeNormalized,l a n,LAN 525 | 7440,5,PLAIN,GO,ToBeNormalized,g o,GO 526 | 7487,2,LETTERS,Ekow,RemainSelf,Ekow,e k o w 527 | 7492,8,LETTERS,Lwala,RemainSelf,Lwala,l w a l a 528 | 7495,1,PLAIN,Ski,ToBeNormalized,s k i,Ski 529 | 7498,4,LETTERS,bd,RemainSelf,bd,b d 530 | 7530,5,LETTERS,Smer,RemainSelf,Smer,s m e r 531 | 7532,18,LETTERS,Mzee,RemainSelf,Mzee,m z e e 532 | -------------------------------------------------------------------------------- /results/russian/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/results/russian/.gitkeep -------------------------------------------------------------------------------- /results/russian/Semiotic_Class-wise_Accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/results/russian/Semiotic_Class-wise_Accuracy.png -------------------------------------------------------------------------------- /results/russian/classwise_accuracy.csv: -------------------------------------------------------------------------------- 1 | semiotic-class,accuracy,count,correct 2 | ALL,0.9928752306965964,93196,92532 3 | CARDINAL,0.9417922948073701,2388,2249 4 | DATE,0.9732441471571907,1495,1455 5 | DECIMAL,0.9,60,54 6 | DIGIT,1.0,16,16 7 | ELECTRONIC,0.6041666666666666,48,29 8 | FRACTION,0.6086956521739131,23,14 9 | LETTERS,0.9907608695652174,1840,1823 10 | MEASURE,0.8978102189781022,411,369 11 | 
MONEY,0.8947368421052632,19,17 12 | ORDINAL,0.9461358313817331,427,404 13 | PLAIN,0.994688407139769,64764,64420 14 | PUNCT,0.9998519542045006,20264,20261 15 | TELEPHONE,0.8202247191011236,89,73 16 | TIME,0.75,8,6 17 | VERBATIM,0.9985119047619048,1344,1342 18 | -------------------------------------------------------------------------------- /setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | echo "Downloading and extracting required files" 3 | wget https://storage.googleapis.com/text_normalization/test_data.zip 4 | wget https://storage.googleapis.com/text_normalization/dnc_model.zip 5 | rm -rf data 6 | rm -rf models 7 | unzip test_data.zip 8 | unzip dnc_model.zip 9 | rm test_data.zip 10 | rm dnc_model.zip 11 | echo "Finished" 12 | 13 | -------------------------------------------------------------------------------- /src/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cognibit/Text-Normalization-Demo/36355f4a2c5187948fe786b7318259151f9a9db6/src/.gitkeep -------------------------------------------------------------------------------- /src/DNCnormalize.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Cognibit Solutions LLP. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """ 16 | Text Normalization using Differentiable Neural Computer 17 | 18 | """ 19 | 20 | import os 21 | import numpy as np 22 | import tensorflow as tf 23 | from collections import OrderedDict 24 | 25 | from lib.seq2seq import Seq2SeqModel 26 | 27 | 28 | # ---------------------- 29 | # Model Flag Parameters 30 | # ---------------------- 31 | config={ 32 | 'cell_type':'dnc', 33 | 'attention_type':'bahdanau', 34 | 'hidden_units':1024, 35 | 'depth':1, 36 | 'embedding_size':32, 37 | 'memory_size':256, 38 | 'word_size':64, 39 | 'num_writes':1, 40 | 'num_reads':5, 41 | 'clip_value':20, 42 | 'beam_width':1, 43 | 'max_decode_step':150, 44 | 'use_residual':False, 45 | 'attn_input_feeding':True, 46 | 'use_dropout':False, 47 | 'dropout_rate':0.3, 48 | 'use_fp16':False 49 | 50 | } 51 | 52 | 53 | def normalize(enc_data, enc_len, model_path,batch_size=200,use_memory=True): 54 | """Normalize encoded data using the trained DNC model given""" 55 | 56 | # Initiate TF session 57 | tf.reset_default_graph() 58 | dnc_predictions=[] 59 | with tf.Session() as sess: 60 | print('Using DNC model at {}'.format(model_path)) 61 | model=create_model_decode(batch_size=batch_size,use_memory=use_memory) 62 | restore_model(model,sess,model_path) 63 | 64 | num_batches=int(enc_data.shape[0]/batch_size) 65 | print('Number of batches: {}'.format(num_batches)) 66 | 67 | for i in range(num_batches): 68 | predict=model.predict(sess,enc_data[i*batch_size:i*batch_size+batch_size], 69 | enc_len[i*batch_size:i*batch_size+batch_size]) 70 | predict = np.split(predict,batch_size,axis=0) 71 | dnc_predictions.extend(predict) 72 | 73 | if i%max(1,int(num_batches/25)) == 0: 74 | print('Normalized {} out of {}'.format((i+1)*batch_size, 75 | num_batches*batch_size)) 76 | 77 | # Process the last batch by padding it with zeros to a full batch 78 | if(enc_data.shape[0]%batch_size != 0): 79 | lastbatch = enc_data[num_batches*batch_size:] 80 | lastbatch_len= enc_len[num_batches*batch_size:] 81 | lastbatch=np.concatenate((lastbatch,np.zeros([batch_size-lastbatch.shape[0], 82 | lastbatch.shape[1]])),axis=0) 83 | 84 | lastbatch_len=np.concatenate((lastbatch_len, 85 | np.ones([batch_size-lastbatch_len.shape[0]])),axis=0) 86 | 87 | predict=model.predict(sess,lastbatch,lastbatch_len) 88 | predict=np.split(predict,batch_size,axis=0) 89 | dnc_predictions.extend(predict) 90 | 91 | return dnc_predictions 92 | 93 | def create_model_decode(batch_size,use_memory): 94 | model = Seq2SeqModel(config,'decode',batch_size,use_memory=use_memory) 95 | return model 96 | 97 | def restore_model(model, sess, model_path): 98 | print('Reloading model parameters...') 99 | model.restore(sess, model_path) 100 | return None -------------------------------------------------------------------------------- /src/Encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Cognibit Solutions LLP. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ 16 | Generate Required Encoding for XGBoost and DNC Model 17 | 18 | """ 19 | 20 | import pickle 21 | import numpy as np 22 | from multiprocessing import Pool 23 | import pandas as pd 24 | import itertools 25 | 26 | class XGBoostEncodingGenerator: 27 | 28 | def __init__(self,space_letter=0,max_num_features = 30,pad_size = 1,boundary_letter = -1): 29 | self.space_letter=space_letter 30 | self.max_num_features=max_num_features 31 | self.boundary_letter=boundary_letter 32 | self.pad_size=pad_size 33 | 34 | def context_window_transform(self,data, pad_size,flush_progress=True): 35 | pre = np.zeros(self.max_num_features) 36 | pre = [pre for x in np.arange(pad_size)] 37 | data = pre + data + pre 38 | neo_data = [] 39 | for i in np.arange(len(data) - pad_size * 2): 40 | row = [] 41 | if(flush_progress and i%100==0): 42 | print('Processed %f%%'%((i/(len(data) - pad_size * 2-1))*100),end='\r') 43 | for x in data[i : i + pad_size * 2 + 1]: 44 | row.append([self.boundary_letter]) 45 | row.append(x) 46 | row.append([self.boundary_letter]) 47 | merged=list(itertools.chain(*row)) 48 | neo_data.append(merged) 49 | if(flush_progress): 50 | print('Processed 100% ',end='\r') 51 | return neo_data 52 | 53 | def encode(self,df): 54 | x_data = [] 55 | for x in df['before'].values: 56 | x_row = np.ones(self.max_num_features, dtype=int) * self.space_letter 57 | for xi, i in zip(list(str(x)), np.arange(self.max_num_features)): 58 | x_row[i] = ord(xi) 59 | x_data.append(x_row) 60 | return np.array(self.context_window_transform(x_data, self.pad_size), dtype = np.int16) 61 | 62 | def encode_csv(self,csv_file): 63 | csv=pd.read_csv(csv_file) 64 | encoding=self.encode(csv) 65 | print('Finished Encoding %s'%csv_file) 66 | return encoding 67 | 68 | def encode_csvs_parallel(self,csv_list,n_threads=8): 69 | """ 70 | Encode Multiple CSVs in parallel 71 | """ 72 | if (n_threads < 1): 73 | raise ValueError('n_threads must be at least 1, cannot proceed!') 74 | threads = Pool(n_threads) 75 | all_enc=threads.map(self.encode_csv,csv_list) 76 | return all_enc -------------------------------------------------------------------------------- /src/XGBclassify.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Cognibit Solutions LLP. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """ 16 | Predict a token as 'ToBeNormalized' or 'RemainSame' using XGBoost 17 | """ 18 | import pickle 19 | import xgboost 20 | import numpy as np 21 | import pandas as pd 22 | from Encoder import XGBoostEncodingGenerator 23 | 24 | class XGB: 25 | """XGBoost 26 | 27 | API wrapper for trained XGBoost model 28 | """ 29 | 30 | def __init__(self, path='../models/english/en-xgb[0.5]'): 31 | """Initialize & load model with its parameters""" 32 | # init model 33 | self.model = pickle.load(open(path, "rb")) 34 | # load model 35 | # self.model.load_model(path) 36 | # init parameters 37 | self.max_num_features = 30 38 | self.pad_size = 1 39 | self.boundary_letter = -1 40 | self.space_letter = 0 41 | self.labels = ['RemainSelf', 'ToBeNormalized'] 42 | return None 43 | 44 | def predict(self, data): 45 | """XGBoost prediction 46 | 47 | Classifies the dataframe's 'before' tokens 48 | 49 | Args: 50 | data: pandas dataframe having 'before' column 51 | 52 | Returns: 53 | y_labels: list of class labels 54 | """ 55 | # pre-process data 56 | encoded_data = self._encode(data) 57 | enc_gen = XGBoostEncodingGenerator() 58 | 59 | contextual_data = np.array(enc_gen.context_window_transform(encoded_data, self.pad_size)) 60 | columns=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 61 | '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', 62 | '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', 63 | '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', 64 | '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', 65 | '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', 66 | '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', 67 | '85', '86', '87', '88', '89', '90', '91', '92', '93'] 68 | X = pd.DataFrame(data=contextual_data, columns=columns) 69 | 70 | # classify as RemainSelf or ToBeNormalized 71 | y = self.model.predict(X) 72 | y_labels = [self.labels[int(i)] for i in y] 73 | return y_labels 74 | 75 | def _encode(self, data): 76 | """Encodes data into vectors""" 77 | encoded_data = [] 78 | for x in data['before'].values: 79 | x_row = np.ones(self.max_num_features, dtype=int) * self.space_letter 80 | for xi, i in zip(list(str(x)), np.arange(self.max_num_features)): 81 | x_row[i] = ord(xi) 82 | encoded_data.append(x_row) 83 | return encoded_data 84 | 85 | def _context_window_transform(self, data, pad_size): 86 | """Transforms into a context window""" 87 | pre = np.zeros(self.max_num_features) 88 | pre = [pre for x in np.arange(pad_size)] 89 | data = pre + data + pre 90 | context_data = [] 91 | for i in np.arange(len(data) - pad_size * 2): 92 | row = [] 93 | for x in data[i: i + pad_size * 2 + 1]: 94 | row.append([self.boundary_letter]) 95 | row.append(x) 96 | row.append([self.boundary_letter]) 97 | context_data.append([int(x) for y in row for x in y]) 98 | return context_data 99 | -------------------------------------------------------------------------------- /src/classification_report.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Cognibit Solutions LLP. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ 16 | 17 | Generates classification report for the trained XGBoost models 18 | """ 19 | 20 | import itertools 21 | import numpy as np 22 | import matplotlib.pyplot as plt 23 | from sklearn.metrics import confusion_matrix as cm 24 | from sklearn.metrics import precision_recall_curve 25 | from sklearn.metrics import average_precision_score 26 | from sklearn.metrics import classification_report as report 27 | 28 | def preprocessing(results, truth): 29 | # preprocessing 30 | results.loc[truth['before']==truth['after'],'truth']='RemainSelf' 31 | results.loc[truth['before']!=truth['after'],'truth']='ToBeNormalized' 32 | truth['class']='' 33 | truth.loc[truth['before']!=truth['after'],'class']='ToBeNormalized' 34 | truth.loc[truth['before']==truth['after'],'class']='RemainSelf' 35 | return results, truth 36 | 37 | def f1_scores(results, truth): 38 | print(report(truth['class'].tolist(), results['class'].tolist())) 39 | 40 | def confusion_matrix(results, truth, lang): 41 | matrix = cm(truth['class'].tolist(), results['class'].tolist()) 42 | plot_confusion_matrix(matrix, classes=['ToBeNormalized', 'RemainSelf'], 43 | title='XGBoost Confusion Matrix [{}]'.format(lang)) 44 | 45 | def pr_curve(results, truth, lang): 46 | truth.loc[truth['class']=='ToBeNormalized', 'class'] = 1 47 | truth.loc[truth['class']=='RemainSelf', 'class'] = 0 48 | results.loc[results['class']=='ToBeNormalized', 'class'] = 1 49 | results.loc[results['class']=='RemainSelf', 'class'] = 0 50 | 51 | average_precision = average_precision_score(truth['class'].tolist(), results['class'].tolist()) 52 | precision, recall, threshold = precision_recall_curve(truth['class'].tolist(), results['class'].tolist()) 53 | 54 | plt.step(recall, precision, color='b', alpha=0.2, where='post') 55 | plt.fill_between(recall, precision, alpha=0.2, color='b') 56 | plt.xlabel('Recall') 57 | plt.ylabel('Precision') 58 | plt.ylim([0.0, 1.05]) 59 | plt.xlim([0.0, 1.0]) 60 | plt.title('Precision-Recall Curve: AP={0:0.2f} [{1}]'.format(average_precision, lang)) 61 | plt.show() 62 | 63 | def plot_confusion_matrix(cm, classes, 64 | title='Confusion matrix', 65 | cmap=plt.cm.Blues): 66 | """ 67 | This function prints and plots the confusion matrix. 68 | """ 69 | plt.imshow(cm, interpolation='nearest', cmap=cmap) 70 | plt.title(title) 71 | plt.colorbar() 72 | tick_marks = np.arange(len(classes)) 73 | plt.xticks(tick_marks, classes, rotation=45) 74 | plt.yticks(tick_marks, classes) 75 | 76 | fmt = 'd' 77 | thresh = cm.max() / 2. 
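    # Editorial note: the loop below annotates each cell of the matrix with its
    # count, drawing white text on cells darker than half the maximum value so
    # the numbers stay readable against the colormap.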
78 | for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): 79 | plt.text(j, i, format(cm[i, j], fmt), 80 | horizontalalignment="center", 81 | color="white" if cm[i, j] > thresh else "black") 82 | 83 | plt.ylabel('True label') 84 | plt.xlabel('Predicted label') 85 | plt.tight_layout() 86 | 87 | -------------------------------------------------------------------------------- /src/lib/access.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """DNC access modules.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import sonnet as snt 23 | import tensorflow as tf 24 | 25 | from . import addressing 26 | from . import util 27 | 28 | AccessState = collections.namedtuple('AccessState', ( 29 | 'memory', 'read_weights', 'write_weights', 'linkage', 'usage')) 30 | 31 | 32 | def _erase_and_write(memory, address, reset_weights, values): 33 | """Module to erase and write in the external memory. 34 | 35 | Erase operation: 36 | M_t'(i) = M_{t-1}(i) * (1 - w_t(i) * e_t) 37 | 38 | Add operation: 39 | M_t(i) = M_t'(i) + w_t(i) * a_t 40 | 41 | where e are the reset_weights, w the write weights and a the values. 42 | 43 | Args: 44 | memory: 3-D tensor of shape `[batch_size, memory_size, word_size]`. 45 | address: 3-D tensor `[batch_size, num_writes, memory_size]`. 46 | reset_weights: 3-D tensor `[batch_size, num_writes, word_size]`. 47 | values: 3-D tensor `[batch_size, num_writes, word_size]`. 48 | 49 | Returns: 50 | 3-D tensor of shape `[batch_size, num_writes, word_size]`. 51 | """ 52 | with tf.name_scope('erase_memory', values=[memory, address, reset_weights]): 53 | expand_address = tf.expand_dims(address, 3) 54 | reset_weights = tf.expand_dims(reset_weights, 2) 55 | weighted_resets = expand_address * reset_weights 56 | reset_gate = tf.reduce_prod(1 - weighted_resets, [1]) 57 | memory *= reset_gate 58 | 59 | with tf.name_scope('additive_write', values=[memory, address, values]): 60 | add_matrix = tf.matmul(address, values, adjoint_a=True) 61 | memory += add_matrix 62 | 63 | return memory 64 | 65 | 66 | class MemoryAccess(snt.RNNCore): 67 | """Access module of the Differentiable Neural Computer. 68 | 69 | This memory module supports multiple read and write heads. It makes use of: 70 | 71 | * `addressing.TemporalLinkage` to track the temporal ordering of writes in 72 | memory for each write head. 73 | * `addressing.FreenessAllocator` for keeping track of memory usage, where 74 | usage increase when a memory location is written to, and decreases when 75 | memory is read from that the controller says can be freed. 76 | 77 | Write-address selection is done by an interpolation between content-based 78 | lookup and using unused memory. 
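  The interpolation is controlled by a per-head allocation gate, and the overall
  amount written is scaled by a per-head write gate; both are computed from the
  access module's input.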
79 | 80 | Read-address selection is done by an interpolation of content-based lookup 81 | and following the link graph in the forward or backwards read direction. 82 | """ 83 | 84 | def __init__(self, 85 | memory_size=128, 86 | word_size=20, 87 | num_reads=1, 88 | num_writes=1, 89 | name='memory_access'): 90 | """Creates a MemoryAccess module. 91 | 92 | Args: 93 | memory_size: The number of memory slots (N in the DNC paper). 94 | word_size: The width of each memory slot (W in the DNC paper) 95 | num_reads: The number of read heads (R in the DNC paper). 96 | num_writes: The number of write heads (fixed at 1 in the paper). 97 | name: The name of the module. 98 | """ 99 | super(MemoryAccess, self).__init__(name=name) 100 | self._memory_size = memory_size 101 | self._word_size = word_size 102 | self._num_reads = num_reads 103 | self._num_writes = num_writes 104 | 105 | self._write_content_weights_mod = addressing.CosineWeights( 106 | num_writes, word_size, name='write_content_weights') 107 | self._read_content_weights_mod = addressing.CosineWeights( 108 | num_reads, word_size, name='read_content_weights') 109 | 110 | self._linkage = addressing.TemporalLinkage(memory_size, num_writes) 111 | self._freeness = addressing.Freeness(memory_size) 112 | 113 | def _build(self, inputs, prev_state): 114 | """Connects the MemoryAccess module into the graph. 115 | 116 | Args: 117 | inputs: tensor of shape `[batch_size, input_size]`. This is used to 118 | control this access module. 119 | prev_state: Instance of `AccessState` containing the previous state. 120 | 121 | Returns: 122 | A tuple `(output, next_state)`, where `output` is a tensor of shape 123 | `[batch_size, num_reads, word_size]`, and `next_state` is the new 124 | `AccessState` named tuple at the current time t. 125 | """ 126 | inputs = self._read_inputs(inputs) 127 | 128 | # Update usage using inputs['free_gate'] and previous read & write weights. 129 | usage = self._freeness( 130 | write_weights=prev_state.write_weights, 131 | free_gate=inputs['free_gate'], 132 | read_weights=prev_state.read_weights, 133 | prev_usage=prev_state.usage) 134 | 135 | # Write to memory. 136 | write_weights = self._write_weights(inputs, prev_state.memory, usage) 137 | memory = _erase_and_write( 138 | prev_state.memory, 139 | address=write_weights, 140 | reset_weights=inputs['erase_vectors'], 141 | values=inputs['write_vectors']) 142 | 143 | linkage_state = self._linkage(write_weights, prev_state.linkage) 144 | 145 | # Read from memory. 146 | read_weights = self._read_weights( 147 | inputs, 148 | memory=memory, 149 | prev_read_weights=prev_state.read_weights, 150 | link=linkage_state.link) 151 | read_words = tf.matmul(read_weights, memory) 152 | 153 | return (read_words, AccessState( 154 | memory=memory, 155 | read_weights=read_weights, 156 | write_weights=write_weights, 157 | linkage=linkage_state, 158 | usage=usage)) 159 | 160 | def _read_inputs(self, inputs): 161 | """Applies transformations to `inputs` to get control for this module.""" 162 | 163 | def _linear(first_dim, second_dim, name, activation=None): 164 | """Returns a linear transformation of `inputs`, followed by a reshape.""" 165 | linear = snt.Linear(first_dim * second_dim, name=name)(inputs) 166 | if activation is not None: 167 | linear = activation(linear, name=name + '_activation') 168 | return tf.reshape(linear, [-1, first_dim, second_dim]) 169 | 170 | # v_t^i - The vectors to write to memory, for each write head `i`. 
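    # (Produced by `_linear` below: a learned projection of `inputs` reshaped to
    # `[batch_size, num_writes, word_size]`.)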
171 | write_vectors = _linear(self._num_writes, self._word_size, 'write_vectors') 172 | 173 | # e_t^i - Amount to erase the memory by before writing, for each write head. 174 | erase_vectors = _linear(self._num_writes, self._word_size, 'erase_vectors', 175 | tf.sigmoid) 176 | 177 | # f_t^j - Amount that the memory at the locations read from at the previous 178 | # time step can be declared unused, for each read head `j`. 179 | free_gate = tf.sigmoid( 180 | snt.Linear(self._num_reads, name='free_gate')(inputs)) 181 | 182 | # g_t^{a, i} - Interpolation between writing to unallocated memory and 183 | # content-based lookup, for each write head `i`. Note: `a` is simply used to 184 | # identify this gate with allocation vs writing (as defined below). 185 | allocation_gate = tf.sigmoid( 186 | snt.Linear(self._num_writes, name='allocation_gate')(inputs)) 187 | 188 | # g_t^{w, i} - Overall gating of write amount for each write head. 189 | write_gate = tf.sigmoid( 190 | snt.Linear(self._num_writes, name='write_gate')(inputs)) 191 | 192 | # \pi_t^j - Mixing between "backwards" and "forwards" positions (for 193 | # each write head), and content-based lookup, for each read head. 194 | num_read_modes = 1 + 2 * self._num_writes 195 | read_mode = snt.BatchApply(tf.nn.softmax)( 196 | _linear(self._num_reads, num_read_modes, name='read_mode')) 197 | 198 | # Parameters for the (read / write) "weights by content matching" modules. 199 | write_keys = _linear(self._num_writes, self._word_size, 'write_keys') 200 | write_strengths = snt.Linear(self._num_writes, name='write_strengths')( 201 | inputs) 202 | 203 | read_keys = _linear(self._num_reads, self._word_size, 'read_keys') 204 | read_strengths = snt.Linear(self._num_reads, name='read_strengths')(inputs) 205 | 206 | result = { 207 | 'read_content_keys': read_keys, 208 | 'read_content_strengths': read_strengths, 209 | 'write_content_keys': write_keys, 210 | 'write_content_strengths': write_strengths, 211 | 'write_vectors': write_vectors, 212 | 'erase_vectors': erase_vectors, 213 | 'free_gate': free_gate, 214 | 'allocation_gate': allocation_gate, 215 | 'write_gate': write_gate, 216 | 'read_mode': read_mode, 217 | } 218 | return result 219 | 220 | def _write_weights(self, inputs, memory, usage): 221 | """Calculates the memory locations to write to. 222 | 223 | This uses a combination of content-based lookup and finding an unused 224 | location in memory, for each write head. 225 | 226 | Args: 227 | inputs: Collection of inputs to the access module, including controls for 228 | how to chose memory writing, such as the content to look-up and the 229 | weighting between content-based and allocation-based addressing. 230 | memory: A tensor of shape `[batch_size, memory_size, word_size]` 231 | containing the current memory contents. 232 | usage: Current memory usage, which is a tensor of shape `[batch_size, 233 | memory_size]`, used for allocation-based addressing. 234 | 235 | Returns: 236 | tensor of shape `[batch_size, num_writes, memory_size]` indicating where 237 | to write to (if anywhere) for each write head. 238 | """ 239 | with tf.name_scope('write_weights', values=[inputs, memory, usage]): 240 | # c_t^{w, i} - The content-based weights for each write head. 241 | write_content_weights = self._write_content_weights_mod( 242 | memory, inputs['write_content_keys'], 243 | inputs['write_content_strengths']) 244 | 245 | # a_t^i - The allocation weights for each write head. 
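      # (Computed by the Freeness module from current usage: slots are ranked by
      # usage and the least-used locations receive the largest allocation weight.)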
246 | write_allocation_weights = self._freeness.write_allocation_weights( 247 | usage=usage, 248 | write_gates=(inputs['allocation_gate'] * inputs['write_gate']), 249 | num_writes=self._num_writes) 250 | 251 | # Expands gates over memory locations. 252 | allocation_gate = tf.expand_dims(inputs['allocation_gate'], -1) 253 | write_gate = tf.expand_dims(inputs['write_gate'], -1) 254 | 255 | # w_t^{w, i} - The write weightings for each write head. 256 | return write_gate * (allocation_gate * write_allocation_weights + 257 | (1 - allocation_gate) * write_content_weights) 258 | 259 | def _read_weights(self, inputs, memory, prev_read_weights, link): 260 | """Calculates read weights for each read head. 261 | 262 | The read weights are a combination of following the link graphs in the 263 | forward or backward directions from the previous read position, and doing 264 | content-based lookup. The interpolation between these different modes is 265 | done by `inputs['read_mode']`. 266 | 267 | Args: 268 | inputs: Controls for this access module. This contains the content-based 269 | keys to lookup, and the weightings for the different read modes. 270 | memory: A tensor of shape `[batch_size, memory_size, word_size]` 271 | containing the current memory contents to do content-based lookup. 272 | prev_read_weights: A tensor of shape `[batch_size, num_reads, 273 | memory_size]` containing the previous read locations. 274 | link: A tensor of shape `[batch_size, num_writes, memory_size, 275 | memory_size]` containing the temporal write transition graphs. 276 | 277 | Returns: 278 | A tensor of shape `[batch_size, num_reads, memory_size]` containing the 279 | read weights for each read head. 280 | """ 281 | with tf.name_scope( 282 | 'read_weights', values=[inputs, memory, prev_read_weights, link]): 283 | # c_t^{r, i} - The content weightings for each read head. 284 | content_weights = self._read_content_weights_mod( 285 | memory, inputs['read_content_keys'], inputs['read_content_strengths']) 286 | 287 | # Calculates f_t^i and b_t^i. 
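      # (f_t^i follows the temporal link graph forward from the previous read
      # locations and b_t^i follows it backward; both have shape
      # `[batch_size, num_reads, num_writes, memory_size]`.)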
288 | forward_weights = self._linkage.directional_read_weights( 289 | link, prev_read_weights, forward=True) 290 | backward_weights = self._linkage.directional_read_weights( 291 | link, prev_read_weights, forward=False) 292 | 293 | backward_mode = inputs['read_mode'][:, :, :self._num_writes] 294 | forward_mode = ( 295 | inputs['read_mode'][:, :, self._num_writes:2 * self._num_writes]) 296 | content_mode = inputs['read_mode'][:, :, 2 * self._num_writes] 297 | 298 | read_weights = ( 299 | tf.expand_dims(content_mode, 2) * content_weights + tf.reduce_sum( 300 | tf.expand_dims(forward_mode, 3) * forward_weights, 2) + 301 | tf.reduce_sum(tf.expand_dims(backward_mode, 3) * backward_weights, 2)) 302 | 303 | return read_weights 304 | 305 | @property 306 | def state_size(self): 307 | """Returns a tuple of the shape of the state tensors.""" 308 | return AccessState( 309 | memory=tf.TensorShape([self._memory_size, self._word_size]), 310 | read_weights=tf.TensorShape([self._num_reads, self._memory_size]), 311 | write_weights=tf.TensorShape([self._num_writes, self._memory_size]), 312 | linkage=self._linkage.state_size, 313 | usage=self._freeness.state_size) 314 | 315 | @property 316 | def output_size(self): 317 | """Returns the output shape.""" 318 | return tf.TensorShape([self._num_reads, self._word_size]) 319 | -------------------------------------------------------------------------------- /src/lib/addressing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """DNC addressing modules.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import collections 22 | import sonnet as snt 23 | import tensorflow as tf 24 | 25 | from . import util 26 | 27 | # Ensure values are greater than epsilon to avoid numerical instability. 28 | _EPSILON = 1e-6 29 | 30 | TemporalLinkageState = collections.namedtuple('TemporalLinkageState', 31 | ('link', 'precedence_weights')) 32 | 33 | 34 | def _vector_norms(m): 35 | squared_norms = tf.reduce_sum(m * m, axis=2, keep_dims=True) 36 | return tf.sqrt(squared_norms + _EPSILON) 37 | 38 | 39 | def weighted_softmax(activations, strengths, strengths_op): 40 | """Returns softmax over activations multiplied by positive strengths. 41 | 42 | Args: 43 | activations: A tensor of shape `[batch_size, num_heads, memory_size]`, of 44 | activations to be transformed. Softmax is taken over the last dimension. 45 | strengths: A tensor of shape `[batch_size, num_heads]` containing strengths to 46 | multiply by the activations prior to the softmax. 47 | strengths_op: An operation to transform strengths before softmax. 48 | 49 | Returns: 50 | A tensor of same shape as `activations` with weighted softmax applied. 
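    For example, when `strengths_op` is `tf.nn.softplus` (the `CosineWeights`
    default), a raw strength of 0 is mapped to ln(2) ≈ 0.69, and larger strengths
    sharpen the softmax towards the best-matching locations.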
51 | """ 52 | transformed_strengths = tf.expand_dims(strengths_op(strengths), -1) 53 | sharp_activations = activations * transformed_strengths 54 | softmax = snt.BatchApply(module_or_op=tf.nn.softmax) 55 | return softmax(sharp_activations) 56 | 57 | 58 | class CosineWeights(snt.AbstractModule): 59 | """Cosine-weighted attention. 60 | 61 | Calculates the cosine similarity between a query and each word in memory, then 62 | applies a weighted softmax to return a sharp distribution. 63 | """ 64 | 65 | def __init__(self, 66 | num_heads, 67 | word_size, 68 | strength_op=tf.nn.softplus, 69 | name='cosine_weights'): 70 | """Initializes the CosineWeights module. 71 | 72 | Args: 73 | num_heads: number of memory heads. 74 | word_size: memory word size. 75 | strength_op: operation to apply to strengths (default is tf.nn.softplus). 76 | name: module name (default 'cosine_weights') 77 | """ 78 | super(CosineWeights, self).__init__(name=name) 79 | self._num_heads = num_heads 80 | self._word_size = word_size 81 | self._strength_op = strength_op 82 | 83 | def _build (self, memory, keys, strengths): 84 | """Connects the CosineWeights module into the graph. 85 | 86 | Args: 87 | memory: A 3-D tensor of shape `[batch_size, memory_size, word_size]`. 88 | keys: A 3-D tensor of shape `[batch_size, num_heads, word_size]`. 89 | strengths: A 2-D tensor of shape `[batch_size, num_heads]`. 90 | 91 | Returns: 92 | Weights tensor of shape `[batch_size, num_heads, memory_size]`. 93 | """ 94 | # Calculates the inner product between the query vector and words in memory. 95 | dot = tf.matmul(keys, memory, adjoint_b=True) 96 | 97 | # Outer product to compute denominator (euclidean norm of query and memory). 98 | memory_norms = _vector_norms(memory) 99 | key_norms = _vector_norms(keys) 100 | norm = tf.matmul(key_norms, memory_norms, adjoint_b=True) 101 | 102 | # Calculates cosine similarity between the query vector and words in memory. 103 | similarity = dot / (norm + _EPSILON) 104 | 105 | return weighted_softmax(similarity, strengths, self._strength_op) 106 | 107 | 108 | class TemporalLinkage(snt.RNNCore): 109 | """Keeps track of write order for forward and backward addressing. 110 | 111 | This is a pseudo-RNNCore module, whose state is a pair `(link, 112 | precedence_weights)`, where `link` is a (collection of) graphs for (possibly 113 | multiple) write heads (represented by a tensor with values in the range 114 | [0, 1]), and `precedence_weights` records the "previous write locations" used 115 | to build the link graphs. 116 | 117 | The function `directional_read_weights` computes addresses following the 118 | forward and backward directions in the link graphs. 119 | """ 120 | 121 | def __init__(self, memory_size, num_writes, name='temporal_linkage'): 122 | """Construct a TemporalLinkage module. 123 | 124 | Args: 125 | memory_size: The number of memory slots. 126 | num_writes: The number of write heads. 127 | name: Name of the module. 128 | """ 129 | super(TemporalLinkage, self).__init__(name=name) 130 | self._memory_size = memory_size 131 | self._num_writes = num_writes 132 | 133 | def _build(self, write_weights, prev_state): 134 | """Calculate the updated linkage state given the write weights. 135 | 136 | Args: 137 | write_weights: A tensor of shape `[batch_size, num_writes, memory_size]` 138 | containing the memory addresses of the different write heads. 
139 | prev_state: `TemporalLinkageState` tuple containg a tensor `link` of 140 | shape `[batch_size, num_writes, memory_size, memory_size]`, and a 141 | tensor `precedence_weights` of shape `[batch_size, num_writes, 142 | memory_size]` containing the aggregated history of recent writes. 143 | 144 | Returns: 145 | A `TemporalLinkageState` tuple `next_state`, which contains the updated 146 | link and precedence weights. 147 | """ 148 | link = self._link(prev_state.link, prev_state.precedence_weights, 149 | write_weights) 150 | precedence_weights = self._precedence_weights(prev_state.precedence_weights, 151 | write_weights) 152 | return TemporalLinkageState( 153 | link=link, precedence_weights=precedence_weights) 154 | 155 | def directional_read_weights(self, link, prev_read_weights, forward): 156 | """Calculates the forward or the backward read weights. 157 | 158 | For each read head (at a given address), there are `num_writes` link graphs 159 | to follow. Thus this function computes a read address for each of the 160 | `num_reads * num_writes` pairs of read and write heads. 161 | 162 | Args: 163 | link: tensor of shape `[batch_size, num_writes, memory_size, 164 | memory_size]` representing the link graphs L_t. 165 | prev_read_weights: tensor of shape `[batch_size, num_reads, 166 | memory_size]` containing the previous read weights w_{t-1}^r. 167 | forward: Boolean indicating whether to follow the "future" direction in 168 | the link graph (True) or the "past" direction (False). 169 | 170 | Returns: 171 | tensor of shape `[batch_size, num_reads, num_writes, memory_size]` 172 | """ 173 | with tf.name_scope('directional_read_weights'): 174 | # We calculate the forward and backward directions for each pair of 175 | # read and write heads; hence we need to tile the read weights and do a 176 | # sort of "outer product" to get this. 177 | expanded_read_weights = tf.stack([prev_read_weights] * self._num_writes, 178 | 1) 179 | result = tf.matmul(expanded_read_weights, link, adjoint_b=forward) 180 | # Swap dimensions 1, 2 so order is [batch, reads, writes, memory]: 181 | return tf.transpose(result, perm=[0, 2, 1, 3]) 182 | 183 | def _link(self, prev_link, prev_precedence_weights, write_weights): 184 | """Calculates the new link graphs. 185 | 186 | For each write head, the link is a directed graph (represented by a matrix 187 | with entries in range [0, 1]) whose vertices are the memory locations, and 188 | an edge indicates temporal ordering of writes. 189 | 190 | Args: 191 | prev_link: A tensor of shape `[batch_size, num_writes, memory_size, 192 | memory_size]` representing the previous link graphs for each write 193 | head. 194 | prev_precedence_weights: A tensor of shape `[batch_size, num_writes, 195 | memory_size]` which is the previous "aggregated" write weights for 196 | each write head. 197 | write_weights: A tensor of shape `[batch_size, num_writes, memory_size]` 198 | containing the new locations in memory written to. 199 | 200 | Returns: 201 | A tensor of shape `[batch_size, num_writes, memory_size, memory_size]` 202 | containing the new link graphs for each write head. 
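      The update implemented below is, per write head,
      L_t[i, j] = (1 - w_t[i] - w_t[j]) * L_{t-1}[i, j] + w_t[i] * p_{t-1}[j],
      where w_t are the write weights and p_{t-1} the previous precedence
      weights; the diagonal is then zeroed to remove self-looping edges.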
203 | """ 204 | with tf.name_scope('link'): 205 | batch_size = prev_link.get_shape()[0].value 206 | write_weights_i = tf.expand_dims(write_weights, 3) 207 | write_weights_j = tf.expand_dims(write_weights, 2) 208 | prev_precedence_weights_j = tf.expand_dims(prev_precedence_weights, 2) 209 | prev_link_scale = 1 - write_weights_i - write_weights_j 210 | new_link = write_weights_i * prev_precedence_weights_j 211 | link = prev_link_scale * prev_link + new_link 212 | # Return the link with the diagonal set to zero, to remove self-looping 213 | # edges. 214 | return tf.matrix_set_diag( 215 | link, 216 | tf.zeros( 217 | [batch_size, self._num_writes, self._memory_size], 218 | dtype=link.dtype)) 219 | 220 | def _precedence_weights(self, prev_precedence_weights, write_weights): 221 | """Calculates the new precedence weights given the current write weights. 222 | 223 | The precedence weights are the "aggregated write weights" for each write 224 | head, where write weights with sum close to zero will leave the precedence 225 | weights unchanged, but with sum close to one will replace the precedence 226 | weights. 227 | 228 | Args: 229 | prev_precedence_weights: A tensor of shape `[batch_size, num_writes, 230 | memory_size]` containing the previous precedence weights. 231 | write_weights: A tensor of shape `[batch_size, num_writes, memory_size]` 232 | containing the new write weights. 233 | 234 | Returns: 235 | A tensor of shape `[batch_size, num_writes, memory_size]` containing the 236 | new precedence weights. 237 | """ 238 | with tf.name_scope('precedence_weights'): 239 | write_sum = tf.reduce_sum(write_weights, 2, keep_dims=True) 240 | return (1 - write_sum) * prev_precedence_weights + write_weights 241 | 242 | @property 243 | def state_size(self): 244 | """Returns a `TemporalLinkageState` tuple of the state tensors' shapes.""" 245 | return TemporalLinkageState( 246 | link=tf.TensorShape( 247 | [self._num_writes, self._memory_size, self._memory_size]), 248 | precedence_weights=tf.TensorShape([self._num_writes, 249 | self._memory_size]),) 250 | 251 | 252 | class Freeness(snt.RNNCore): 253 | """Memory usage that is increased by writing and decreased by reading. 254 | 255 | This module is a pseudo-RNNCore whose state is a tensor with values in 256 | the range [0, 1] indicating the usage of each of `memory_size` memory slots. 257 | 258 | The usage is: 259 | 260 | * Increased by writing, where usage is increased towards 1 at the write 261 | addresses. 262 | * Decreased by reading, where usage is decreased after reading from a 263 | location when free_gate is close to 1. 264 | 265 | The function `write_allocation_weights` can be invoked to get free locations 266 | to write to for a number of write heads. 267 | """ 268 | 269 | def __init__(self, memory_size, name='freeness'): 270 | """Creates a Freeness module. 271 | 272 | Args: 273 | memory_size: Number of memory slots. 274 | name: Name of the module. 275 | """ 276 | super(Freeness, self).__init__(name=name) 277 | self._memory_size = memory_size 278 | 279 | def _build(self, write_weights, free_gate, read_weights, prev_usage): 280 | """Calculates the new memory usage u_t. 281 | 282 | Memory that was written to in the previous time step will have its usage 283 | increased; memory that was read from and the controller says can be "freed" 284 | will have its usage decreased. 285 | 286 | Args: 287 | write_weights: tensor of shape `[batch_size, num_writes, 288 | memory_size]` giving write weights at previous time step. 
289 | free_gate: tensor of shape `[batch_size, num_reads]` which indicates 290 | which read heads read memory that can now be freed. 291 | read_weights: tensor of shape `[batch_size, num_reads, 292 | memory_size]` giving read weights at previous time step. 293 | prev_usage: tensor of shape `[batch_size, memory_size]` giving 294 | usage u_{t - 1} at the previous time step, with entries in range 295 | [0, 1]. 296 | 297 | Returns: 298 | tensor of shape `[batch_size, memory_size]` representing updated memory 299 | usage. 300 | """ 301 | # Calculation of usage is not differentiable with respect to write weights. 302 | write_weights = tf.stop_gradient(write_weights) 303 | usage = self._usage_after_write(prev_usage, write_weights) 304 | usage = self._usage_after_read(usage, free_gate, read_weights) 305 | return usage 306 | 307 | def write_allocation_weights(self, usage, write_gates, num_writes): 308 | """Calculates freeness-based locations for writing to. 309 | 310 | This finds unused memory by ranking the memory locations by usage, for each 311 | write head. (For more than one write head, we use a "simulated new usage" 312 | which takes into account the fact that the previous write head will increase 313 | the usage in that area of the memory.) 314 | 315 | Args: 316 | usage: A tensor of shape `[batch_size, memory_size]` representing 317 | current memory usage. 318 | write_gates: A tensor of shape `[batch_size, num_writes]` with values in 319 | the range [0, 1] indicating how much each write head does writing 320 | based on the address returned here (and hence how much usage 321 | increases). 322 | num_writes: The number of write heads to calculate write weights for. 323 | 324 | Returns: 325 | tensor of shape `[batch_size, num_writes, memory_size]` containing the 326 | freeness-based write locations. Note that this isn't scaled by 327 | `write_gate`; this scaling must be applied externally. 328 | """ 329 | with tf.name_scope('write_allocation_weights'): 330 | # expand gatings over memory locations 331 | write_gates = tf.expand_dims(write_gates, -1) 332 | 333 | allocation_weights = [] 334 | for i in range(num_writes): 335 | allocation_weights.append(self._allocation(usage)) 336 | # update usage to take into account writing to this new allocation 337 | usage += ((1 - usage) * write_gates[:, i, :] * allocation_weights[i]) 338 | 339 | # Pack the allocation weights for the write heads into one tensor. 340 | return tf.stack(allocation_weights, axis=1) 341 | 342 | def _usage_after_write(self, prev_usage, write_weights): 343 | """Calcualtes the new usage after writing to memory. 344 | 345 | Args: 346 | prev_usage: tensor of shape `[batch_size, memory_size]`. 347 | write_weights: tensor of shape `[batch_size, num_writes, memory_size]`. 348 | 349 | Returns: 350 | New usage, a tensor of shape `[batch_size, memory_size]`. 351 | """ 352 | with tf.name_scope('usage_after_write'): 353 | # Calculate the aggregated effect of all write heads 354 | write_weights = 1 - tf.reduce_prod(1 - write_weights, [1]) 355 | return prev_usage + (1 - prev_usage) * write_weights 356 | 357 | def _usage_after_read(self, prev_usage, free_gate, read_weights): 358 | """Calcualtes the new usage after reading and freeing from memory. 359 | 360 | Args: 361 | prev_usage: tensor of shape `[batch_size, memory_size]`. 362 | free_gate: tensor of shape `[batch_size, num_reads]` with entries in the 363 | range [0, 1] indicating the amount that locations read from can be 364 | freed. 
365 | read_weights: tensor of shape `[batch_size, num_reads, memory_size]`. 366 | 367 | Returns: 368 | New usage, a tensor of shape `[batch_size, memory_size]`. 369 | """ 370 | with tf.name_scope('usage_after_read'): 371 | free_gate = tf.expand_dims(free_gate, -1) 372 | free_read_weights = free_gate * read_weights 373 | phi = tf.reduce_prod(1 - free_read_weights, [1], name='phi') 374 | return prev_usage * phi 375 | 376 | def _allocation(self, usage): 377 | r"""Computes allocation by sorting `usage`. 378 | 379 | This corresponds to the value a = a_t[\phi_t[j]] in the paper. 380 | 381 | Args: 382 | usage: tensor of shape `[batch_size, memory_size]` indicating current 383 | memory usage. This is equal to u_t in the paper when we only have one 384 | write head, but for multiple write heads, one should update the usage 385 | while iterating through the write heads to take into account the 386 | allocation returned by this function. 387 | 388 | Returns: 389 | Tensor of shape `[batch_size, memory_size]` corresponding to allocation. 390 | """ 391 | with tf.name_scope('allocation'): 392 | # Ensure values are not too small prior to cumprod. 393 | usage = _EPSILON + (1 - _EPSILON) * usage 394 | 395 | nonusage = 1 - usage 396 | sorted_nonusage, indices = tf.nn.top_k( 397 | nonusage, k=self._memory_size, name='sort') 398 | sorted_usage = 1 - sorted_nonusage 399 | prod_sorted_usage = tf.cumprod(sorted_usage, axis=1, exclusive=True) 400 | sorted_allocation = sorted_nonusage * prod_sorted_usage 401 | inverse_indices = util.batch_invert_permutation(indices) 402 | 403 | # This final line "unsorts" sorted_allocation, so that the indexing 404 | # corresponds to the original indexing of `usage`. 405 | return util.batch_gather(sorted_allocation, inverse_indices) 406 | 407 | @property 408 | def state_size(self): 409 | """Returns the shape of the state tensor.""" 410 | return tf.TensorShape([self._memory_size]) 411 | -------------------------------------------------------------------------------- /src/lib/dnc.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """DNC Cores. 16 | 17 | These modules create a DNC core. They take input, pass parameters to the memory 18 | access module, and integrate the output of memory to form an output. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import collections 26 | import numpy as np 27 | import sonnet as snt 28 | import tensorflow as tf 29 | 30 | from . import access 31 | 32 | DNCState = collections.namedtuple('DNCState', ('access_output', 'access_state', 33 | 'controller_state')) 34 | 35 | 36 | class DNC(snt.RNNCore): 37 | """DNC core module. 38 | 39 | Contains controller and memory access module. 
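  At each time step the controller LSTM receives the external input concatenated
  with the words read from memory at the previous step; its output parameterises
  the memory access module, and the core output is a linear projection of the
  controller output concatenated with the newly read words.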
40 | """ 41 | 42 | def __init__(self, 43 | access_config, 44 | controller_config, 45 | output_size, 46 | clip_value=None,use_memory=True, 47 | name='dnc'): 48 | """Initializes the DNC core. 49 | 50 | Args: 51 | access_config: dictionary of access module configurations. 52 | controller_config: dictionary of controller (LSTM) module configurations. 53 | output_size: output dimension size of core. 54 | clip_value: clips controller and core output values to between 55 | `[-clip_value, clip_value]` if specified. 56 | name: module name (default 'dnc'). 57 | 58 | Raises: 59 | TypeError: if direct_input_size is not None for any access module other 60 | than KeyValueMemory. 61 | """ 62 | super(DNC, self).__init__(name=name) 63 | 64 | with self._enter_variable_scope(): 65 | self._controller = snt.LSTM(**controller_config) 66 | self._access = access.MemoryAccess(**access_config) 67 | self.use_memory=use_memory 68 | self._access_output_size = np.prod(self._access.output_size.as_list()) 69 | self._output_size = output_size 70 | self._clip_value = clip_value or 0 71 | 72 | self._output_size = tf.TensorShape([output_size]) 73 | self._state_size = DNCState( 74 | access_output=self._access_output_size, 75 | access_state=self._access.state_size, 76 | controller_state=self._controller.state_size) 77 | 78 | def _clip_if_enabled(self, x): 79 | if self._clip_value > 0: 80 | return tf.clip_by_value(x, -self._clip_value, self._clip_value) 81 | else: 82 | return x 83 | 84 | def _build(self, inputs, prev_state): 85 | """Connects the DNC core into the graph. 86 | 87 | Args: 88 | inputs: Tensor input. 89 | prev_state: A `DNCState` tuple containing the fields `access_output`, 90 | `access_state` and `controller_state`. `access_state` is a 3-D Tensor 91 | of shape `[batch_size, num_reads, word_size]` containing read words. 92 | `access_state` is a tuple of the access module's state, and 93 | `controller_state` is a tuple of controller module's state. 94 | 95 | Returns: 96 | A tuple `(output, next_state)` where `output` is a tensor and `next_state` 97 | is a `DNCState` tuple containing the fields `access_output`, 98 | `access_state`, and `controller_state`. 
99 | """ 100 | 101 | prev_access_output = prev_state.access_output 102 | prev_access_state = prev_state.access_state 103 | prev_controller_state = prev_state.controller_state 104 | 105 | batch_flatten = snt.BatchFlatten() 106 | if self.use_memory is False: 107 | prev_access_output=prev_access_output*0 108 | controller_input = tf.concat( 109 | [batch_flatten(inputs), batch_flatten(prev_access_output)], 1) 110 | 111 | controller_output, controller_state = self._controller( 112 | controller_input, prev_controller_state) 113 | 114 | controller_output = self._clip_if_enabled(controller_output) 115 | controller_state = snt.nest.map(self._clip_if_enabled, controller_state) 116 | 117 | access_output, access_state = self._access(controller_output, 118 | prev_access_state) 119 | if self.use_memory is False: 120 | access_output=access_output*0 121 | output = tf.concat([controller_output, batch_flatten(access_output)], 1) 122 | output = snt.Linear( 123 | output_size=self._output_size.as_list()[0], 124 | name='output_linear')(output) 125 | output = self._clip_if_enabled(output) 126 | 127 | return output, DNCState( 128 | access_output=access_output, 129 | access_state=access_state, 130 | controller_state=controller_state) 131 | 132 | def initial_state(self, batch_size, dtype=tf.float32): 133 | return DNCState( 134 | controller_state=self._controller.initial_state(batch_size, dtype), 135 | access_state=self._access.initial_state(batch_size, dtype), 136 | access_output=tf.zeros( 137 | [batch_size] + self._access.output_size.as_list(), dtype)) 138 | 139 | @property 140 | def state_size(self): 141 | return self._state_size 142 | 143 | @property 144 | def output_size(self): 145 | return self._output_size 146 | -------------------------------------------------------------------------------- /src/lib/seq2seq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Copyright 2018 Cognibit Solutions LLP. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Sequence to sequence DNC model, Training and prediction library. 
19 | """ 20 | import math 21 | 22 | import numpy as np 23 | import tensorflow as tf 24 | import tensorflow.contrib.seq2seq as seq2seq 25 | 26 | from tensorflow.python.ops.rnn_cell import GRUCell 27 | from tensorflow.python.ops.rnn_cell import LSTMCell 28 | from tensorflow.python.ops.rnn_cell import MultiRNNCell 29 | from tensorflow.python.ops.rnn_cell import DropoutWrapper, ResidualWrapper 30 | from tensorflow.python.ops import array_ops 31 | from tensorflow.python.ops import control_flow_ops 32 | from tensorflow.python.framework import constant_op 33 | from tensorflow.python.framework import dtypes 34 | from tensorflow.python.layers.core import Dense 35 | from tensorflow.python.util import nest 36 | 37 | from tensorflow.contrib.seq2seq.python.ops import attention_wrapper 38 | from tensorflow.contrib.seq2seq.python.ops import beam_search_decoder 39 | 40 | from .dnc import DNC 41 | 42 | 43 | class Seq2SeqModel(object): 44 | def __init__(self, config, mode, dnc_batch_size=None,use_memory=True): 45 | assert mode.lower() in ['train', 'decode'] 46 | 47 | self.config = config 48 | self.mode = mode.lower() 49 | 50 | self.cell_type = config['cell_type'] 51 | self.hidden_units = config['hidden_units'] 52 | self.depth = config['depth'] 53 | self.attention_type = config['attention_type'] 54 | self.embedding_size = config['embedding_size'] 55 | # self.bidirectional = config.bidirectional 56 | self.num_encoder_symbols = config['num_encoder_symbols'] 57 | self.num_decoder_symbols = config['num_decoder_symbols'] 58 | self.start_token = config['start_token'] 59 | self.end_token = config['end_token'] 60 | self.use_residual = config['use_residual'] 61 | self.attn_input_feeding = config['attn_input_feeding'] 62 | self.use_dropout = config['use_dropout'] 63 | self.keep_prob = 1.0 - config['dropout_rate'] 64 | self.global_step = tf.Variable(0, trainable=False, name='global_step') 65 | self.global_epoch_step = tf.Variable(0, trainable=False, name='global_epoch_step') 66 | self.global_epoch_step_op = \ 67 | tf.assign(self.global_epoch_step, self.global_epoch_step + 1) 68 | 69 | self.dtype = tf.float16 if config['use_fp16'] else tf.float32 70 | self.keep_prob_placeholder = tf.placeholder(self.dtype, shape=[], name='keep_prob') 71 | 72 | self.use_beamsearch_decode = False 73 | if self.mode == 'decode': 74 | self.beam_width = config['beam_width'] 75 | self.use_beamsearch_decode = True if self.beam_width > 1 else False 76 | self.max_decode_step = config['max_decode_step'] 77 | else: 78 | self.optimizer = config['optimizer'] 79 | self.learning_rate = config['learning_rate'] 80 | self.max_gradient_norm = config['max_gradient_norm'] 81 | 82 | if (self.cell_type == 'dnc'): 83 | self.num_reads = config['num_reads'] 84 | self.num_writes = config['num_writes'] 85 | self.word_size = config['word_size'] 86 | self.memory_size = config['memory_size'] 87 | self.clip_value = config['clip_value'] 88 | cell_type = DNC 89 | access_config = { 90 | "memory_size": self.memory_size, 91 | "word_size": self.word_size, 92 | "num_reads": self.num_reads, 93 | "num_writes": self.num_writes, 94 | } 95 | controller_config = { 96 | "hidden_size": self.hidden_units, 97 | } 98 | self.dnc_cell = cell_type(access_config=access_config, controller_config=controller_config, 99 | output_size=self.hidden_units, clip_value=self.clip_value,use_memory=use_memory) 100 | self.dncInitial = self.dnc_cell.initial_state 101 | #Dynamic Batch Size for DNC not yet supported hence we will use static batch size 102 | self.dnc_batch_size=dnc_batch_size 103 | 104 | 
self.build_model() 105 | 106 | def build_model(self): 107 | print("building model..") 108 | 109 | # Building encoder and decoder networks 110 | self.init_placeholders() 111 | self.build_encoder() 112 | self.build_decoder() 113 | 114 | # Merge all the training summaries 115 | self.summary_op = tf.summary.merge_all() 116 | 117 | def init_placeholders(self): 118 | # encoder_inputs: [batch_size, max_time_steps] 119 | 120 | self.encoder_inputs = tf.placeholder(dtype=tf.int32, 121 | shape=(None, None), name='encoder_inputs') 122 | 123 | # encoder_inputs_length: [batch_size] 124 | self.encoder_inputs_length = tf.placeholder( 125 | dtype=tf.int32, shape=(None,), name='encoder_inputs_length') 126 | 127 | # get dynamic batch_size 128 | self.batch_size = tf.shape(self.encoder_inputs)[0] 129 | if self.mode == 'train': 130 | # decoder_inputs: [batch_size, max_time_steps] 131 | self.decoder_inputs = tf.placeholder( 132 | dtype=tf.int32, shape=(None, None), name='decoder_inputs') 133 | # decoder_inputs_length: [batch_size] 134 | self.decoder_inputs_length = tf.placeholder( 135 | dtype=tf.int32, shape=(None,), name='decoder_inputs_length') 136 | 137 | decoder_start_token = tf.ones( 138 | shape=[self.batch_size, 1], dtype=tf.int32) * self.start_token 139 | decoder_end_token = tf.ones( 140 | shape=[self.batch_size, 1], dtype=tf.int32) * self.end_token 141 | 142 | # decoder_inputs_train: [batch_size , max_time_steps + 1] 143 | # insert _GO symbol in front of each decoder input 144 | self.decoder_inputs_train = tf.concat([decoder_start_token, 145 | self.decoder_inputs], axis=1) 146 | 147 | # decoder_inputs_length_train: [batch_size] 148 | self.decoder_inputs_length_train = self.decoder_inputs_length + 1 149 | 150 | # decoder_targets_train: [batch_size, max_time_steps + 1] 151 | # insert EOS symbol at the end of each decoder input 152 | self.decoder_targets_train = tf.concat([self.decoder_inputs, 153 | decoder_end_token], axis=1) 154 | 155 | def init_encoder_variable(self): 156 | self.encoder_cell = self.build_encoder_cell() 157 | 158 | # Initialize encoder_embeddings to have variance=1. 159 | sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. 
160 | initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=self.dtype) 161 | 162 | self.encoder_embeddings = tf.get_variable(name='embedding', 163 | shape=[self.num_encoder_symbols, self.embedding_size], 164 | initializer=initializer, dtype=self.dtype) 165 | 166 | # Embedded_inputs: [batch_size, time_step, embedding_size] 167 | self.encoder_inputs_embedded = tf.nn.embedding_lookup( 168 | params=self.encoder_embeddings, ids=self.encoder_inputs) 169 | 170 | # Input projection layer to feed embedded inputs to the cell 171 | # ** Essential when use_residual=True to match input/output dims 172 | input_layer = Dense(self.hidden_units, dtype=self.dtype, name='input_projection') 173 | 174 | # Embedded inputs having gone through input projection layer 175 | self.encoder_inputs_embedded = input_layer(self.encoder_inputs_embedded) 176 | 177 | # Encode input sequences into context vectors: 178 | # encoder_outputs: [batch_size, max_time_step, cell_output_size] 179 | # encoder_state: [batch_size, cell_output_size] 180 | 181 | if (self.cell_type == 'dnc'): 182 | initial_state = self.dncInitial(self.dnc_batch_size) 183 | self.encoder_outputs, self.encoder_last_state = tf.nn.dynamic_rnn( 184 | cell=self.encoder_cell, inputs=self.encoder_inputs_embedded, 185 | sequence_length=self.encoder_inputs_length, dtype=self.dtype, 186 | time_major=False, initial_state=initial_state) 187 | else: 188 | self.encoder_outputs, self.encoder_last_state = tf.nn.dynamic_rnn( 189 | cell=self.encoder_cell, inputs=self.encoder_inputs_embedded, 190 | sequence_length=self.encoder_inputs_length, dtype=self.dtype, 191 | time_major=False) 192 | 193 | def build_encoder(self): 194 | print("building encoder..") 195 | try: 196 | with tf.variable_scope('encoder'): 197 | self.init_encoder_variable() 198 | except: 199 | with tf.variable_scope('encoder', reuse=True): 200 | self.init_encoder_variable() 201 | 202 | def init_decoder_variable(self): 203 | # Building decoder_cell and decoder_initial_state 204 | self.decoder_cell, self.decoder_initial_state = self.build_decoder_cell() 205 | 206 | # Initialize decoder embeddings to have variance=1. 207 | sqrt3 = math.sqrt(3) # Uniform(-sqrt(3), sqrt(3)) has variance=1. 
208 | initializer = tf.random_uniform_initializer(-sqrt3, sqrt3, dtype=self.dtype) 209 | 210 | self.decoder_embeddings = tf.get_variable(name='embedding', 211 | shape=[self.num_decoder_symbols, self.embedding_size], 212 | initializer=initializer, dtype=self.dtype) 213 | 214 | # Input projection layer to feed embedded inputs to the cell 215 | # ** Essential when use_residual=True to match input/output dims 216 | input_layer = Dense(self.hidden_units, dtype=self.dtype, name='input_projection') 217 | 218 | # Output projection layer to convert cell_outputs to logits 219 | output_layer = Dense(self.num_decoder_symbols, name='output_projection') 220 | 221 | if self.mode == 'train': 222 | # decoder_inputs_embedded: [batch_size, max_time_step + 1, embedding_size] 223 | self.decoder_inputs_embedded = tf.nn.embedding_lookup( 224 | params=self.decoder_embeddings, ids=self.decoder_inputs_train) 225 | 226 | # Embedded inputs having gone through input projection layer 227 | self.decoder_inputs_embedded = input_layer(self.decoder_inputs_embedded) 228 | 229 | # Helper to feed inputs for training: read inputs from dense ground truth vectors 230 | training_helper = seq2seq.TrainingHelper(inputs=self.decoder_inputs_embedded, 231 | sequence_length=self.decoder_inputs_length_train, 232 | time_major=False, 233 | name='training_helper') 234 | 235 | training_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, 236 | helper=training_helper, 237 | initial_state=self.decoder_initial_state, 238 | output_layer=output_layer) 239 | # output_layer=None) 240 | 241 | # Maximum decoder time_steps in current batch 242 | max_decoder_length = tf.reduce_max(self.decoder_inputs_length_train) 243 | 244 | # decoder_outputs_train: BasicDecoderOutput 245 | # namedtuple(rnn_outputs, sample_id) 246 | # decoder_outputs_train.rnn_output: [batch_size, max_time_step + 1, num_decoder_symbols] if output_time_major=False 247 | # [max_time_step + 1, batch_size, num_decoder_symbols] if output_time_major=True 248 | # decoder_outputs_train.sample_id: [batch_size], tf.int32 249 | (self.decoder_outputs_train, self.decoder_last_state_train, 250 | self.decoder_outputs_length_train) = (seq2seq.dynamic_decode( 251 | decoder=training_decoder, 252 | output_time_major=False, 253 | impute_finished=True, 254 | maximum_iterations=max_decoder_length)) 255 | 256 | # More efficient to do the projection on the batch-time-concatenated tensor 257 | # logits_train: [batch_size, max_time_step + 1, num_decoder_symbols] 258 | # self.decoder_logits_train = output_layer(self.decoder_outputs_train.rnn_output) 259 | self.decoder_logits_train = tf.identity(self.decoder_outputs_train.rnn_output) 260 | # Use argmax to extract decoder symbols to emit 261 | self.decoder_pred_train = tf.argmax(self.decoder_logits_train, axis=-1, 262 | name='decoder_pred_train') 263 | 264 | # masks: masking for valid and padded time steps, [batch_size, max_time_step + 1] 265 | masks = tf.sequence_mask(lengths=self.decoder_inputs_length_train, 266 | maxlen=max_decoder_length, dtype=self.dtype, name='masks') 267 | 268 | # Computes per word average cross-entropy over a batch 269 | # Internally calls 'nn_ops.sparse_softmax_cross_entropy_with_logits' by default 270 | self.loss = seq2seq.sequence_loss(logits=self.decoder_logits_train, 271 | targets=self.decoder_targets_train, 272 | weights=masks, 273 | average_across_timesteps=True, 274 | average_across_batch=True, ) 275 | # Training summary for the current batch_loss 276 | tf.summary.scalar('loss', self.loss) 277 | 278 | # Contruct graphs for 
minimizing loss 279 | self.init_optimizer() 280 | 281 | elif self.mode == 'decode': 282 | 283 | # Start_tokens: [batch_size,] `int32` vector 284 | start_tokens = tf.ones([self.batch_size, ], tf.int32) * self.start_token 285 | end_token = self.end_token 286 | 287 | def embed_and_input_proj(inputs): 288 | return input_layer(tf.nn.embedding_lookup(self.decoder_embeddings, inputs)) 289 | 290 | if not self.use_beamsearch_decode: 291 | # Helper to feed inputs for greedy decoding: uses the argmax of the output 292 | decoding_helper = seq2seq.GreedyEmbeddingHelper(start_tokens=start_tokens, 293 | end_token=end_token, 294 | embedding=embed_and_input_proj) 295 | # Basic decoder performs greedy decoding at each time step 296 | print("building greedy decoder..") 297 | inference_decoder = seq2seq.BasicDecoder(cell=self.decoder_cell, 298 | helper=decoding_helper, 299 | initial_state=self.decoder_initial_state, 300 | output_layer=output_layer) 301 | else: 302 | # Beamsearch is used to approximately find the most likely translation 303 | print("building beamsearch decoder..") 304 | inference_decoder = beam_search_decoder.BeamSearchDecoder(cell=self.decoder_cell, 305 | embedding=embed_and_input_proj, 306 | start_tokens=start_tokens, 307 | end_token=end_token, 308 | initial_state=self.decoder_initial_state, 309 | beam_width=self.beam_width, 310 | output_layer=output_layer, ) 311 | 312 | (self.decoder_outputs_decode, self.decoder_last_state_decode, 313 | self.decoder_outputs_length_decode) = (seq2seq.dynamic_decode( 314 | decoder=inference_decoder, 315 | output_time_major=False, 316 | # impute_finished=True, # error occurs 317 | maximum_iterations=self.max_decode_step)) 318 | 319 | if not self.use_beamsearch_decode: 320 | # decoder_outputs_decode.sample_id: [batch_size, max_time_step] 321 | # Or use argmax to find decoder symbols to emit: 322 | # self.decoder_pred_decode = tf.argmax(self.decoder_outputs_decode.rnn_output, 323 | # axis=-1, name='decoder_pred_decode') 324 | 325 | # Here, we use expand_dims to be compatible with the result of the beamsearch decoder 326 | # decoder_pred_decode: [batch_size, max_time_step, 1] (output_major=False) 327 | self.decoder_pred_decode = tf.expand_dims(self.decoder_outputs_decode.sample_id, -1) 328 | 329 | else: 330 | # Use beam search to approximately find the most likely translation 331 | # decoder_pred_decode: [batch_size, max_time_step, beam_width] (output_major=False) 332 | self.decoder_pred_decode = self.decoder_outputs_decode.predicted_ids 333 | 334 | def build_decoder(self): 335 | print("building decoder and attention..") 336 | try: 337 | with tf.variable_scope('decoder'): 338 | self.init_decoder_variable() 339 | except: 340 | with tf.variable_scope('decoder', reuse=True): 341 | self.init_decoder_variable() 342 | 343 | def build_single_cell(self): 344 | 345 | if (self.cell_type.lower() == 'gru'): 346 | cell_type = GRUCell 347 | cell = cell_type(self.hidden_units) 348 | else: 349 | cell_type = LSTMCell 350 | cell = cell_type(self.hidden_units) 351 | 352 | if self.use_dropout: 353 | cell = DropoutWrapper(cell, dtype=self.dtype, 354 | output_keep_prob=self.keep_prob_placeholder, ) 355 | if self.use_residual: 356 | cell = ResidualWrapper(cell) 357 | 358 | return cell 359 | 360 | # Building encoder cell 361 | def build_encoder_cell(self): 362 | if (self.cell_type.lower() != 'dnc'): 363 | return MultiRNNCell([self.build_single_cell() for i in range(self.depth)]) 364 | else: 365 | return self.dnc_cell 366 | 367 | # Building decoder cell and attention. 
Also returns decoder_initial_state 368 | def build_decoder_cell(self): 369 | 370 | encoder_outputs = self.encoder_outputs 371 | encoder_last_state = self.encoder_last_state 372 | encoder_inputs_length = self.encoder_inputs_length 373 | # To use BeamSearchDecoder, encoder_outputs, encoder_last_state, encoder_inputs_length 374 | # needs to be tiled so that: [batch_size, .., ..] -> [batch_size x beam_width, .., ..] 375 | if self.use_beamsearch_decode: 376 | print("use beamsearch decoding..") 377 | encoder_outputs = seq2seq.tile_batch( 378 | self.encoder_outputs, multiplier=self.beam_width) 379 | encoder_last_state = nest.map_structure( 380 | lambda s: seq2seq.tile_batch(s, self.beam_width), self.encoder_last_state) 381 | encoder_inputs_length = seq2seq.tile_batch( 382 | self.encoder_inputs_length, multiplier=self.beam_width) 383 | 384 | # Building attention mechanism: Default Bahdanau 385 | # 'Bahdanau' style attention: https://arxiv.org/abs/1409.0473 386 | self.attention_mechanism = attention_wrapper.BahdanauAttention( 387 | num_units=self.hidden_units, memory=encoder_outputs, 388 | memory_sequence_length=encoder_inputs_length, ) 389 | # 'Luong' style attention: https://arxiv.org/abs/1508.04025 390 | if self.attention_type.lower() == 'luong': 391 | self.attention_mechanism = attention_wrapper.LuongAttention( 392 | num_units=self.hidden_units, memory=encoder_outputs, 393 | memory_sequence_length=encoder_inputs_length, ) 394 | 395 | def attn_decoder_input_fn(inputs, attention): 396 | if not self.attn_input_feeding: 397 | return inputs 398 | 399 | # Essential when use_residual=True 400 | _input_layer = Dense(self.hidden_units, dtype=self.dtype, 401 | name='attn_input_feeding') 402 | return _input_layer(array_ops.concat([inputs, attention], -1)) 403 | 404 | if (self.cell_type != 'dnc'): 405 | # Building decoder_cell 406 | self.decoder_cell_list = [ 407 | self.build_single_cell() for i in range(self.depth)] 408 | decoder_initial_state = encoder_last_state 409 | 410 | # AttentionWrapper wraps RNNCell with the attention_mechanism 411 | # Note: We implement Attention mechanism only on the top decoder layer 412 | self.decoder_cell_list[-1] = attention_wrapper.AttentionWrapper( 413 | cell=self.decoder_cell_list[-1], 414 | attention_mechanism=self.attention_mechanism, 415 | attention_layer_size=self.hidden_units, 416 | cell_input_fn=attn_decoder_input_fn, 417 | initial_cell_state=encoder_last_state[-1], 418 | alignment_history=False, 419 | name='Attention_Wrapper') 420 | 421 | # To be compatible with AttentionWrapper, the encoder last state 422 | # of the top layer should be converted into the AttentionWrapperState form 423 | # We can easily do this by calling AttentionWrapper.zero_state 424 | 425 | # Also if beamsearch decoding is used, the batch_size argument in .zero_state 426 | # should be ${decoder_beam_width} times to the origianl batch_size 427 | batch_size = self.batch_size if not self.use_beamsearch_decode \ 428 | else self.batch_size * self.beam_width 429 | # Initialised with this encoder state 430 | 431 | initial_state = [state for state in encoder_last_state] 432 | 433 | initial_state[-1] = self.decoder_cell_list[-1].zero_state( 434 | batch_size=batch_size, dtype=self.dtype) 435 | decoder_initial_state = tuple(initial_state) 436 | 437 | return MultiRNNCell(self.decoder_cell_list), decoder_initial_state 438 | else: 439 | decoder_cell = attention_wrapper.AttentionWrapper( 440 | cell=self.dnc_cell, 441 | attention_mechanism=self.attention_mechanism, 442 | 
attention_layer_size=self.hidden_units, 443 | cell_input_fn=attn_decoder_input_fn, 444 | initial_cell_state=encoder_last_state, 445 | alignment_history=False, 446 | name='Attention_Wrapper') 447 | decoder_initial_state = decoder_cell.zero_state(batch_size=self.dnc_batch_size, dtype=self.dtype) 448 | return decoder_cell, decoder_initial_state 449 | 450 | def init_optimizer(self): 451 | print("setting optimizer..") 452 | # Gradients and SGD update operation for training the model 453 | trainable_params = tf.trainable_variables() 454 | if self.optimizer.lower() == 'adadelta': 455 | self.opt = tf.train.AdadeltaOptimizer(learning_rate=self.learning_rate) 456 | elif self.optimizer.lower() == 'adam': 457 | self.opt = tf.train.AdamOptimizer(learning_rate=self.learning_rate) 458 | elif self.optimizer.lower() == 'rmsprop': 459 | self.opt = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate) 460 | else: 461 | self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate) 462 | 463 | # Compute gradients of loss w.r.t. all trainable variables 464 | gradients = tf.gradients(self.loss, trainable_params) 465 | 466 | # Clip gradients by a given maximum_gradient_norm 467 | clip_gradients, _ = tf.clip_by_global_norm(gradients, self.max_gradient_norm) 468 | 469 | # Update the model 470 | self.updates = self.opt.apply_gradients( 471 | zip(clip_gradients, trainable_params), global_step=self.global_step) 472 | 473 | def save(self, sess, path, var_list=None, global_step=None): 474 | # var_list = None returns the list of all saveable variables 475 | saver = tf.train.Saver(var_list) 476 | 477 | # temporary code 478 | # del tf.get_collection_ref('LAYER_NAME_UIDS')[0] 479 | save_path = saver.save(sess, save_path=path, global_step=global_step) 480 | print('model saved at %s' % save_path) 481 | 482 | def restore(self, sess, path, var_list=None): 483 | # var_list = None returns the list of all saveable variables 484 | saver = tf.train.Saver(var_list,reshape=True) 485 | saver.restore(sess, save_path=path) 486 | print('model restored from %s' % path) 487 | 488 | def train(self, sess, encoder_inputs, encoder_inputs_length, 489 | decoder_inputs, decoder_inputs_length): 490 | """Run a train step of the model feeding the given inputs. 491 | 492 | Args: 493 | session: tensorflow session to use. 494 | encoder_inputs: a numpy int matrix of [batch_size, max_source_time_steps] 495 | to feed as encoder inputs 496 | encoder_inputs_length: a numpy int vector of [batch_size] 497 | to feed as sequence lengths for each element in the given batch 498 | decoder_inputs: a numpy int matrix of [batch_size, max_target_time_steps] 499 | to feed as decoder inputs 500 | decoder_inputs_length: a numpy int vector of [batch_size] 501 | to feed as sequence lengths for each element in the given batch 502 | 503 | Returns: 504 | A triple consisting of gradient norm (or None if we did not do backward), 505 | average perplexity, and the outputs. 
506 | """ 507 | # Check if the model is 'training' mode 508 | if self.mode.lower() != 'train': 509 | raise ValueError("train step can only be operated in train mode") 510 | 511 | input_feed = self.check_feeds(encoder_inputs, encoder_inputs_length, 512 | decoder_inputs, decoder_inputs_length, False) 513 | # Input feeds for dropout 514 | input_feed[self.keep_prob_placeholder.name] = self.keep_prob 515 | 516 | output_feed = [self.updates, # Update Op that does optimization 517 | self.loss, # Loss for current batch 518 | self.summary_op] # Training summary 519 | 520 | outputs = sess.run(output_feed, input_feed) 521 | return outputs[1], outputs[2] # loss, summary 522 | 523 | def eval(self, sess, encoder_inputs, encoder_inputs_length, 524 | decoder_inputs, decoder_inputs_length): 525 | """Run a evaluation step of the model feeding the given inputs. 526 | 527 | Args: 528 | session: tensorflow session to use. 529 | encoder_inputs: a numpy int matrix of [batch_size, max_source_time_steps] 530 | to feed as encoder inputs 531 | encoder_inputs_length: a numpy int vector of [batch_size] 532 | to feed as sequence lengths for each element in the given batch 533 | decoder_inputs: a numpy int matrix of [batch_size, max_target_time_steps] 534 | to feed as decoder inputs 535 | decoder_inputs_length: a numpy int vector of [batch_size] 536 | to feed as sequence lengths for each element in the given batch 537 | 538 | Returns: 539 | A triple consisting of gradient norm (or None if we did not do backward), 540 | average perplexity, and the outputs. 541 | """ 542 | 543 | input_feed = self.check_feeds(encoder_inputs, encoder_inputs_length, 544 | decoder_inputs, decoder_inputs_length, False) 545 | # Input feeds for dropout 546 | input_feed[self.keep_prob_placeholder.name] = 1.0 547 | 548 | output_feed = [self.loss, # Loss for current batch 549 | self.summary_op] # Training summary 550 | outputs = sess.run(output_feed, input_feed) 551 | return outputs[0], outputs[1] # loss 552 | 553 | def predict(self, sess, encoder_inputs, encoder_inputs_length): 554 | 555 | input_feed = self.check_feeds(encoder_inputs, encoder_inputs_length, 556 | decoder_inputs=None, decoder_inputs_length=None, 557 | decode=True) 558 | 559 | # Input feeds for dropout 560 | input_feed[self.keep_prob_placeholder.name] = 1.0 561 | 562 | output_feed = [self.decoder_pred_decode] 563 | outputs = sess.run(output_feed, input_feed) 564 | 565 | # GreedyDecoder: [batch_size, max_time_step] 566 | return outputs[0] # BeamSearchDecoder: [batch_size, max_time_step, beam_width] 567 | 568 | def check_feeds(self, encoder_inputs, encoder_inputs_length, 569 | decoder_inputs, decoder_inputs_length, decode): 570 | """ 571 | Args: 572 | encoder_inputs: a numpy int matrix of [batch_size, max_source_time_steps] 573 | to feed as encoder inputs 574 | encoder_inputs_length: a numpy int vector of [batch_size] 575 | to feed as sequence lengths for each element in the given batch 576 | decoder_inputs: a numpy int matrix of [batch_size, max_target_time_steps] 577 | to feed as decoder inputs 578 | decoder_inputs_length: a numpy int vector of [batch_size] 579 | to feed as sequence lengths for each element in the given batch 580 | decode: a scalar boolean that indicates decode mode 581 | Returns: 582 | A feed for the model that consists of encoder_inputs, encoder_inputs_length, 583 | decoder_inputs, decoder_inputs_length 584 | """ 585 | 586 | input_batch_size = encoder_inputs.shape[0] 587 | if input_batch_size != encoder_inputs_length.shape[0]: 588 | raise ValueError("Encoder 
inputs and their lengths must be equal in their " 589 | "batch_size, %d != %d" % (input_batch_size, encoder_inputs_length.shape[0])) 590 | 591 | if not decode: 592 | target_batch_size = decoder_inputs.shape[0] 593 | if target_batch_size != input_batch_size: 594 | raise ValueError("Encoder inputs and Decoder inputs must be equal in their " 595 | "batch_size, %d != %d" % (input_batch_size, target_batch_size)) 596 | if target_batch_size != decoder_inputs_length.shape[0]: 597 | raise ValueError("Decoder targets and their lengths must be equal in their " 598 | "batch_size, %d != %d" % (target_batch_size, decoder_inputs_length.shape[0])) 599 | 600 | input_feed = {} 601 | 602 | input_feed[self.encoder_inputs.name] = encoder_inputs 603 | input_feed[self.encoder_inputs_length.name] = encoder_inputs_length 604 | 605 | if not decode: 606 | input_feed[self.decoder_inputs.name] = decoder_inputs 607 | input_feed[self.decoder_inputs_length.name] = decoder_inputs_length 608 | 609 | return input_feed 610 | -------------------------------------------------------------------------------- /src/lib/util.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Google Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """DNC util ops and modules.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import numpy as np 22 | import tensorflow as tf 23 | 24 | 25 | def batch_invert_permutation(permutations): 26 | """Returns batched `tf.invert_permutation` for every row in `permutations`.""" 27 | with tf.name_scope('batch_invert_permutation', values=[permutations]): 28 | unpacked = tf.unstack(permutations) 29 | inverses = [tf.invert_permutation(permutation) for permutation in unpacked] 30 | return tf.stack(inverses) 31 | 32 | 33 | def batch_gather(values, indices): 34 | """Returns batched `tf.gather` for every row in the input.""" 35 | with tf.name_scope('batch_gather', values=[values, indices]): 36 | unpacked = zip(tf.unstack(values), tf.unstack(indices)) 37 | result = [tf.gather(value, index) for value, index in unpacked] 38 | return tf.stack(result) 39 | 40 | 41 | def one_hot(length, index): 42 | """Return an nd array of given `length` filled with 0s and a 1 at `index`.""" 43 | result = np.zeros(length) 44 | result[index] = 1 45 | return result 46 | -------------------------------------------------------------------------------- /src/preprocessing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Cognibit Solutions LLP. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """ 16 | This script prepares the raw data for the next stage of normalization. 17 | """ 18 | 19 | import os 20 | import sys 21 | import pandas as pd 22 | from multiprocessing import Pool 23 | 24 | 25 | def preprocessing(file): 26 | print('Launch Processing of {}'.format(file)) 27 | output = file+'_processed.csv' 28 | 29 | # By default, Pandas treats double quote as enclosing an entry so it includes all tabs and newlines in that entry 30 | # until it reaches the next quote. To escape it we need to have the quoting argument set to QUOTE_NONE or 3 as 31 | # given in the documentation - [https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html] 32 | raw_data = pd.read_csv(file, header=None, sep='\t', quoting = 3, names=['semiotic', 'before', 'after']) 33 | 34 | # Generating sentence and word token ids 35 | # Our text normalization approach requires sentence and token ids to encode and generate batches 36 | data = pd.DataFrame(columns=['sentence_id', 37 | 'token_id', 38 | 'semiotic', 39 | 'before', 40 | 'after']) 41 | # initialize columns and iterator 42 | sentence_id = 0 43 | token_id = -1 44 | 45 | # heavy processing ahead 46 | for row in raw_data.itertuples(): 47 | # look for end of sentences 48 | if row.semiotic == '' and row.before == '': 49 | sentence_id += 1 50 | token_id = -1 51 | continue 52 | else: 53 | token_id += 1 54 | 55 | new_row = {'sentence_id': sentence_id, 56 | 'token_id': token_id, 57 | 'semiotic': row.semiotic, 58 | 'before': row.before, 59 | 'after': row.after} 60 | data = data.append(new_row, ignore_index=True) 61 | print('Processing Sentence#{} of {}'.format(sentence_id, file)) 62 | 63 | # **Transforming 'after' tokens** 64 | # From the above mentioned paper: 65 | # ``` 66 | # Semiotic class instances are verbalized as sequences 67 | # of fully spelled words, most ordinary words are left alone (rep- 68 | # resented here as ), and punctuation symbols are mostly 69 | # transduced to sil (for “silence”). 70 | # ``` 71 | # Hence we transform as follows: 72 | # 1. sil is replaced with < self > 73 | # 2. 
< self > is replaced with the before column 74 | # 75 | sil_mask = (data['after'] == 'sil') 76 | data.loc[sil_mask, 'after'] = '' 77 | self_mask = (data['after'] == '') 78 | data.loc[self_mask, ('after')] = data.loc[self_mask, 'before'] 79 | 80 | # Exporting Data 81 | data.to_csv(output, index=False) 82 | print('Done {}'.format(file)) 83 | return True 84 | 85 | def split_dataframe(df, size=10*1024*1024): 86 | """Splits huge dataframes(CSVs) into smaller segments of given size in bytes""" 87 | 88 | # size of each row 89 | row_size = df.memory_usage().sum() / len(df) 90 | # maximum number of rows in each segment 91 | row_limit = int(size // row_size) 92 | # number of segments 93 | seg_num = (len(df)+row_limit-1)//row_limit 94 | # split df into segments 95 | segments = [df.iloc[i*row_limit : (i+1)*row_limit] for i in range(seg_num)] 96 | 97 | return segments 98 | 99 | 100 | if __name__ == '__main__': 101 | path = sys.argv[1] 102 | jobs = int(sys.argv[2]) 103 | 104 | # split large CSVs 105 | for dirpath, _, filenames in os.walk(path): 106 | for file in filenames: 107 | df = pd.read_csv(os.path.join(dirpath, file),header=None, sep='\t', quoting = 3, names=['semiotic', 'before', 'after']) 108 | df_splits = split_dataframe(df, 10*1024*1024) 109 | # save each split and delete original 110 | for i in range(len(df_splits)): 111 | split_file = file+'_part{}'.format(i+1) 112 | df_splits[i].to_csv(os.path.join(dirpath, split_file)) 113 | os.remove(os.path.join(dirpath, file)) 114 | print("Splitted original file into chunks...") 115 | 116 | files=[] 117 | for dirpath, _, filenames in os.walk(path): 118 | for file in filenames: 119 | files.append(os.path.join(dirpath, file)) 120 | 121 | pool=Pool(jobs) 122 | pool.map(preprocessing, files) 123 | 124 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Cognibit Solutions LLP. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """ 16 | 17 | Utility Modules for Text Normalization 18 | """ 19 | 20 | import pickle 21 | import numpy as np 22 | from multiprocessing import Pool 23 | 24 | 25 | class Encoder: 26 | def __init__(self, vocab_file, wlook=3, time_major=False): 27 | self.vocab_file = vocab_file 28 | self.wlook = wlook 29 | self.time_major = time_major 30 | 31 | def encode(self, df, nthreads=8):  # encode the dataframe in parallel across nthreads worker processes 32 | if (nthreads < 1): 33 | raise ValueError('nthreads must be at least 1, cannot proceed!') 34 | else: 35 | row_len = df.shape[0] 36 | batch_len = int(row_len / nthreads) 37 | last_batch = row_len % nthreads 38 | batches = [] 39 | for i in range(nthreads): 40 | if (i != nthreads - 1): 41 | batches.append(df.iloc[i * batch_len:i * batch_len + batch_len]) 42 | else: 43 | batches.append(df.iloc[i * batch_len:]) 44 | threads = Pool(nthreads) 45 | encoded_dfs = threads.map(self.run_single_batch, batches) 46 | encoding, encoding_len = zip(*encoded_dfs) 47 | col_len = 0 48 | for e in encoding: 49 | if (e.shape[1] > col_len): 50 | col_len = e.shape[1] 51 | encoding = list(encoding) 52 | for i in range(len(encoding)): 53 | encoding[i] = np.concatenate((encoding[i], np.zeros([encoding[i].shape[0] 54 | , col_len - encoding[i].shape[1]])), axis=1) 55 | encoding = np.concatenate(encoding) 56 | encoding_len = np.concatenate(encoding_len) 57 | return encoding, encoding_len 58 | 59 | def run_single_batch(self, df): 60 | batch_gen = EncodingGenerator(self.vocab_file, self.wlook, self.time_major) 61 | return batch_gen.encode(df) 62 | 63 | 64 | class EncodingGenerator: 65 | def __init__(self, vocab_file, wlook=3, time_major=False): 66 | self.train_grp = None 67 | self.row_len = None 68 | with open(vocab_file, 'rb') as handle: 69 | self.vocab_dict = pickle.loads(handle.read()) 70 | self.sent_id = 0 71 | self.token_id = 0 72 | self.row_count = 0 73 | self.wlook = wlook 74 | self.time_major = time_major 75 | self.group_keys = None 76 | 77 | def __input_lookup(self, char): 78 | if (char in self.vocab_dict['input']): 79 | return self.vocab_dict['input'][char] 80 | else: 81 | return self.vocab_dict['input'][''] 82 | 83 | def __input_word_lookup(self, word): 84 | lookups = [] 85 | word = str(word) 86 | 87 | for c in word: 88 | lookups.append(self.__input_lookup(c)) 89 | return lookups 90 | 91 | def __next_element(self):  # build the character-id context window around the current token 92 | sent = self.train_grp.get_group(self.group_keys[self.sent_id]) 93 | if (self.token_id > sent.shape[0] - 1): 94 | self.sent_id = (self.sent_id + 1) % self.train_grp.ngroups 95 | self.token_id = 0 96 | sent = self.train_grp.get_group(self.group_keys[self.sent_id]) 97 | token_count = sent.shape[0] 98 | row_dict = dict() 99 | new_row = [] 100 | for k in range(-self.wlook, self.wlook + 1): 101 | if (k == 0): 102 | new_row.append(self.__input_lookup('')) 103 | lookup = self.__input_word_lookup(sent.iloc[k + self.token_id, :]['before']) 104 | new_row.extend(lookup) 105 | new_row.append(self.__input_lookup('')) 106 | new_row.append(self.__input_lookup(' ')) 107 | elif ((self.token_id + k < 0 or self.token_id + k > token_count - 1) == False): 108 | lookup = self.__input_word_lookup(sent.iloc[k + self.token_id, :]['before']) 109 | new_row.extend(lookup) 110 | new_row.append(self.__input_lookup(' ')) 111 | new_row.append(self.__input_lookup('')) 112 | self.token_id = self.token_id + 1 113 | return new_row 114 | 115 | def encode(self, df): 116 | self.train_grp = df.groupby(by='sentence_id') 117 | self.row_len = df.shape[0] 118 | self.group_keys = 
list(self.train_grp.groups.keys()) 119 | input_batches = [] 120 | max_inp_len = 0 121 | for b in range(self.row_len): 122 | i = self.__next_element() 123 | input_batches.append(i) 124 | if (len(i) > max_inp_len): 125 | max_inp_len = len(i) 126 | # Add the padding characters 127 | input_batches_len = np.zeros([self.row_len]) 128 | count = 0 129 | for b in input_batches: 130 | input_batches_len[count] = len(b) 131 | count = count + 1 132 | for i in range(0, max_inp_len - len(b)): 133 | b.append(self.__input_lookup('')) 134 | 135 | input_batches = np.array(input_batches) 136 | 137 | if (self.time_major == True): 138 | input_batches = input_batches.T 139 | 140 | return input_batches, input_batches_len 141 | 142 | 143 | class Normalized2String: 144 | def __init__(self, vocab_file): 145 | with open(vocab_file, 'rb') as handle: 146 | self.vocab_dict = pickle.loads(handle.read()) 147 | output_id_dict = self.vocab_dict['output'] 148 | self.output_id_dict_rev = {v: k for k, v in output_id_dict.items()} 149 | 150 | def to_str(self, prediction): 151 | """ 152 | prediction : A 1D numpy array 153 | """ 154 | final_str = '' 155 | for id in prediction: 156 | word = self.__output_lookup_inverse(id) 157 | if word == '': 158 | break 159 | else: 160 | final_str = final_str +' '+ str(word) 161 | return final_str[1:] 162 | 163 | def __output_lookup_inverse(self, id): 164 | return self.output_id_dict_rev[id] --------------------------------------------------------------------------------
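A few usage sketches follow, illustrating how the modules above fit together. First, a minimal TF1-style driver for the train/eval/save entry points defined in src/lib/seq2seq.py. The build_model factory, the checkpoint path and the toy integer batches are assumptions made for illustration; real batches would come from the Encoder in src/utils.py.

import numpy as np
import tensorflow as tf

model = build_model(mode='train')  # hypothetical factory; model construction lives elsewhere in seq2seq.py

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Toy integer-id batches: inputs are [batch_size, max_time], lengths are [batch_size].
    enc_in = np.random.randint(1, 100, size=(32, 20), dtype=np.int32)
    enc_len = np.full((32,), 20, dtype=np.int32)
    dec_in = np.random.randint(1, 100, size=(32, 25), dtype=np.int32)
    dec_len = np.full((32,), 25, dtype=np.int32)

    # train() applies the clipped-gradient update and returns (loss, summary).
    loss, summary = model.train(sess, enc_in, enc_len, dec_in, dec_len)

    # eval() feeds keep_prob=1.0 and returns (loss, summary) without updating weights.
    val_loss, val_summary = model.eval(sess, enc_in, enc_len, dec_in, dec_len)

    # save() wraps tf.train.Saver; the checkpoint path here is an assumed example.
    model.save(sess, 'checkpoints/model.ckpt', global_step=model.global_step)

Inference uses a model built with mode='decode' (greedy or beam search), with restore() loading the saved checkpoint; predict() then returns ids of shape [batch_size, max_time, 1] for greedy decoding or [batch_size, max_time, beam_width] when beam search decoding is enabled.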
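Next, a small numeric check of the DNC helper ops in src/lib/util.py, showing what batch_invert_permutation and batch_gather do row by row. The import path assumes the script is run from src/; the tensor values are toy data.

import tensorflow as tf
from lib.util import batch_gather, batch_invert_permutation, one_hot

perms = tf.constant([[2, 0, 1], [1, 2, 0]])            # one permutation per row
values = tf.constant([[10., 20., 30.], [40., 50., 60.]])

with tf.Session() as sess:
    # Each row's permutation is inverted independently.
    print(sess.run(batch_invert_permutation(perms)))   # [[1 2 0], [2 0 1]]
    # Each row of `values` is gathered with that row's indices.
    print(sess.run(batch_gather(values, perms)))       # [[30. 10. 20.], [50. 60. 40.]]

# one_hot() is plain numpy: a length-4 vector with a 1 at index 2.
print(one_hot(4, 2))                                   # [0. 0. 1. 0.]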
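A self-contained illustration of the 'after'-column rewrite performed in src/preprocessing.py: tokens verbalized as 'sil' are mapped to the self-token (written '<self>' here, matching the script's comments), and self-tokens are then copied over from the 'before' column. The three-row dataframe is invented for the example.

import pandas as pd

# Toy data imitating the (before, after) layout produced by the preprocessing step.
data = pd.DataFrame({
    'before': ['Hello', ',', '2018'],
    'after':  ['<self>', 'sil', 'twenty eighteen'],
})

# Same two-step masking as in preprocessing(): silence markers become self-tokens,
# then self-tokens are replaced by the untouched 'before' value.
sil_mask = (data['after'] == 'sil')
data.loc[sil_mask, 'after'] = '<self>'
self_mask = (data['after'] == '<self>')
data.loc[self_mask, 'after'] = data.loc[self_mask, 'before']

# Ordinary words and punctuation now carry their 'before' form, while genuinely
# normalized tokens ('2018' -> 'twenty eighteen') are left as they were.
print(data)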
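A usage sketch for the Encoder in src/utils.py, which chunks the preprocessed dataframe across worker processes, encodes each chunk character by character against the pickled vocabulary, zero-pads all chunks to a common width and returns the padded matrix plus per-row lengths. The file paths, thread count and working directory (src/) are assumptions.

import pandas as pd
from utils import Encoder  # src/utils.py

if __name__ == '__main__':  # guard needed because encode() spawns a multiprocessing Pool
    # 'train_processed.csv' and 'vocab.pkl' are assumed example paths.
    df = pd.read_csv('train_processed.csv')  # columns: sentence_id, token_id, semiotic, before, after
    encoder = Encoder('vocab.pkl', wlook=3, time_major=False)

    # encoding:     [num_rows, max_len] matrix of vocabulary ids, zero-padded per row
    # encoding_len: [num_rows] vector holding each row's unpadded length
    encoding, encoding_len = encoder.encode(df, nthreads=8)
    print(encoding.shape, encoding_len.shape)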
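Finally, a sketch of mapping a decoded id sequence back to text with Normalized2String from src/utils.py. The vocabulary path and the id values are invented; in practice the array would be one row (one beam) of the predict() output shown earlier.

import numpy as np
from utils import Normalized2String  # src/utils.py

# Assumed path; the pickle must hold the same 'output' id mapping used in training.
n2s = Normalized2String('vocab.pkl')

prediction = np.array([17, 42, 5])  # made-up output ids for illustration
# to_str() looks each id up in the reversed output vocabulary, joins the words with
# spaces and stops at the end-of-sequence token.
print(n2s.to_str(prediction))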